From 51fccb75d26751a9f385f4df07185cd22261c678 Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:11:45 +0800 Subject: [PATCH 01/19] feat(connector): add SQL Server sink (#17154) --- Cargo.lock | 155 ++++++ ci/docker-compose.yml | 11 + ci/scripts/e2e-sqlserver-sink-test.sh | 80 ++++ ci/workflows/main-cron.yml | 19 + ci/workflows/pull-request.yml | 15 + e2e_test/sink/sqlserver_sink.slt | 59 +++ src/connector/Cargo.toml | 1 + src/connector/src/sink/mod.rs | 14 + src/connector/src/sink/sqlserver.rs | 649 ++++++++++++++++++++++++++ src/connector/src/with_options.rs | 1 + src/connector/with_options_sink.yaml | 27 ++ 11 files changed, 1031 insertions(+) create mode 100755 ci/scripts/e2e-sqlserver-sink-test.sh create mode 100644 e2e_test/sink/sqlserver_sink.slt create mode 100644 src/connector/src/sink/sqlserver.rs diff --git a/Cargo.lock b/Cargo.lock index a3e602a653713..462b542829cee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1097,6 +1097,19 @@ dependencies = [ "syn 2.0.57", ] +[[package]] +name = "asynchronous-codec" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4057f2c32adbb2fc158e22fb38433c8e9bbf76b75a4732c7c0cbaf695fb65568" +dependencies = [ + "bytes", + "futures-sink", + "futures-util", + "memchr", + "pin-project-lite", +] + [[package]] name = "atoi" version = "2.0.0" @@ -2654,6 +2667,12 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "connection-string" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "510ca239cf13b7f8d16a2b48f263de7b4f8c566f0af58d901031473c76afb1e3" + [[package]] name = "console" version = "0.15.7" @@ -4540,6 +4559,70 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + [[package]] name = "encoding_rs" version = "0.8.33" @@ -8955,6 +9038,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04bfa62906ce8d9badf8d1764501640ae7f0bcea3437a209315830e0f73564d1" +[[package]] +name = "pretty-hex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6fa0831dd7cc608c38a5e323422a0077678fa5744aa2be4ad91c4ece8eec8d5" + [[package]] name = "pretty-xmlish" version = "0.1.13" @@ -10653,6 +10742,7 @@ dependencies = [ "tempfile", "thiserror", "thiserror-ext", + "tiberius", "time", "tokio-postgres", "tokio-retry", @@ -12000,6 +12090,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustls" +version = "0.20.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +dependencies = [ + "log", + "ring 0.16.20", + "sct", + "webpki", +] + [[package]] name = "rustls" version = "0.21.11" @@ -14303,6 +14405,37 @@ dependencies = [ "ordered-float 2.10.0", ] +[[package]] +name = "tiberius" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc6e2bf3e4b5be181a2a2ceff4b9b12e2684010d436a6958bd564fbc8094d44d" +dependencies = [ + "async-trait", + "asynchronous-codec", + "bigdecimal 0.3.1", + "byteorder", + "bytes", + "chrono", + "connection-string", + "encoding", + "enumflags2", + "futures-util", + "num-traits", + "once_cell", + "pin-project-lite", + "pretty-hex", + "rust_decimal", + "rustls-native-certs 0.6.3", + "rustls-pemfile 1.0.4", + "thiserror", + "time", + "tokio-rustls 0.23.4", + "tokio-util", + "tracing", + "uuid", +] + [[package]] name = "tikv-jemalloc-ctl" version = "0.5.4" @@ -14506,6 +14639,17 @@ dependencies = [ "rand", ] +[[package]] +name = "tokio-rustls" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +dependencies = [ + "rustls 0.20.9", + "tokio", + "webpki", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -14568,6 +14712,7 @@ checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", "pin-project-lite", "tokio", @@ -15918,6 +16063,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" +dependencies = [ + "ring 0.17.5", + "untrusted 0.9.0", +] + [[package]] name = "webpki-roots" version = "0.25.2" diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index 4786f85327d14..dd10bc8c98c1d 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -100,6 +100,7 @@ services: - doris-server - starrocks-fe-server - starrocks-be-server + - sqlserver-server volumes: - ..:/risingwave @@ -204,6 +205,16 @@ services: timeout: 5s retries: 30 + sqlserver-server: + container_name: sqlserver-server + image: mcr.microsoft.com/mssql/server:2022-latest + hostname: sqlserver-server + ports: + - 1433:1433 + environment: + 
ACCEPT_EULA: 'Y' + SA_PASSWORD: 'SomeTestOnly@SA' + starrocks-fe-server: container_name: starrocks-fe-server image: starrocks/fe-ubuntu:3.1.7 diff --git a/ci/scripts/e2e-sqlserver-sink-test.sh b/ci/scripts/e2e-sqlserver-sink-test.sh new file mode 100755 index 0000000000000..f1f62941375ce --- /dev/null +++ b/ci/scripts/e2e-sqlserver-sink-test.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +# Exits as soon as any line fails. +set -euo pipefail + +source ci/scripts/common.sh + +while getopts 'p:' opt; do + case ${opt} in + p ) + profile=$OPTARG + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + exit 1 + ;; + : ) + echo "Invalid option: $OPTARG requires an argument" 1>&2 + ;; + esac +done +shift $((OPTIND -1)) + +download_and_prepare_rw "$profile" source + +echo "--- starting risingwave cluster" +risedev ci-start ci-sink-test +sleep 1 + +echo "--- create SQL Server table" +curl https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add - +curl https://packages.microsoft.com/config/ubuntu/20.04/prod.list | sudo tee /etc/apt/sources.list.d/msprod.list +apt-get update -y +ACCEPT_EULA=Y DEBIAN_FRONTEND=noninteractive apt-get install -y mssql-tools unixodbc-dev +export PATH="/opt/mssql-tools/bin/:$PATH" +sleep 2 + +sqlcmd -S sqlserver-server -U SA -P SomeTestOnly@SA -Q " +CREATE DATABASE SinkTest; +GO +USE SinkTest; +CREATE TABLE t_many_data_type ( + k1 int, k2 int, + c_boolean bit, + c_int16 smallint, + c_int32 int, + c_int64 bigint, + c_float32 float, + c_float64 float, + c_decimal decimal, + c_date date, + c_time time, + c_timestamp datetime2, + c_timestampz datetime2, + c_nvarchar nvarchar(1024), + c_varbinary varbinary(1024), +PRIMARY KEY (k1,k2)); +GO" +sleep 2 + +echo "--- testing sinks" +sqllogictest -p 4566 -d dev './e2e_test/sink/sqlserver_sink.slt' +sleep 1 +sqlcmd -S sqlserver-server -U SA -P SomeTestOnly@SA -h -1 -Q " +SELECT * FROM SinkTest.dbo.t_many_data_type; +GO" > ./query_result.txt + +mapfile -t actual < <(tr -s '[:space:]' '\n' < query_result.txt) +actual=("${actual[@]:1}") +expected=(0 0 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0 55 55 1 1.0 1.0 1 2022-04-08 18:20:49.0000000 2022-03-13 01:00:00.0000000 2022-03-13 01:00:00.0000000 Hello World! 0xDE00BEEF 1 2 0 66 66 1 1.0 1.0 1 2022-04-08 18:20:49.0000000 2022-03-13 01:00:00.0000000 2022-03-13 01:00:00.0000000 Hello World! 0xDE00BEEF 1 4 0 2 2 1 1.0 1.0 1 2022-04-08 18:20:49.0000000 2022-03-13 01:00:00.0000000 2022-03-13 01:00:00.0000000 Hello World! 0xDE00BEEF "(4" rows "affected)") + +if [[ ${#actual[@]} -eq ${#expected[@]} && ${actual[@]} == ${expected[@]} ]]; then + echo "SQL Server sink check passed" +else + cat ./query_result.txt + echo "The output is not as expected." 
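+  # Fail the step explicitly; without this the script would exit 0 even
+  # when the queried rows do not match the expected output.
+  exit 1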
+fi + +echo "--- Kill cluster" +risedev ci-kill diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index 7a1054a0d481b..c9eaf5cf0c38d 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -868,6 +868,25 @@ steps: timeout_in_minutes: 10 retry: *auto-retry + - label: "end-to-end sqlserver sink test" + key: "e2e-sqlserver-sink-tests" + command: "ci/scripts/e2e-sqlserver-sink-test.sh -p ci-release" + if: | + !(build.pull_request.labels includes "ci/main-cron/skip-ci") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-e2e-sqlserver-sink-tests" + || build.env("CI_STEPS") =~ /(^|,)e2e-sqlserver-sink-tests?(,|$$)/ + depends_on: + - "build" + - "build-other" + plugins: + - docker-compose#v5.1.0: + run: sink-test-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 10 + retry: *auto-retry + - label: "end-to-end pulsar sink test" key: "e2e-pulsar-sink-tests" command: "ci/scripts/e2e-pulsar-sink-test.sh -p ci-release" diff --git a/ci/workflows/pull-request.yml b/ci/workflows/pull-request.yml index 385790e830232..20f0ef4128247 100644 --- a/ci/workflows/pull-request.yml +++ b/ci/workflows/pull-request.yml @@ -315,6 +315,21 @@ steps: timeout_in_minutes: 10 retry: *auto-retry + - label: "end-to-end sqlserver sink test" + if: build.pull_request.labels includes "ci/run-e2e-sqlserver-sink-tests" || build.env("CI_STEPS") =~ /(^|,)e2e-sqlserver-sink-tests?(,|$$)/ + command: "ci/scripts/e2e-sqlserver-sink-test.sh -p ci-dev" + depends_on: + - "build" + - "build-other" + plugins: + - docker-compose#v5.1.0: + run: sink-test-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 10 + retry: *auto-retry + - label: "end-to-end deltalake sink test" if: build.pull_request.labels includes "ci/run- e2e-deltalake-sink-rust-tests" || build.env("CI_STEPS") =~ /(^|,) e2e-deltalake-sink-rust-tests?(,|$$)/ command: "ci/scripts/e2e-deltalake-sink-rust-test.sh -p ci-dev" diff --git a/e2e_test/sink/sqlserver_sink.slt b/e2e_test/sink/sqlserver_sink.slt new file mode 100644 index 0000000000000..156b8b865ffc8 --- /dev/null +++ b/e2e_test/sink/sqlserver_sink.slt @@ -0,0 +1,59 @@ +statement ok +create table t_many_data_type_rw ( + k1 int, k2 int, + c_boolean bool, + c_int16 smallint, + c_int32 int, + c_int64 bigint, + c_float32 float, + c_float64 double, + c_decimal decimal, + c_date date, + c_time time, + c_timestamp timestamp, + c_timestampz timestamp, + c_nvarchar string, + c_varbinary bytea); + +statement ok +create sink s_many_data_type from t_many_data_type_rw with ( + connector = 'sqlserver', + type = 'upsert', + sqlserver.host = 'sqlserver-server', + sqlserver.port = 1433, + sqlserver.user = 'SA', + sqlserver.password = 'SomeTestOnly@SA', + sqlserver.database = 'SinkTest', + sqlserver.table = 't_many_data_type', + primary_key = 'k1,k2', +); + +statement ok +insert into t_many_data_type_rw values +(0,0,false,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL), +(1,1,false,1,1,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'), +(1,2,false,2,2,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'), +(1,3,false,2,2,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 
01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'), +(1,4,false,2,2,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'), +(1,1,false,2,2,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'); +flush; + +statement ok +delete from t_many_data_type_rw where k1=1 and k2=2; +delete from t_many_data_type_rw where k1=1 and k2=3; +flush; + +statement ok +insert into t_many_data_type_rw values +(1,1,false,55,55,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'), +(1,2,false,66,66,1,1.0,1.0,1.0,date '2022-04-08',time '18:20:49','2022-03-13 01:00:00'::timestamp,'2022-03-13 01:00:00Z'::timestamptz,'Hello World!','\xDe00BeEf'); +flush; + +statement ok +FLUSH; + +statement ok +DROP SINK s_many_data_type; + +statement ok +DROP TABLE t_many_data_type_rw; diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml index 1464fcc215b32..7cb6f23e5ec7e 100644 --- a/src/connector/Cargo.toml +++ b/src/connector/Cargo.toml @@ -136,6 +136,7 @@ strum_macros = "0.26" tempfile = "3" thiserror = "1" thiserror-ext = { workspace = true } +tiberius = { version = "0.12", default-features = false, features = ["chrono", "time", "tds73", "rust_decimal", "bigdecimal", "rustls"] } time = "0.3.30" tokio = { version = "0.2", package = "madsim-tokio", features = [ "rt", diff --git a/src/connector/src/sink/mod.rs b/src/connector/src/sink/mod.rs index 6bfe20fa18c12..e5a5f6143e419 100644 --- a/src/connector/src/sink/mod.rs +++ b/src/connector/src/sink/mod.rs @@ -38,6 +38,7 @@ pub mod redis; pub mod remote; pub mod snowflake; pub mod snowflake_connector; +pub mod sqlserver; pub mod starrocks; pub mod test_sink; pub mod trivial; @@ -100,6 +101,7 @@ macro_rules! for_all_sinks { { DeltaLake, $crate::sink::deltalake::DeltaLakeSink }, { BigQuery, $crate::sink::big_query::BigQuerySink }, { DynamoDb, $crate::sink::dynamodb::DynamoDbSink }, + { SqlServer, $crate::sink::sqlserver::SqlServerSink }, { Test, $crate::sink::test_sink::TestSink }, { Table, $crate::sink::trivial::TableSink } } @@ -577,6 +579,12 @@ pub enum SinkError { #[backtrace] anyhow::Error, ), + #[error("SQL Server error: {0}")] + SqlServer( + #[source] + #[backtrace] + anyhow::Error, + ), #[error(transparent)] Connector( #[from] @@ -614,3 +622,9 @@ impl From for SinkError { SinkError::Redis(value.to_report_string()) } } + +impl From for SinkError { + fn from(err: tiberius::error::Error) -> Self { + SinkError::SqlServer(anyhow!(err)) + } +} diff --git a/src/connector/src/sink/sqlserver.rs b/src/connector/src/sink/sqlserver.rs new file mode 100644 index 0000000000000..acdad3c47627e --- /dev/null +++ b/src/connector/src/sink/sqlserver.rs @@ -0,0 +1,649 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
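+
+//! SQL Server sink implementation, built on the `tiberius` TDS client.
+//!
+//! The sink writer buffers row operations and flushes them as a single
+//! multi-statement batch, either when the buffer reaches
+//! `sqlserver.max_batch_rows` or at a checkpoint barrier. Append-only rows
+//! render as `INSERT` statements, upserts as `MERGE` statements keyed on the
+//! sink's `primary_key`, and deletes as `DELETE` statements on the primary
+//! key columns. For illustration (the table and column names here are
+//! hypothetical), a single upsert against a table `t` with primary key
+//! `(k1, k2)` and one extra column `v` renders roughly as:
+//!
+//! ```text
+//! MERGE [t] AS [TARGET]
+//! USING (VALUES (@P1,@P2,@P3)) AS [SOURCE] ([k1],[k2],[v])
+//! ON [SOURCE].[k1]=[TARGET].[k1] AND [SOURCE].[k2]=[TARGET].[k2]
+//! WHEN MATCHED THEN UPDATE SET [v]=[SOURCE].[v]
+//! WHEN NOT MATCHED THEN INSERT ([k1],[k2],[v]) VALUES ([SOURCE].[k1],[SOURCE].[k2],[SOURCE].[v]);
+//! ```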
+ +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; + +use anyhow::anyhow; +use async_trait::async_trait; +use risingwave_common::array::{Op, RowRef, StreamChunk}; +use risingwave_common::buffer::Bitmap; +use risingwave_common::catalog::Schema; +use risingwave_common::row::{OwnedRow, Row}; +use risingwave_common::types::{DataType, Decimal}; +use serde_derive::Deserialize; +use serde_with::{serde_as, DisplayFromStr}; +use simd_json::prelude::ArrayTrait; +use tiberius::numeric::Numeric; +use tiberius::{AuthMethod, Client, ColumnData, Config, Query}; +use tokio::net::TcpStream; +use tokio_util::compat::TokioAsyncWriteCompatExt; +use with_options::WithOptions; + +use super::{SinkError, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT}; +use crate::sink::writer::{LogSinkerOf, SinkWriter, SinkWriterExt}; +use crate::sink::{DummySinkCommitCoordinator, Result, Sink, SinkParam, SinkWriterParam}; + +pub const SQLSERVER_SINK: &str = "sqlserver"; + +fn default_max_batch_rows() -> usize { + 1024 +} + +#[serde_as] +#[derive(Clone, Debug, Deserialize, WithOptions)] +pub struct SqlServerConfig { + #[serde(rename = "sqlserver.host")] + pub host: String, + #[serde(rename = "sqlserver.port")] + #[serde_as(as = "DisplayFromStr")] + pub port: u16, + #[serde(rename = "sqlserver.user")] + pub user: String, + #[serde(rename = "sqlserver.password")] + pub password: String, + #[serde(rename = "sqlserver.database")] + pub database: String, + #[serde(rename = "sqlserver.table")] + pub table: String, + #[serde( + rename = "sqlserver.max_batch_rows", + default = "default_max_batch_rows" + )] + #[serde_as(as = "DisplayFromStr")] + pub max_batch_rows: usize, + pub r#type: String, // accept "append-only" or "upsert" +} + +impl SqlServerConfig { + pub fn from_btreemap(properties: BTreeMap) -> Result { + let config = + serde_json::from_value::(serde_json::to_value(properties).unwrap()) + .map_err(|e| SinkError::Config(anyhow!(e)))?; + if config.r#type != SINK_TYPE_APPEND_ONLY && config.r#type != SINK_TYPE_UPSERT { + return Err(SinkError::Config(anyhow!( + "`{}` must be {}, or {}", + SINK_TYPE_OPTION, + SINK_TYPE_APPEND_ONLY, + SINK_TYPE_UPSERT + ))); + } + Ok(config) + } +} + +#[derive(Debug)] +pub struct SqlServerSink { + pub config: SqlServerConfig, + schema: Schema, + pk_indices: Vec, + is_append_only: bool, +} + +impl SqlServerSink { + pub fn new( + mut config: SqlServerConfig, + schema: Schema, + pk_indices: Vec, + is_append_only: bool, + ) -> Result { + // Rewrite config because tiberius allows a maximum of 2100 params in one query request. 
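+        // The cap of 2000 sits below the server's hard limit of 2100,
+        // presumably to leave headroom. For example, with the 15-column table
+        // used in the e2e test, floor(2000 / 15) = 133, so at most 133 rows
+        // are flushed per batch even if `sqlserver.max_batch_rows` is
+        // configured larger.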
+ const TIBERIUS_PARAM_MAX: usize = 2000; + let params_per_op = schema.fields().len(); + let tiberius_max_batch_rows = if params_per_op == 0 { + config.max_batch_rows + } else { + ((TIBERIUS_PARAM_MAX as f64 / params_per_op as f64).floor()) as usize + }; + if tiberius_max_batch_rows == 0 { + return Err(SinkError::SqlServer(anyhow!(format!( + "too many column {}", + params_per_op + )))); + } + config.max_batch_rows = std::cmp::min(config.max_batch_rows, tiberius_max_batch_rows); + Ok(Self { + config, + schema, + pk_indices, + is_append_only, + }) + } +} + +impl TryFrom for SqlServerSink { + type Error = SinkError; + + fn try_from(param: SinkParam) -> std::result::Result { + let schema = param.schema(); + let config = SqlServerConfig::from_btreemap(param.properties)?; + SqlServerSink::new( + config, + schema, + param.downstream_pk, + param.sink_type.is_append_only(), + ) + } +} + +impl Sink for SqlServerSink { + type Coordinator = DummySinkCommitCoordinator; + type LogSinker = LogSinkerOf; + + const SINK_NAME: &'static str = SQLSERVER_SINK; + + async fn validate(&self) -> Result<()> { + if !self.is_append_only && self.pk_indices.is_empty() { + return Err(SinkError::Config(anyhow!( + "Primary key not defined for upsert SQL Server sink (please define in `primary_key` field)"))); + } + + for f in self.schema.fields() { + check_data_type_compatibility(&f.data_type)?; + } + + // Query table metadata from SQL Server. + let mut sql_server_table_metadata = HashMap::new(); + let mut sql_client = SqlClient::new(&self.config).await?; + let query_table_metadata_error = || { + SinkError::SqlServer(anyhow!(format!( + "SQL Server table {} metadata error", + self.config.table + ))) + }; + static QUERY_TABLE_METADATA: &str = r#" +SELECT + col.name AS ColumnName, + pk.index_id AS PkIndex +FROM + sys.columns col +LEFT JOIN + sys.index_columns ic ON ic.object_id = col.object_id AND ic.column_id = col.column_id +LEFT JOIN + sys.indexes pk ON pk.object_id = col.object_id AND pk.index_id = ic.index_id AND pk.is_primary_key = 1 +WHERE + col.object_id = OBJECT_ID(@P1) +ORDER BY + col.column_id;"#; + let rows = sql_client + .client + .query(QUERY_TABLE_METADATA, &[&self.config.table]) + .await? + .into_results() + .await?; + for row in rows.into_iter().flatten() { + let mut iter = row.into_iter(); + let ColumnData::String(Some(col_name)) = + iter.next().ok_or_else(query_table_metadata_error)? + else { + return Err(query_table_metadata_error()); + }; + let ColumnData::I32(col_pk_index) = + iter.next().ok_or_else(query_table_metadata_error)? 
+ else { + return Err(query_table_metadata_error()); + }; + sql_server_table_metadata.insert(col_name.into_owned(), col_pk_index.is_some()); + } + + // Validate Column name and Primary Key + for (idx, col) in self.schema.fields().iter().enumerate() { + let rw_is_pk = self.pk_indices.contains(&idx); + match sql_server_table_metadata.get(&col.name) { + None => { + return Err(SinkError::SqlServer(anyhow!(format!( + "column {} not found in the downstream SQL Server table {}", + col.name, self.config.table + )))); + } + Some(sql_server_is_pk) => { + if self.is_append_only { + continue; + } + if rw_is_pk && !*sql_server_is_pk { + return Err(SinkError::SqlServer(anyhow!(format!( + "column {} specified in primary_key mismatches with the downstream SQL Server table {} PK", + col.name, self.config.table, + )))); + } + if !rw_is_pk && *sql_server_is_pk { + return Err(SinkError::SqlServer(anyhow!(format!( + "column {} unspecified in primary_key mismatches with the downstream SQL Server table {} PK", + col.name, self.config.table, + )))); + } + } + } + } + + if !self.is_append_only { + let sql_server_pk_count = sql_server_table_metadata + .values() + .filter(|is_pk| **is_pk) + .count(); + if sql_server_pk_count != self.pk_indices.len() { + return Err(SinkError::SqlServer(anyhow!(format!( + "primary key does not match between RisingWave sink ({}) and SQL Server table {} ({})", + self.pk_indices.len(), + self.config.table, + sql_server_pk_count, + )))); + } + } + + Ok(()) + } + + async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result { + Ok(SqlServerSinkWriter::new( + self.config.clone(), + self.schema.clone(), + self.pk_indices.clone(), + self.is_append_only, + ) + .await? + .into_log_sinker(writer_param.sink_metrics)) + } +} + +enum SqlOp { + Insert(OwnedRow), + Merge(OwnedRow), + Delete(OwnedRow), +} + +pub struct SqlServerSinkWriter { + config: SqlServerConfig, + schema: Schema, + pk_indices: Vec, + is_append_only: bool, + sql_client: SqlClient, + ops: Vec, +} + +impl SqlServerSinkWriter { + async fn new( + config: SqlServerConfig, + schema: Schema, + pk_indices: Vec, + is_append_only: bool, + ) -> Result { + let sql_client = SqlClient::new(&config).await?; + let writer = Self { + config, + schema, + pk_indices, + is_append_only, + sql_client, + ops: vec![], + }; + Ok(writer) + } + + async fn delete_one(&mut self, row: RowRef<'_>) -> Result<()> { + if self.ops.len() + 1 >= self.config.max_batch_rows { + self.flush().await?; + } + self.ops.push(SqlOp::Delete(row.into_owned_row())); + Ok(()) + } + + async fn upsert_one(&mut self, row: RowRef<'_>) -> Result<()> { + if self.ops.len() + 1 >= self.config.max_batch_rows { + self.flush().await?; + } + self.ops.push(SqlOp::Merge(row.into_owned_row())); + Ok(()) + } + + async fn insert_one(&mut self, row: RowRef<'_>) -> Result<()> { + if self.ops.len() + 1 >= self.config.max_batch_rows { + self.flush().await?; + } + self.ops.push(SqlOp::Insert(row.into_owned_row())); + Ok(()) + } + + async fn flush(&mut self) -> Result<()> { + use std::fmt::Write; + if self.ops.is_empty() { + return Ok(()); + } + let mut query_str = String::new(); + let col_num = self.schema.fields.len(); + let mut next_param_id = 1; + let non_pk_col_indices = (0..col_num) + .filter(|idx| !self.pk_indices.contains(idx)) + .collect::>(); + let all_col_names = self + .schema + .fields + .iter() + .map(|f| format!("[{}]", f.name)) + .collect::>() + .join(","); + let all_source_col_names = self + .schema + .fields + .iter() + .map(|f| format!("[SOURCE].[{}]", f.name)) + 
.collect::>() + .join(","); + let pk_match = self + .pk_indices + .iter() + .map(|idx| { + format!( + "[SOURCE].[{}]=[TARGET].[{}]", + &self.schema[*idx].name, &self.schema[*idx].name + ) + }) + .collect::>() + .join(" AND "); + let param_placeholders = |param_id: &mut usize| { + let params = (*param_id..(*param_id + col_num)) + .map(|i| format!("@P{}", i)) + .collect::>() + .join(","); + *param_id += col_num; + params + }; + let set_all_source_col = non_pk_col_indices + .iter() + .map(|idx| { + format!( + "[{}]=[SOURCE].[{}]", + &self.schema[*idx].name, &self.schema[*idx].name + ) + }) + .collect::>() + .join(","); + // TODO: avoid repeating the SQL + for op in &self.ops { + match op { + SqlOp::Insert(_) => { + write!( + &mut query_str, + "INSERT INTO [{}] ({}) VALUES ({});", + self.config.table, + all_col_names, + param_placeholders(&mut next_param_id), + ) + .unwrap(); + } + SqlOp::Merge(_) => { + write!( + &mut query_str, + r#"MERGE [{}] AS [TARGET] + USING (VALUES ({})) AS [SOURCE] ({}) + ON {} + WHEN MATCHED THEN UPDATE SET {} + WHEN NOT MATCHED THEN INSERT ({}) VALUES ({});"#, + self.config.table, + param_placeholders(&mut next_param_id), + all_col_names, + pk_match, + set_all_source_col, + all_col_names, + all_source_col_names, + ) + .unwrap(); + } + SqlOp::Delete(_) => { + write!( + &mut query_str, + r#"DELETE FROM [{}] WHERE {};"#, + self.config.table, + self.pk_indices + .iter() + .map(|idx| { + let condition = + format!("[{}]=@P{}", self.schema[*idx].name, next_param_id); + next_param_id += 1; + condition + }) + .collect::>() + .join(" AND "), + ) + .unwrap(); + } + } + } + + let mut query = Query::new(query_str); + for op in self.ops.drain(..) { + match op { + SqlOp::Insert(row) => { + bind_params(&mut query, row, &self.schema, 0..col_num)?; + } + SqlOp::Merge(row) => { + bind_params(&mut query, row, &self.schema, 0..col_num)?; + } + SqlOp::Delete(row) => { + bind_params( + &mut query, + row, + &self.schema, + self.pk_indices.iter().copied(), + )?; + } + } + } + query.execute(&mut self.sql_client.client).await?; + Ok(()) + } +} + +#[async_trait] +impl SinkWriter for SqlServerSinkWriter { + async fn begin_epoch(&mut self, _epoch: u64) -> Result<()> { + Ok(()) + } + + async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> { + for (op, row) in chunk.rows() { + match op { + Op::Insert => { + if self.is_append_only { + self.insert_one(row).await?; + } else { + self.upsert_one(row).await?; + } + } + Op::UpdateInsert => { + debug_assert!(!self.is_append_only); + self.upsert_one(row).await?; + } + Op::Delete => { + debug_assert!(!self.is_append_only); + self.delete_one(row).await?; + } + Op::UpdateDelete => {} + } + } + Ok(()) + } + + async fn barrier(&mut self, is_checkpoint: bool) -> Result { + if is_checkpoint { + self.flush().await?; + } + Ok(()) + } + + async fn abort(&mut self) -> Result<()> { + Ok(()) + } + + async fn update_vnode_bitmap(&mut self, _vnode_bitmap: Arc) -> Result<()> { + Ok(()) + } +} + +struct SqlClient { + client: Client>, +} + +impl SqlClient { + async fn new(msconfig: &SqlServerConfig) -> Result { + let mut config = Config::new(); + config.host(&msconfig.host); + config.port(msconfig.port); + config.authentication(AuthMethod::sql_server(&msconfig.user, &msconfig.password)); + config.database(&msconfig.database); + config.trust_cert(); + + let tcp = TcpStream::connect(config.get_addr()).await.unwrap(); + tcp.set_nodelay(true).unwrap(); + let client = Client::connect(config, tcp.compat_write()).await?; + Ok(Self { client }) + } +} + +fn bind_params( 
+ query: &mut Query<'_>, + row: impl Row, + schema: &Schema, + col_indices: impl Iterator, +) -> Result<()> { + use risingwave_common::types::ScalarRefImpl; + for col_idx in col_indices { + match row.datum_at(col_idx) { + Some(data_ref) => match data_ref { + ScalarRefImpl::Int16(v) => query.bind(v), + ScalarRefImpl::Int32(v) => query.bind(v), + ScalarRefImpl::Int64(v) => query.bind(v), + ScalarRefImpl::Float32(v) => query.bind(v.into_inner()), + ScalarRefImpl::Float64(v) => query.bind(v.into_inner()), + ScalarRefImpl::Utf8(v) => query.bind(v.to_owned()), + ScalarRefImpl::Bool(v) => query.bind(v), + ScalarRefImpl::Decimal(v) => match v { + Decimal::Normalized(d) => { + query.bind(decimal_to_sql(&d)); + } + Decimal::NaN | Decimal::PositiveInf | Decimal::NegativeInf => { + tracing::warn!( + "Inf, -Inf, Nan in RisingWave decimal is converted into SQL Server null!" + ); + query.bind(None as Option); + } + }, + ScalarRefImpl::Date(v) => query.bind(v.0), + ScalarRefImpl::Timestamp(v) => query.bind(v.0), + ScalarRefImpl::Timestamptz(v) => query.bind(v.timestamp_micros()), + ScalarRefImpl::Time(v) => query.bind(v.0), + ScalarRefImpl::Bytea(v) => query.bind(v.to_vec()), + ScalarRefImpl::Interval(_) => return Err(data_type_not_supported("Interval")), + ScalarRefImpl::Jsonb(_) => return Err(data_type_not_supported("Jsonb")), + ScalarRefImpl::Struct(_) => return Err(data_type_not_supported("Struct")), + ScalarRefImpl::List(_) => return Err(data_type_not_supported("List")), + ScalarRefImpl::Int256(_) => return Err(data_type_not_supported("Int256")), + ScalarRefImpl::Serial(_) => return Err(data_type_not_supported("Serial")), + }, + None => match schema[col_idx].data_type { + DataType::Boolean => { + query.bind(None as Option); + } + DataType::Int16 => { + query.bind(None as Option); + } + DataType::Int32 => { + query.bind(None as Option); + } + DataType::Int64 => { + query.bind(None as Option); + } + DataType::Float32 => { + query.bind(None as Option); + } + DataType::Float64 => { + query.bind(None as Option); + } + DataType::Decimal => { + query.bind(None as Option); + } + DataType::Date => { + query.bind(None as Option); + } + DataType::Time => { + query.bind(None as Option); + } + DataType::Timestamp => { + query.bind(None as Option); + } + DataType::Timestamptz => { + query.bind(None as Option); + } + DataType::Varchar => { + query.bind(None as Option); + } + DataType::Bytea => { + query.bind(None as Option>); + } + DataType::Interval => return Err(data_type_not_supported("Interval")), + DataType::Struct(_) => return Err(data_type_not_supported("Struct")), + DataType::List(_) => return Err(data_type_not_supported("List")), + DataType::Jsonb => return Err(data_type_not_supported("Jsonb")), + DataType::Serial => return Err(data_type_not_supported("Serial")), + DataType::Int256 => return Err(data_type_not_supported("Int256")), + }, + }; + } + Ok(()) +} + +fn data_type_not_supported(data_type_name: &str) -> SinkError { + SinkError::SqlServer(anyhow!(format!( + "{data_type_name} is not supported in SQL Server" + ))) +} + +fn check_data_type_compatibility(data_type: &DataType) -> Result<()> { + match data_type { + DataType::Boolean + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Decimal + | DataType::Date + | DataType::Varchar + | DataType::Time + | DataType::Timestamp + | DataType::Timestamptz + | DataType::Bytea => Ok(()), + DataType::Interval => Err(data_type_not_supported("Interval")), + DataType::Struct(_) => 
Err(data_type_not_supported("Struct")), + DataType::List(_) => Err(data_type_not_supported("List")), + DataType::Jsonb => Err(data_type_not_supported("Jsonb")), + DataType::Serial => Err(data_type_not_supported("Serial")), + DataType::Int256 => Err(data_type_not_supported("Int256")), + } +} + +/// The implementation is copied from tiberius crate. +fn decimal_to_sql(decimal: &rust_decimal::Decimal) -> Numeric { + let unpacked = decimal.unpack(); + + let mut value = (((unpacked.hi as u128) << 64) + + ((unpacked.mid as u128) << 32) + + unpacked.lo as u128) as i128; + + if decimal.is_sign_negative() { + value = -value; + } + + Numeric::new_with_scale(value, decimal.scale() as u8) +} diff --git a/src/connector/src/with_options.rs b/src/connector/src/with_options.rs index 7e0eb4ce71604..3207a7bbbde2f 100644 --- a/src/connector/src/with_options.rs +++ b/src/connector/src/with_options.rs @@ -53,6 +53,7 @@ impl WithOptions for BTreeMap {} impl WithOptions for String {} impl WithOptions for bool {} impl WithOptions for usize {} +impl WithOptions for u16 {} impl WithOptions for u32 {} impl WithOptions for u64 {} impl WithOptions for i32 {} diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index 54bde5a8325e2..731bb900335ee 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -751,6 +751,33 @@ SnowflakeConfig: field_type: String comments: The s3 region, e.g., us-east-2 required: true +SqlServerConfig: + fields: + - name: sqlserver.host + field_type: String + required: true + - name: sqlserver.port + field_type: u16 + required: true + - name: sqlserver.user + field_type: String + required: true + - name: sqlserver.password + field_type: String + required: true + - name: sqlserver.database + field_type: String + required: true + - name: sqlserver.table + field_type: String + required: true + - name: sqlserver.max_batch_rows + field_type: usize + required: false + default: '1024' + - name: r#type + field_type: String + required: true StarrocksConfig: fields: - name: starrocks.host From 3e1ee0d78d4a97d6c23830c198d242233a53264b Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:11:55 +0800 Subject: [PATCH 02/19] fix(meta): include secrets in metadata backup (#17225) --- ci/workflows/pull-request.yml | 18 ++ proto/backup_service.proto | 1 + src/meta/model_v2/src/secret.rs | 3 +- .../meta_snapshot_builder_v2.rs | 168 ++----------- .../src/backup_restore/restore_impl/v2.rs | 46 ++-- src/storage/backup/src/lib.rs | 4 + src/storage/backup/src/meta_snapshot_v2.rs | 220 ++++++++---------- 7 files changed, 164 insertions(+), 296 deletions(-) diff --git a/ci/workflows/pull-request.yml b/ci/workflows/pull-request.yml index 20f0ef4128247..cc5e670fe078a 100644 --- a/ci/workflows/pull-request.yml +++ b/ci/workflows/pull-request.yml @@ -121,6 +121,24 @@ steps: timeout_in_minutes: 8 retry: *auto-retry + - label: "meta backup test" + key: "e2e-meta-backup-test" + command: "ci/scripts/run-meta-backup-test.sh -p ci-dev -m ci-3streaming-2serving-3fe" + if: | + build.pull_request.labels includes "ci/run-e2e-meta-backup-test" + depends_on: + - "build" + - "build-other" + - "docslt" + plugins: + - docker-compose#v5.1.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 45 + retry: *auto-retry + - label: "end-to-end test (parallel)" command: "ci/scripts/e2e-test-parallel.sh -p ci-dev" if: | diff --git 
a/proto/backup_service.proto b/proto/backup_service.proto index 75420149042d9..48fe46ed7eac2 100644 --- a/proto/backup_service.proto +++ b/proto/backup_service.proto @@ -49,6 +49,7 @@ message MetaSnapshotMetadata { uint64 safe_epoch = 4; optional uint32 format_version = 5; optional string remarks = 6; + optional string rw_version = 7; } service BackupService { diff --git a/src/meta/model_v2/src/secret.rs b/src/meta/model_v2/src/secret.rs index af3590dd0de58..0d122267bb4bc 100644 --- a/src/meta/model_v2/src/secret.rs +++ b/src/meta/model_v2/src/secret.rs @@ -15,8 +15,9 @@ use risingwave_pb::catalog::PbSecret; use sea_orm::entity::prelude::*; use sea_orm::ActiveValue::Set; +use serde::{Deserialize, Serialize}; -#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq, Serialize, Deserialize)] #[sea_orm(table_name = "secret")] pub struct Model { #[sea_orm(primary_key, auto_increment = false)] diff --git a/src/meta/src/backup_restore/meta_snapshot_builder_v2.rs b/src/meta/src/backup_restore/meta_snapshot_builder_v2.rs index c2bafcd071c68..4a81ae9d6ad08 100644 --- a/src/meta/src/backup_restore/meta_snapshot_builder_v2.rs +++ b/src/meta/src/backup_restore/meta_snapshot_builder_v2.rs @@ -31,6 +31,25 @@ fn map_db_err(e: DbErr) -> BackupError { BackupError::MetaStorage(e.into()) } +macro_rules! define_set_metadata { + ($( {$name:ident, $mod_path:ident::$mod_name:ident} ),*) => { + pub async fn set_metadata( + metadata: &mut MetadataV2, + txn: &sea_orm::DatabaseTransaction, + ) -> BackupResult<()> { + $( + metadata.$name = $mod_path::$mod_name::Entity::find() + .all(txn) + .await + .map_err(map_db_err)?; + )* + Ok(()) + } + }; +} + +risingwave_backup::for_all_metadata_models_v2!(define_set_metadata); + pub struct MetaSnapshotV2Builder { snapshot: MetaSnapshotV2, meta_store: SqlMetaStore, @@ -91,151 +110,14 @@ impl MetaSnapshotV2Builder { } redo_state }; - let version_stats = model_v2::prelude::HummockVersionStats::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let compaction_configs = model_v2::prelude::CompactionConfig::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let actors = model_v2::prelude::Actor::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let clusters = model_v2::prelude::Cluster::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let actor_dispatchers = model_v2::prelude::ActorDispatcher::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let catalog_versions = model_v2::prelude::CatalogVersion::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let connections = model_v2::prelude::Connection::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let databases = model_v2::prelude::Database::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let fragments = model_v2::prelude::Fragment::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let functions = model_v2::prelude::Function::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let indexes = model_v2::prelude::Index::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let objects = model_v2::prelude::Object::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let object_dependencies = model_v2::prelude::ObjectDependency::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let schemas = model_v2::prelude::Schema::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let sinks = model_v2::prelude::Sink::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let sources = model_v2::prelude::Source::find() - 
.all(&txn) - .await - .map_err(map_db_err)?; - let streaming_jobs = model_v2::prelude::StreamingJob::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let subscriptions = model_v2::prelude::Subscription::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let system_parameters = model_v2::prelude::SystemParameter::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let tables = model_v2::prelude::Table::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let users = model_v2::prelude::User::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let user_privileges = model_v2::prelude::UserPrivilege::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let views = model_v2::prelude::View::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let workers = model_v2::prelude::Worker::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let worker_properties = model_v2::prelude::WorkerProperty::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let hummock_sequences = model_v2::prelude::HummockSequence::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let seaql_migrations = model_v2::serde_seaql_migration::Entity::find() - .all(&txn) - .await - .map_err(map_db_err)?; - let session_parameters = model_v2::prelude::SessionParameter::find() - .all(&txn) - .await - .map_err(map_db_err)?; - - txn.commit().await.map_err(map_db_err)?; - self.snapshot.metadata = MetadataV2 { - seaql_migrations, + let mut metadata = MetadataV2 { hummock_version, - version_stats, - compaction_configs, - actors, - clusters, - actor_dispatchers, - catalog_versions, - connections, - databases, - fragments, - functions, - indexes, - objects, - object_dependencies, - schemas, - sinks, - sources, - streaming_jobs, - subscriptions, - system_parameters, - tables, - users, - user_privileges, - views, - workers, - worker_properties, - hummock_sequences, - session_parameters, + ..Default::default() }; + set_metadata(&mut metadata, &txn).await?; + + txn.commit().await.map_err(map_db_err)?; + self.snapshot.metadata = metadata; Ok(()) } diff --git a/src/meta/src/backup_restore/restore_impl/v2.rs b/src/meta/src/backup_restore/restore_impl/v2.rs index cccb5c9b641d7..13492c56316a2 100644 --- a/src/meta/src/backup_restore/restore_impl/v2.rs +++ b/src/meta/src/backup_restore/restore_impl/v2.rs @@ -93,40 +93,28 @@ impl WriterModelV2ToMetaStoreV2 { } } +macro_rules! 
define_write_model_v2_to_meta_store_v2 { + ($( {$name:ident, $mod_path:ident::$mod_name:ident} ),*) => { + async fn write_model_v2_to_meta_store_v2( + metadata: &risingwave_backup::meta_snapshot_v2::MetadataV2, + db: &sea_orm::DatabaseConnection, + ) -> BackupResult<()> { + $( + insert_models(metadata.$name.clone(), db).await?; + )* + Ok(()) + } + }; +} + +risingwave_backup::for_all_metadata_models_v2!(define_write_model_v2_to_meta_store_v2); + #[async_trait::async_trait] impl Writer for WriterModelV2ToMetaStoreV2 { async fn write(&self, target_snapshot: MetaSnapshot) -> BackupResult<()> { let metadata = target_snapshot.metadata; let db = &self.meta_store.conn; - insert_models(metadata.seaql_migrations.clone(), db).await?; - insert_models(metadata.clusters.clone(), db).await?; - insert_models(metadata.version_stats.clone(), db).await?; - insert_models(metadata.compaction_configs.clone(), db).await?; - insert_models(metadata.hummock_sequences.clone(), db).await?; - insert_models(metadata.workers.clone(), db).await?; - insert_models(metadata.worker_properties.clone(), db).await?; - insert_models(metadata.users.clone(), db).await?; - insert_models(metadata.user_privileges.clone(), db).await?; - insert_models(metadata.objects.clone(), db).await?; - insert_models(metadata.object_dependencies.clone(), db).await?; - insert_models(metadata.databases.clone(), db).await?; - insert_models(metadata.schemas.clone(), db).await?; - insert_models(metadata.streaming_jobs.clone(), db).await?; - insert_models(metadata.fragments.clone(), db).await?; - insert_models(metadata.actors.clone(), db).await?; - insert_models(metadata.actor_dispatchers.clone(), db).await?; - insert_models(metadata.connections.clone(), db).await?; - insert_models(metadata.sources.clone(), db).await?; - insert_models(metadata.tables.clone(), db).await?; - insert_models(metadata.sinks.clone(), db).await?; - insert_models(metadata.views.clone(), db).await?; - insert_models(metadata.indexes.clone(), db).await?; - insert_models(metadata.functions.clone(), db).await?; - insert_models(metadata.system_parameters.clone(), db).await?; - insert_models(metadata.catalog_versions.clone(), db).await?; - insert_models(metadata.subscriptions.clone(), db).await?; - insert_models(metadata.session_parameters.clone(), db).await?; - + write_model_v2_to_meta_store_v2(&metadata, db).await?; // update_auto_inc must be called last. 
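+        // (Presumably so that the auto-increment sequences are advanced based
+        // on the rows restored by the bulk inserts above.)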
update_auto_inc(&metadata, db).await?; Ok(()) diff --git a/src/storage/backup/src/lib.rs b/src/storage/backup/src/lib.rs index 56c3ed551a282..ed8dccf8d1e49 100644 --- a/src/storage/backup/src/lib.rs +++ b/src/storage/backup/src/lib.rs @@ -36,6 +36,7 @@ use std::collections::HashSet; use std::hash::Hasher; use itertools::Itertools; +use risingwave_common::RW_VERSION; use risingwave_hummock_sdk::version::HummockVersion; use risingwave_hummock_sdk::{HummockSstableObjectId, HummockVersionId}; use risingwave_pb::backup_service::{PbMetaSnapshotManifest, PbMetaSnapshotMetadata}; @@ -57,6 +58,7 @@ pub struct MetaSnapshotMetadata { #[serde(default)] pub format_version: u32, pub remarks: Option, + pub rw_version: Option, } impl MetaSnapshotMetadata { @@ -74,6 +76,7 @@ impl MetaSnapshotMetadata { safe_epoch: v.visible_table_safe_epoch(), format_version, remarks, + rw_version: Some(RW_VERSION.to_owned()), } } } @@ -112,6 +115,7 @@ impl From<&MetaSnapshotMetadata> for PbMetaSnapshotMetadata { safe_epoch: m.safe_epoch, format_version: Some(m.format_version), remarks: m.remarks.clone(), + rw_version: m.rw_version.clone(), } } } diff --git a/src/storage/backup/src/meta_snapshot_v2.rs b/src/storage/backup/src/meta_snapshot_v2.rs index feb5f12540d03..bec07a80cf19d 100644 --- a/src/storage/backup/src/meta_snapshot_v2.rs +++ b/src/storage/backup/src/meta_snapshot_v2.rs @@ -16,7 +16,6 @@ use std::fmt::{Display, Formatter}; use bytes::{Buf, BufMut}; use risingwave_hummock_sdk::version::HummockVersion; -use risingwave_meta_model_v2 as model_v2; use serde::{Deserialize, Serialize}; use crate::meta_snapshot::{MetaSnapshot, Metadata}; @@ -29,39 +28,100 @@ impl From for BackupError { } } -#[derive(Default)] -pub struct MetadataV2 { - pub seaql_migrations: Vec, - pub hummock_version: HummockVersion, - pub version_stats: Vec, - pub compaction_configs: Vec, - pub actors: Vec, - pub clusters: Vec, - pub actor_dispatchers: Vec, - pub catalog_versions: Vec, - pub connections: Vec, - pub databases: Vec, - pub fragments: Vec, - pub functions: Vec, - pub indexes: Vec, - pub objects: Vec, - pub object_dependencies: Vec, - pub schemas: Vec, - pub sinks: Vec, - pub sources: Vec, - pub streaming_jobs: Vec, - pub subscriptions: Vec, - pub system_parameters: Vec, - pub tables: Vec, - pub users: Vec, - pub user_privileges: Vec, - pub views: Vec, - pub workers: Vec, - pub worker_properties: Vec, - pub hummock_sequences: Vec, - pub session_parameters: Vec, +/// Add new item in the end. Do not change the order. +#[macro_export] +macro_rules! for_all_metadata_models_v2 { + ($macro:ident) => { + $macro! 
{ + {seaql_migrations, risingwave_meta_model_v2::serde_seaql_migration}, + {version_stats, risingwave_meta_model_v2::hummock_version_stats}, + {compaction_configs, risingwave_meta_model_v2::compaction_config}, + {actors, risingwave_meta_model_v2::actor}, + {clusters, risingwave_meta_model_v2::cluster}, + {actor_dispatchers, risingwave_meta_model_v2::actor_dispatcher}, + {catalog_versions, risingwave_meta_model_v2::catalog_version}, + {connections, risingwave_meta_model_v2::connection}, + {databases, risingwave_meta_model_v2::database}, + {fragments, risingwave_meta_model_v2::fragment}, + {functions, risingwave_meta_model_v2::function}, + {indexes, risingwave_meta_model_v2::index}, + {objects, risingwave_meta_model_v2::object}, + {object_dependencies, risingwave_meta_model_v2::object_dependency}, + {schemas, risingwave_meta_model_v2::schema}, + {sinks, risingwave_meta_model_v2::sink}, + {sources, risingwave_meta_model_v2::source}, + {streaming_jobs, risingwave_meta_model_v2::streaming_job}, + {subscriptions, risingwave_meta_model_v2::subscription}, + {system_parameters, risingwave_meta_model_v2::system_parameter}, + {tables, risingwave_meta_model_v2::table}, + {users, risingwave_meta_model_v2::user}, + {user_privileges, risingwave_meta_model_v2::user_privilege}, + {views, risingwave_meta_model_v2::view}, + {workers, risingwave_meta_model_v2::worker}, + {worker_properties, risingwave_meta_model_v2::worker_property}, + {hummock_sequences, risingwave_meta_model_v2::hummock_sequence}, + {session_parameters, risingwave_meta_model_v2::session_parameter}, + {secrets, risingwave_meta_model_v2::secret} + } + }; } +macro_rules! define_metadata_v2 { + ($({ $name:ident, $mod_path:ident::$mod_name:ident }),*) => { + #[derive(Default)] + pub struct MetadataV2 { + pub hummock_version: HummockVersion, + $( + pub $name: Vec<$mod_path::$mod_name::Model>, + )* + } + }; +} + +for_all_metadata_models_v2!(define_metadata_v2); + +macro_rules! define_encode_metadata { + ($( {$name:ident, $mod_path:ident::$mod_name:ident} ),*) => { + fn encode_metadata( + metadata: &MetadataV2, + buf: &mut Vec, + ) -> BackupResult<()> { + let mut _idx = 0; + $( + if _idx == 1 { + put_1(buf, &metadata.hummock_version.to_protobuf())?; + } + put_n(buf, &metadata.$name)?; + _idx += 1; + )* + Ok(()) + } + }; +} + +for_all_metadata_models_v2!(define_encode_metadata); + +macro_rules! 
define_decode_metadata { + ($( {$name:ident, $mod_path:ident::$mod_name:ident} ),*) => { + fn decode_metadata( + metadata: &mut MetadataV2, + mut buf: &[u8], + ) -> BackupResult<()> { + let mut _idx = 0; + $( + if _idx == 1 { + metadata.hummock_version = HummockVersion::from_persisted_protobuf(&get_1(&mut buf)?); + } + metadata.$name = get_n(&mut buf)?; + _idx += 1; + )* + Ok(()) + } + }; +} + +for_all_metadata_models_v2!(define_decode_metadata); + impl Display for MetadataV2 { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { writeln!(f, "clusters: {:#?}", self.clusters)?; @@ -77,102 +137,16 @@ impl Display for MetadataV2 { impl Metadata for MetadataV2 { fn encode_to(&self, buf: &mut Vec) -> BackupResult<()> { - put_n(buf, &self.seaql_migrations)?; - put_1(buf, &self.hummock_version.to_protobuf())?; - put_n(buf, &self.version_stats)?; - put_n(buf, &self.compaction_configs)?; - put_n(buf, &self.actors)?; - put_n(buf, &self.clusters)?; - put_n(buf, &self.actor_dispatchers)?; - put_n(buf, &self.catalog_versions)?; - put_n(buf, &self.connections)?; - put_n(buf, &self.databases)?; - put_n(buf, &self.fragments)?; - put_n(buf, &self.functions)?; - put_n(buf, &self.indexes)?; - put_n(buf, &self.objects)?; - put_n(buf, &self.object_dependencies)?; - put_n(buf, &self.schemas)?; - put_n(buf, &self.sinks)?; - put_n(buf, &self.sources)?; - put_n(buf, &self.streaming_jobs)?; - put_n(buf, &self.subscriptions)?; - put_n(buf, &self.system_parameters)?; - put_n(buf, &self.tables)?; - put_n(buf, &self.users)?; - put_n(buf, &self.user_privileges)?; - put_n(buf, &self.views)?; - put_n(buf, &self.workers)?; - put_n(buf, &self.worker_properties)?; - put_n(buf, &self.hummock_sequences)?; - put_n(buf, &self.session_parameters)?; - Ok(()) + encode_metadata(self, buf) } - fn decode(mut buf: &[u8]) -> BackupResult + fn decode(buf: &[u8]) -> BackupResult where Self: Sized, { - let seaql_migrations = get_n(&mut buf)?; - let pb_hummock_version = get_1(&mut buf)?; - let version_stats = get_n(&mut buf)?; - let compaction_configs = get_n(&mut buf)?; - let actors = get_n(&mut buf)?; - let clusters = get_n(&mut buf)?; - let actor_dispatchers = get_n(&mut buf)?; - let catalog_versions = get_n(&mut buf)?; - let connections = get_n(&mut buf)?; - let databases = get_n(&mut buf)?; - let fragments = get_n(&mut buf)?; - let functions = get_n(&mut buf)?; - let indexes = get_n(&mut buf)?; - let objects = get_n(&mut buf)?; - let object_dependencies = get_n(&mut buf)?; - let schemas = get_n(&mut buf)?; - let sinks = get_n(&mut buf)?; - let sources = get_n(&mut buf)?; - let streaming_jobs = get_n(&mut buf)?; - let subscriptions = get_n(&mut buf)?; - let system_parameters = get_n(&mut buf)?; - let tables = get_n(&mut buf)?; - let users = get_n(&mut buf)?; - let user_privileges = get_n(&mut buf)?; - let views = get_n(&mut buf)?; - let workers = get_n(&mut buf)?; - let worker_properties = get_n(&mut buf)?; - let hummock_sequences = get_n(&mut buf)?; - let session_parameters = get_n(&mut buf)?; - Ok(Self { - seaql_migrations, - hummock_version: HummockVersion::from_persisted_protobuf(&pb_hummock_version), - version_stats, - compaction_configs, - actors, - clusters, - actor_dispatchers, - catalog_versions, - connections, - databases, - fragments, - functions, - indexes, - objects, - object_dependencies, - schemas, - sinks, - sources, - streaming_jobs, - subscriptions, - system_parameters, - tables, - users, - user_privileges, - views, - workers, - worker_properties, - hummock_sequences, - session_parameters, - }) + let mut metadata 
= Self::default(); + decode_metadata(&mut metadata, buf)?; + Ok(metadata) } fn hummock_version_ref(&self) -> &HummockVersion { From 9abdbc9329f7108213264fc247906bfc07b39541 Mon Sep 17 00:00:00 2001 From: Patrick Lewis Date: Wed, 12 Jun 2024 22:16:04 -0400 Subject: [PATCH 03/19] chore(ci): remove obsolete version property from docker compose configs (#16566) --- ci/docker-compose.yml | 1 - docker/docker-compose-distributed.yml | 1 - docker/docker-compose-with-azblob.yml | 1 - docker/docker-compose-with-gcs.yml | 1 - docker/docker-compose-with-hdfs.yml | 1 - docker/docker-compose-with-local-fs.yml | 1 - docker/docker-compose-with-obs.yml | 1 - docker/docker-compose-with-oss.yml | 1 - docker/docker-compose-with-s3.yml | 1 - docker/docker-compose-with-sqlite.yml | 1 - docker/docker-compose.yml | 1 - integration_tests/ad-click/docker-compose.yml | 1 - integration_tests/ad-ctr/docker-compose.yml | 1 - integration_tests/big-query-sink/docker-compose.yml | 1 - .../cassandra-and-scylladb-sink/docker-compose.yml | 1 - integration_tests/cdn-metrics/docker-compose.yml | 1 - integration_tests/citus-cdc/docker-compose.yml | 1 - integration_tests/clickhouse-sink/docker-compose.yml | 1 - integration_tests/clickstream/docker-compose.yml | 1 - integration_tests/client-library/docker-compose.yml | 1 - integration_tests/cockroach-sink/docker-compose.yml | 1 - integration_tests/debezium-mysql/docker-compose.yml | 1 - integration_tests/debezium-postgres/docker-compose.yml | 1 - integration_tests/debezium-sqlserver/docker-compose.yml | 1 - integration_tests/deltalake-sink/docker-compose.yml | 1 - integration_tests/doris-sink/docker-compose.yml | 1 - integration_tests/elasticsearch-sink/docker-compose.yml | 1 - integration_tests/feature-store/docker-compose.yml | 1 - integration_tests/http-sink/docker-compose.yml | 1 - integration_tests/iceberg-sink/docker-compose.yml | 1 - integration_tests/kafka-cdc-sink/docker-compose.yml | 1 - integration_tests/kafka-cdc/docker-compose.yml | 1 - integration_tests/kinesis-s3-source/docker-compose.yml | 1 - integration_tests/livestream/docker-compose.yml | 1 - integration_tests/mindsdb/docker-compose.yml | 1 - integration_tests/mongodb-cdc/docker-compose.yaml | 1 - integration_tests/mongodb/docker-compose.yaml | 1 - integration_tests/mqtt/docker-compose.yml | 1 - integration_tests/mysql-cdc/docker-compose.yml | 1 - integration_tests/mysql-sink/docker-compose.yml | 1 - integration_tests/nats/docker-compose.yml | 1 - integration_tests/pinot-sink/docker-compose.yml | 1 - integration_tests/postgres-cdc/docker-compose.yml | 1 - integration_tests/postgres-sink/docker-compose.yml | 1 - integration_tests/presto-trino/docker-compose.yml | 1 - integration_tests/prometheus/docker-compose.yml | 1 - integration_tests/redis-sink/docker-compose.yml | 1 - integration_tests/schema-registry/docker-compose.yml | 1 - integration_tests/starrocks-sink/docker-compose.yml | 1 - integration_tests/superset/docker-compose.yml | 1 - integration_tests/twitter-pulsar/docker-compose.yml | 1 - integration_tests/twitter/docker-compose.yml | 1 - integration_tests/upsert-avro/docker-compose.yml | 3 +-- integration_tests/vector/docker-compose.yml | 1 - src/risedevtool/src/bin/risedev-compose.rs | 2 -- src/risedevtool/src/compose.rs | 1 - 56 files changed, 1 insertion(+), 58 deletions(-) diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index dd10bc8c98c1d..b2a885a4ba2e6 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.9" services: db: image: postgres:15-alpine 
diff --git a/docker/docker-compose-distributed.yml b/docker/docker-compose-distributed.yml index f94437f089197..80ce64cb90531 100644 --- a/docker/docker-compose-distributed.yml +++ b/docker/docker-compose-distributed.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-azblob.yml b/docker/docker-compose-with-azblob.yml index 9fc76c4051817..e1bf11bb28ff0 100644 --- a/docker/docker-compose-with-azblob.yml +++ b/docker/docker-compose-with-azblob.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-gcs.yml b/docker/docker-compose-with-gcs.yml index 36433d549c1c7..fcad2692f474d 100644 --- a/docker/docker-compose-with-gcs.yml +++ b/docker/docker-compose-with-gcs.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-hdfs.yml b/docker/docker-compose-with-hdfs.yml index cf2b45078bac5..974cf922e77b4 100644 --- a/docker/docker-compose-with-hdfs.yml +++ b/docker/docker-compose-with-hdfs.yml @@ -1,5 +1,4 @@ --- -version: "3" services: compactor-0: image: ghcr.io/risingwavelabs/risingwave:RisingWave_1.6.1_HDFS_2.7-x86_64 diff --git a/docker/docker-compose-with-local-fs.yml b/docker/docker-compose-with-local-fs.yml index ab4545d649821..d7b5e22c36243 100644 --- a/docker/docker-compose-with-local-fs.yml +++ b/docker/docker-compose-with-local-fs.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-obs.yml b/docker/docker-compose-with-obs.yml index b92d0cb077a16..45c84e32f652d 100644 --- a/docker/docker-compose-with-obs.yml +++ b/docker/docker-compose-with-obs.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-oss.yml b/docker/docker-compose-with-oss.yml index adbfee86cc839..25aeae746e3b7 100644 --- a/docker/docker-compose-with-oss.yml +++ b/docker/docker-compose-with-oss.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-s3.yml b/docker/docker-compose-with-s3.yml index eabe1f023ced7..75ffded8edcbc 100644 --- a/docker/docker-compose-with-s3.yml +++ b/docker/docker-compose-with-s3.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose-with-sqlite.yml b/docker/docker-compose-with-sqlite.yml index f65a4af7bc29f..065a60610f333 100644 --- a/docker/docker-compose-with-sqlite.yml +++ b/docker/docker-compose-with-sqlite.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5ba67f78c3f56..6042fbba261df 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" x-image: &image image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.9.1-rc.2} services: diff --git a/integration_tests/ad-click/docker-compose.yml b/integration_tests/ad-click/docker-compose.yml index 62d5c3fb76517..2b84bb00d9950 100644 --- a/integration_tests/ad-click/docker-compose.yml +++ 
b/integration_tests/ad-click/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/ad-ctr/docker-compose.yml b/integration_tests/ad-ctr/docker-compose.yml index 0298f014db11a..2bec6c35295b8 100644 --- a/integration_tests/ad-ctr/docker-compose.yml +++ b/integration_tests/ad-ctr/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/big-query-sink/docker-compose.yml b/integration_tests/big-query-sink/docker-compose.yml index 6c93903df8bba..279f43a43b217 100644 --- a/integration_tests/big-query-sink/docker-compose.yml +++ b/integration_tests/big-query-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/cassandra-and-scylladb-sink/docker-compose.yml b/integration_tests/cassandra-and-scylladb-sink/docker-compose.yml index 0fa224ddab9d0..9f09b203ef700 100644 --- a/integration_tests/cassandra-and-scylladb-sink/docker-compose.yml +++ b/integration_tests/cassandra-and-scylladb-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: cassandra: image: cassandra:4.0 diff --git a/integration_tests/cdn-metrics/docker-compose.yml b/integration_tests/cdn-metrics/docker-compose.yml index 87adef35f8cf4..05d3d786e6279 100644 --- a/integration_tests/cdn-metrics/docker-compose.yml +++ b/integration_tests/cdn-metrics/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/citus-cdc/docker-compose.yml b/integration_tests/citus-cdc/docker-compose.yml index 6ce8341047ee4..8afb665e02cd1 100644 --- a/integration_tests/citus-cdc/docker-compose.yml +++ b/integration_tests/citus-cdc/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/clickhouse-sink/docker-compose.yml b/integration_tests/clickhouse-sink/docker-compose.yml index 1cf61ff8dfa30..beb2ee1254739 100644 --- a/integration_tests/clickhouse-sink/docker-compose.yml +++ b/integration_tests/clickhouse-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: clickhouse-server: image: clickhouse/clickhouse-server:23.3.8.21-alpine diff --git a/integration_tests/clickstream/docker-compose.yml b/integration_tests/clickstream/docker-compose.yml index 857c93f0d7577..4015a3a976ced 100644 --- a/integration_tests/clickstream/docker-compose.yml +++ b/integration_tests/clickstream/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/client-library/docker-compose.yml b/integration_tests/client-library/docker-compose.yml index c6868eaa42140..c8a03d353b18e 100644 --- a/integration_tests/client-library/docker-compose.yml +++ b/integration_tests/client-library/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/cockroach-sink/docker-compose.yml b/integration_tests/cockroach-sink/docker-compose.yml index b6b0c8d9e6c5f..d325c57865baf 100644 --- a/integration_tests/cockroach-sink/docker-compose.yml +++ b/integration_tests/cockroach-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/debezium-mysql/docker-compose.yml b/integration_tests/debezium-mysql/docker-compose.yml index 3462e5e3d09d1..6cb577ac23886 100644 --- a/integration_tests/debezium-mysql/docker-compose.yml +++ 
b/integration_tests/debezium-mysql/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/debezium-postgres/docker-compose.yml b/integration_tests/debezium-postgres/docker-compose.yml index c81c33fb3e455..327cb44d6db7c 100644 --- a/integration_tests/debezium-postgres/docker-compose.yml +++ b/integration_tests/debezium-postgres/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/debezium-sqlserver/docker-compose.yml b/integration_tests/debezium-sqlserver/docker-compose.yml index e88cb36e548b7..9d4bbbf0a5bb6 100644 --- a/integration_tests/debezium-sqlserver/docker-compose.yml +++ b/integration_tests/debezium-sqlserver/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/deltalake-sink/docker-compose.yml b/integration_tests/deltalake-sink/docker-compose.yml index 70b1e3c22e325..2a799f9fcf45b 100644 --- a/integration_tests/deltalake-sink/docker-compose.yml +++ b/integration_tests/deltalake-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: spark: image: apache/spark:3.3.1 diff --git a/integration_tests/doris-sink/docker-compose.yml b/integration_tests/doris-sink/docker-compose.yml index e1a7f1ef5e90e..4b43632f51319 100644 --- a/integration_tests/doris-sink/docker-compose.yml +++ b/integration_tests/doris-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: fe: platform: linux/amd64 diff --git a/integration_tests/elasticsearch-sink/docker-compose.yml b/integration_tests/elasticsearch-sink/docker-compose.yml index c885b7136a606..097de4beb5490 100644 --- a/integration_tests/elasticsearch-sink/docker-compose.yml +++ b/integration_tests/elasticsearch-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: elasticsearch7: image: docker.elastic.co/elasticsearch/elasticsearch:7.11.0 diff --git a/integration_tests/feature-store/docker-compose.yml b/integration_tests/feature-store/docker-compose.yml index 71633cce20a19..77de22a8c3522 100644 --- a/integration_tests/feature-store/docker-compose.yml +++ b/integration_tests/feature-store/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: kafka: image: confluentinc/cp-kafka:7.1.0 diff --git a/integration_tests/http-sink/docker-compose.yml b/integration_tests/http-sink/docker-compose.yml index 12546c4f5dd28..9a7c42b1443e0 100644 --- a/integration_tests/http-sink/docker-compose.yml +++ b/integration_tests/http-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/iceberg-sink/docker-compose.yml b/integration_tests/iceberg-sink/docker-compose.yml index 91cec5dd24430..84bda01b21ceb 100644 --- a/integration_tests/iceberg-sink/docker-compose.yml +++ b/integration_tests/iceberg-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" x-airflow-common: &airflow-common image: apache/airflow:2.6.2-python3.10 diff --git a/integration_tests/kafka-cdc-sink/docker-compose.yml b/integration_tests/kafka-cdc-sink/docker-compose.yml index 81f892354b8a0..1cebe9b73f284 100644 --- a/integration_tests/kafka-cdc-sink/docker-compose.yml +++ b/integration_tests/kafka-cdc-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/kafka-cdc/docker-compose.yml b/integration_tests/kafka-cdc/docker-compose.yml index f42c4399178d0..6eaa5b5ead7ab 100644 --- 
a/integration_tests/kafka-cdc/docker-compose.yml +++ b/integration_tests/kafka-cdc/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/kinesis-s3-source/docker-compose.yml b/integration_tests/kinesis-s3-source/docker-compose.yml index dc91e2095cbde..74dabde96f7ba 100644 --- a/integration_tests/kinesis-s3-source/docker-compose.yml +++ b/integration_tests/kinesis-s3-source/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/livestream/docker-compose.yml b/integration_tests/livestream/docker-compose.yml index 8dffce371562a..e263b704bc90d 100644 --- a/integration_tests/livestream/docker-compose.yml +++ b/integration_tests/livestream/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/mindsdb/docker-compose.yml b/integration_tests/mindsdb/docker-compose.yml index 40fe4e6192fa3..0cd82b10a6529 100644 --- a/integration_tests/mindsdb/docker-compose.yml +++ b/integration_tests/mindsdb/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/mongodb-cdc/docker-compose.yaml b/integration_tests/mongodb-cdc/docker-compose.yaml index eaf519b440569..de09a204d991b 100644 --- a/integration_tests/mongodb-cdc/docker-compose.yaml +++ b/integration_tests/mongodb-cdc/docker-compose.yaml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/mongodb/docker-compose.yaml b/integration_tests/mongodb/docker-compose.yaml index 59ac89215ec14..a2855c200e6b0 100644 --- a/integration_tests/mongodb/docker-compose.yaml +++ b/integration_tests/mongodb/docker-compose.yaml @@ -1,4 +1,3 @@ -version: "3" services: mongodb: image: mongo:4.4 diff --git a/integration_tests/mqtt/docker-compose.yml b/integration_tests/mqtt/docker-compose.yml index 04f73404be6aa..b91ddd482509c 100644 --- a/integration_tests/mqtt/docker-compose.yml +++ b/integration_tests/mqtt/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/mysql-cdc/docker-compose.yml b/integration_tests/mysql-cdc/docker-compose.yml index c0bba2ccc008b..b2779c42c05b6 100644 --- a/integration_tests/mysql-cdc/docker-compose.yml +++ b/integration_tests/mysql-cdc/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/mysql-sink/docker-compose.yml b/integration_tests/mysql-sink/docker-compose.yml index 3e1fc5544276f..8f8c4f9aa4336 100644 --- a/integration_tests/mysql-sink/docker-compose.yml +++ b/integration_tests/mysql-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/nats/docker-compose.yml b/integration_tests/nats/docker-compose.yml index 891c865744747..930f1a719fd7f 100644 --- a/integration_tests/nats/docker-compose.yml +++ b/integration_tests/nats/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/pinot-sink/docker-compose.yml b/integration_tests/pinot-sink/docker-compose.yml index fc4ad250880ce..c7d08dcc005e9 100644 --- a/integration_tests/pinot-sink/docker-compose.yml +++ b/integration_tests/pinot-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git 
a/integration_tests/postgres-cdc/docker-compose.yml b/integration_tests/postgres-cdc/docker-compose.yml index 7650da0779178..333ee2f4080c3 100644 --- a/integration_tests/postgres-cdc/docker-compose.yml +++ b/integration_tests/postgres-cdc/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/postgres-sink/docker-compose.yml b/integration_tests/postgres-sink/docker-compose.yml index 4d8638fdc3c07..6f5a16db64c24 100644 --- a/integration_tests/postgres-sink/docker-compose.yml +++ b/integration_tests/postgres-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/presto-trino/docker-compose.yml b/integration_tests/presto-trino/docker-compose.yml index a56135a4ae597..5de9dc34eb78a 100644 --- a/integration_tests/presto-trino/docker-compose.yml +++ b/integration_tests/presto-trino/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/prometheus/docker-compose.yml b/integration_tests/prometheus/docker-compose.yml index de3249df9253a..cd840807ea1ac 100644 --- a/integration_tests/prometheus/docker-compose.yml +++ b/integration_tests/prometheus/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/redis-sink/docker-compose.yml b/integration_tests/redis-sink/docker-compose.yml index dce27ae99895c..8f3c3eb9cdd85 100644 --- a/integration_tests/redis-sink/docker-compose.yml +++ b/integration_tests/redis-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: redis: image: 'redis:latest' diff --git a/integration_tests/schema-registry/docker-compose.yml b/integration_tests/schema-registry/docker-compose.yml index 80d4b90e4f7d2..2209fb7e20aee 100644 --- a/integration_tests/schema-registry/docker-compose.yml +++ b/integration_tests/schema-registry/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/starrocks-sink/docker-compose.yml b/integration_tests/starrocks-sink/docker-compose.yml index 70918713643d6..e3a06cc33587a 100644 --- a/integration_tests/starrocks-sink/docker-compose.yml +++ b/integration_tests/starrocks-sink/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: starrocks-fe: image: starrocks/fe-ubuntu:3.1.7 diff --git a/integration_tests/superset/docker-compose.yml b/integration_tests/superset/docker-compose.yml index 746a80fb9a064..3d7b9ed2494ca 100644 --- a/integration_tests/superset/docker-compose.yml +++ b/integration_tests/superset/docker-compose.yml @@ -9,7 +9,6 @@ x-superset-volumes: - ./docker:/app/docker - superset_home:/app/superset_home -version: "3.7" services: risingwave-standalone: extends: diff --git a/integration_tests/twitter-pulsar/docker-compose.yml b/integration_tests/twitter-pulsar/docker-compose.yml index d684be6b876a8..a3e91c9f8751a 100644 --- a/integration_tests/twitter-pulsar/docker-compose.yml +++ b/integration_tests/twitter-pulsar/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/integration_tests/twitter/docker-compose.yml b/integration_tests/twitter/docker-compose.yml index 37b2723cb8e50..e59e71b3839ce 100644 --- a/integration_tests/twitter/docker-compose.yml +++ b/integration_tests/twitter/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git 
a/integration_tests/upsert-avro/docker-compose.yml b/integration_tests/upsert-avro/docker-compose.yml index 291528f6fb319..9176ca053ba4e 100644 --- a/integration_tests/upsert-avro/docker-compose.yml +++ b/integration_tests/upsert-avro/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: @@ -53,4 +52,4 @@ volumes: external: false message_queue: external: false -name: risingwave-compose \ No newline at end of file +name: risingwave-compose diff --git a/integration_tests/vector/docker-compose.yml b/integration_tests/vector/docker-compose.yml index 4c2e6100b714a..13101925ac4a3 100644 --- a/integration_tests/vector/docker-compose.yml +++ b/integration_tests/vector/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: "3" services: risingwave-standalone: extends: diff --git a/src/risedevtool/src/bin/risedev-compose.rs b/src/risedevtool/src/bin/risedev-compose.rs index 5ff56916deca6..89bb0592d0d85 100644 --- a/src/risedevtool/src/bin/risedev-compose.rs +++ b/src/risedevtool/src/bin/risedev-compose.rs @@ -245,7 +245,6 @@ fn main() -> Result<()> { } }); let compose_file = ComposeFile { - version: "3".into(), services: services.clone(), volumes: node_volumes, name: format!("risingwave-{}", opts.profile), @@ -303,7 +302,6 @@ fn main() -> Result<()> { } } let compose_file = ComposeFile { - version: "3".into(), services, volumes, name: format!("risingwave-{}", opts.profile), diff --git a/src/risedevtool/src/compose.rs b/src/risedevtool/src/compose.rs index 779ca23557622..a490560157527 100644 --- a/src/risedevtool/src/compose.rs +++ b/src/risedevtool/src/compose.rs @@ -56,7 +56,6 @@ pub struct HealthCheck { #[derive(Debug, Clone, Serialize)] pub struct ComposeFile { - pub version: String, pub services: BTreeMap, pub volumes: BTreeMap, pub name: String, From 13cdd9525791968624f93459e681b0ae2fd995ab Mon Sep 17 00:00:00 2001 From: Li0k Date: Thu, 13 Jun 2024 11:06:54 +0800 Subject: [PATCH 04/19] feat(storage): remove magic number MAX_COMPACT_LEVEL_COUNT (#17219) --- proto/hummock.proto | 4 ++++ src/common/src/config.rs | 6 +++++- .../src/cmd_impl/hummock/compaction_group.rs | 4 ++++ src/ctl/src/lib.rs | 4 ++++ .../src/hummock/compaction/compaction_config.rs | 1 + .../picker/base_level_compaction_picker.rs | 1 + .../picker/compaction_task_validator.rs | 16 +++++----------- .../picker/intra_compaction_picker.rs | 1 + .../picker/min_overlap_compaction_picker.rs | 17 +++++++++++++++-- src/meta/src/hummock/compaction/picker/mod.rs | 2 -- .../compaction/picker/tier_compaction_picker.rs | 6 +----- .../hummock/manager/compaction_group_manager.rs | 4 +++- 12 files changed, 44 insertions(+), 22 deletions(-) diff --git a/proto/hummock.proto b/proto/hummock.proto index 2fc4c1a6d4b42..c4ac75f2fed64 100644 --- a/proto/hummock.proto +++ b/proto/hummock.proto @@ -706,6 +706,7 @@ message RiseCtlUpdateCompactionConfigRequest { bool enable_emergency_picker = 15; uint32 tombstone_reclaim_ratio = 16; CompressionAlgorithm compression_algorithm = 17; + uint32 max_l0_compact_level_count = 18; } } repeated uint64 compaction_group_ids = 1; @@ -885,6 +886,9 @@ message CompactionConfig { uint32 level0_overlapping_sub_level_compact_level_count = 18; uint32 tombstone_reclaim_ratio = 19; bool enable_emergency_picker = 20; + + // The limitation of the level count of l0 compaction + uint32 max_l0_compact_level_count = 21; } message TableStats { diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 6395cbf37ed15..6cad79849153a 100644 --- a/src/common/src/config.rs +++ 
b/src/common/src/config.rs @@ -1786,8 +1786,8 @@ pub mod default { const DEFAULT_MIN_OVERLAPPING_SUB_LEVEL_COMPACT_LEVEL_COUNT: u32 = 12; const DEFAULT_TOMBSTONE_RATIO_PERCENT: u32 = 40; const DEFAULT_EMERGENCY_PICKER: bool = true; - const DEFAULT_MAX_LEVEL: u32 = 6; + const DEFAULT_MAX_LEVEL: u32 = 6; + const DEFAULT_MAX_L0_COMPACT_LEVEL_COUNT: u32 = 42; use crate::catalog::hummock::CompactionFilterFlag; @@ -1854,6 +1854,10 @@ pub mod default { pub fn max_level() -> u32 { DEFAULT_MAX_LEVEL } + + pub fn max_l0_compact_level_count() -> u32 { + DEFAULT_MAX_L0_COMPACT_LEVEL_COUNT + } } pub mod object_store_config { diff --git a/src/ctl/src/cmd_impl/hummock/compaction_group.rs b/src/ctl/src/cmd_impl/hummock/compaction_group.rs index 3fb83e9d16cb3..d58aeb7bffe79 100644 --- a/src/ctl/src/cmd_impl/hummock/compaction_group.rs +++ b/src/ctl/src/cmd_impl/hummock/compaction_group.rs @@ -65,6 +65,7 @@ pub fn build_compaction_config_vec( enable_emergency_picker: Option<bool>, tombstone_reclaim_ratio: Option<u32>, compress_algorithm: Option<CompressionAlgorithm>, + max_l0_compact_level: Option<u32>, ) -> Vec<MutableConfig> { let mut configs = vec![]; if let Some(c) = max_bytes_for_level_base { @@ -115,6 +116,9 @@ pub fn build_compaction_config_vec( if let Some(c) = compress_algorithm { configs.push(MutableConfig::CompressionAlgorithm(c)) } + if let Some(c) = max_l0_compact_level { + configs.push(MutableConfig::MaxL0CompactLevelCount(c)) + } configs } diff --git a/src/ctl/src/lib.rs b/src/ctl/src/lib.rs index 1276d4bfce439..1f50250276d6e 100644 --- a/src/ctl/src/lib.rs +++ b/src/ctl/src/lib.rs @@ -259,6 +259,8 @@ enum HummockCommands { compression_level: Option<u32>, #[clap(long)] compression_algorithm: Option<String>, + #[clap(long)] + max_l0_compact_level: Option<u32>, }, /// Split given compaction group into two. Moves the given tables to the new group. SplitCompactionGroup { @@ -690,6 +692,7 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { tombstone_reclaim_ratio, compression_level, compression_algorithm, + max_l0_compact_level, }) => { cmd_impl::hummock::update_compaction_config( context, @@ -719,6 +722,7 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { } else { None }, + max_l0_compact_level, ), ) .await?
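Condensed, the change this patch threads through the pickers and validators below is a single guard: a candidate L0 compaction task is rejected once it spans too many files or too many sub-levels, and both bounds now come from `CompactionConfig` instead of the hard-coded `MAX_COMPACT_LEVEL_COUNT`. A standalone Rust sketch under illustrative names (`exceeds_l0_limits` is a hypothetical helper, not code from this patch):

    // Plain parameters stand in for the `CompactionInput` and
    // `CompactionConfig` fields used by the validator hunks that follow.
    fn exceeds_l0_limits(
        total_file_count: u64,
        input_level_count: usize,
        level0_max_compact_file_number: u64,
        max_l0_compact_level_count: u32,
    ) -> bool {
        total_file_count >= level0_max_compact_file_number
            || input_level_count >= max_l0_compact_level_count as usize
    }

    fn main() {
        // With the new default of 42, a task spanning 50 sub-levels is rejected.
        assert!(exceeds_l0_limits(10, 50, 10_000, 42));
    }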
diff --git a/src/meta/src/hummock/compaction/compaction_config.rs b/src/meta/src/hummock/compaction/compaction_config.rs index e6750084432ec..de91bf4f79ded 100644 --- a/src/meta/src/hummock/compaction/compaction_config.rs +++ b/src/meta/src/hummock/compaction/compaction_config.rs @@ -64,6 +64,7 @@ impl CompactionConfigBuilder { compaction_config::level0_overlapping_sub_level_compact_level_count(), tombstone_reclaim_ratio: compaction_config::tombstone_reclaim_ratio(), enable_emergency_picker: compaction_config::enable_emergency_picker(), + max_l0_compact_level_count: compaction_config::max_l0_compact_level_count(), }, } } diff --git a/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs index f98e14203d95b..a2c3a7d52802e 100644 --- a/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs @@ -166,6 +166,7 @@ impl LevelCompactionPicker { self.config.level0_max_compact_file_number, overlap_strategy.clone(), self.developer_config.enable_check_task_level_overlap, + self.config.max_l0_compact_level_count as usize, ); let mut max_vnode_partition_idx = 0; diff --git a/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs b/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs index 29119ae283b0a..c7dd27a6b1907 100644 --- a/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs +++ b/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use risingwave_pb::hummock::CompactionConfig; -use super::{CompactionInput, LocalPickerStatistic, MAX_COMPACT_LEVEL_COUNT}; +use super::{CompactionInput, LocalPickerStatistic}; #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ValidationRuleType { @@ -89,14 +89,8 @@ struct TierCompactionTaskValidationRule { impl CompactionTaskValidationRule for TierCompactionTaskValidationRule { fn validate(&self, input: &CompactionInput, stats: &mut LocalPickerStatistic) -> bool { - // Limit sstable file count to avoid using too much memory. 
- let overlapping_max_compact_file_numer = std::cmp::min( - self.config.level0_max_compact_file_number, - MAX_COMPACT_LEVEL_COUNT as u64, - ); - - if input.total_file_count >= overlapping_max_compact_file_numer - || input.input_levels.len() >= MAX_COMPACT_LEVEL_COUNT + if input.total_file_count >= self.config.level0_max_compact_file_number + || input.input_levels.len() >= self.config.max_l0_compact_level_count as usize { return true; } @@ -130,7 +124,7 @@ impl CompactionTaskValidationRule for IntraCompactionTaskValidationRule { fn validate(&self, input: &CompactionInput, stats: &mut LocalPickerStatistic) -> bool { if (input.total_file_count >= self.config.level0_max_compact_file_number && input.input_levels.len() > 1) - || input.input_levels.len() >= MAX_COMPACT_LEVEL_COUNT + || input.input_levels.len() >= self.config.max_l0_compact_level_count as usize { return true; } @@ -178,7 +172,7 @@ struct BaseCompactionTaskValidationRule { impl CompactionTaskValidationRule for BaseCompactionTaskValidationRule { fn validate(&self, input: &CompactionInput, stats: &mut LocalPickerStatistic) -> bool { if input.total_file_count >= self.config.level0_max_compact_file_number - || input.input_levels.len() >= MAX_COMPACT_LEVEL_COUNT + || input.input_levels.len() >= self.config.max_l0_compact_level_count as usize { return true; } diff --git a/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs index 993ad79d59b2e..6b5dcae7d0c31 100644 --- a/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs @@ -144,6 +144,7 @@ impl IntraCompactionPicker { self.config.level0_max_compact_file_number, overlap_strategy.clone(), self.developer_config.enable_check_task_level_overlap, + self.config.max_l0_compact_level_count as usize, ); let l0_select_tables_vec = non_overlap_sub_level_picker diff --git a/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs index a0d896daa439c..57dd5469d42ae 100644 --- a/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs @@ -20,7 +20,7 @@ use risingwave_hummock_sdk::prost_key_range::KeyRangeExt; use risingwave_pb::hummock::hummock_version::Levels; use risingwave_pb::hummock::{InputLevel, Level, LevelType, SstableInfo}; -use super::{CompactionInput, CompactionPicker, LocalPickerStatistic, MAX_COMPACT_LEVEL_COUNT}; +use super::{CompactionInput, CompactionPicker, LocalPickerStatistic}; use crate::hummock::compaction::overlap_strategy::OverlapStrategy; use crate::hummock::level_handler::LevelHandler; @@ -197,6 +197,7 @@ impl NonOverlapSubLevelPicker { max_file_count: u64, overlap_strategy: Arc<dyn OverlapStrategy>, enable_check_task_level_overlap: bool, + max_expected_level_count: usize, ) -> Self { Self { min_compaction_bytes, @@ -205,7 +206,7 @@ impl NonOverlapSubLevelPicker { max_file_count, overlap_strategy, enable_check_task_level_overlap, - max_expected_level_count: MAX_COMPACT_LEVEL_COUNT, + max_expected_level_count, } } @@ -533,6 +534,8 @@ impl NonOverlapSubLevelPicker { pub mod tests { use std::collections::BTreeSet; + use risingwave_common::config::default::compaction_config; + use super::*; use crate::hummock::compaction::overlap_strategy::RangeOverlapStrategy; use crate::hummock::compaction::selector::tests::{ @@ -736,6 +739,7 @@ pub mod tests { 10000,
Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(6, ret.len()); @@ -750,6 +754,7 @@ pub mod tests { 10000, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(6, ret.len()); @@ -764,6 +769,7 @@ pub mod tests { 5, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(6, ret.len()); @@ -839,6 +845,7 @@ pub mod tests { 10000, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(6, ret.len()); @@ -854,6 +861,7 @@ pub mod tests { 10000, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(6, ret.len()); @@ -869,6 +877,7 @@ pub mod tests { max_file_count, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(6, ret.len()); @@ -892,6 +901,7 @@ pub mod tests { 10000, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); assert_eq!(3, ret.len()); @@ -1019,6 +1029,7 @@ pub mod tests { 10000, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); { @@ -1036,6 +1047,7 @@ pub mod tests { 10000, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); { @@ -1053,6 +1065,7 @@ pub mod tests { 3, Arc::new(RangeOverlapStrategy::default()), true, + compaction_config::max_l0_compact_level_count() as usize, ); let ret = picker.pick_l0_multi_non_overlap_level(&levels, &levels_handlers[0]); { diff --git a/src/meta/src/hummock/compaction/picker/mod.rs b/src/meta/src/hummock/compaction/picker/mod.rs index f6dc46c99106c..6d464b9a33bcd 100644 --- a/src/meta/src/hummock/compaction/picker/mod.rs +++ b/src/meta/src/hummock/compaction/picker/mod.rs @@ -43,8 +43,6 @@ pub use ttl_reclaim_compaction_picker::{TtlPickerState, TtlReclaimCompactionPick use crate::hummock::level_handler::LevelHandler; -pub const MAX_COMPACT_LEVEL_COUNT: usize = 42; - #[derive(Default, Debug)] pub struct LocalPickerStatistic { pub skip_by_write_amp_limit: u64, diff --git a/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs index 9ed22ba551fcc..ce86b523f6e86 100644 --- a/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs @@ -21,7 +21,6 @@ use super::{ CompactionInput, CompactionPicker, CompactionTaskValidator, LocalPickerStatistic, ValidationRuleType, }; -use 
crate::hummock::compaction::picker::MAX_COMPACT_LEVEL_COUNT; use crate::hummock::level_handler::LevelHandler; pub struct TierCompactionPicker { @@ -87,10 +86,7 @@ impl TierCompactionPicker { let mut compaction_bytes = level.total_file_size; let mut compact_file_count = level.table_infos.len() as u64; // Limit sstable file count to avoid using too much memory. - let overlapping_max_compact_file_numer = std::cmp::min( - self.config.level0_max_compact_file_number, - MAX_COMPACT_LEVEL_COUNT as u64, - ); + let overlapping_max_compact_file_numer = self.config.level0_max_compact_file_number; for other in &l0.sub_levels[idx + 1..] { if compaction_bytes > max_compaction_bytes { diff --git a/src/meta/src/hummock/manager/compaction_group_manager.rs b/src/meta/src/hummock/manager/compaction_group_manager.rs index df5189d539180..063ce4e0a58b1 100644 --- a/src/meta/src/hummock/manager/compaction_group_manager.rs +++ b/src/meta/src/hummock/manager/compaction_group_manager.rs @@ -885,11 +885,13 @@ fn update_compaction_config(target: &mut CompactionConfig, items: &[MutableConfi MutableConfig::TombstoneReclaimRatio(c) => { target.tombstone_reclaim_ratio = *c; } - MutableConfig::CompressionAlgorithm(c) => { target.compression_algorithm[c.get_level() as usize] .clone_from(&c.compression_algorithm); } + MutableConfig::MaxL0CompactLevelCount(c) => { + target.max_l0_compact_level_count = *c; + } } } } From 32a1129da3a9d84d201ffb9a01b89976889e3a76 Mon Sep 17 00:00:00 2001 From: Li0k Date: Thu, 13 Jun 2024 14:18:46 +0800 Subject: [PATCH 05/19] fix(compactor): Compactor potential oom risk of builder (#16802) --- proto/hummock.proto | 1 + src/common/src/config.rs | 55 ++++++++++++++--- src/config/docs.md | 3 +- src/meta/src/hummock/compactor_manager.rs | 7 +++ src/meta/src/hummock/manager/compaction.rs | 61 +++++++++---------- src/meta/src/hummock/manager/timer_task.rs | 17 ++++-- src/storage/benches/bench_multi_builder.rs | 47 ++++++++------ src/storage/hummock_sdk/src/compact.rs | 11 ++-- .../src/hummock/compactor/compactor_runner.rs | 1 - .../compactor/fast_compactor_runner.rs | 19 +++--- src/storage/src/hummock/compactor/mod.rs | 48 +++++---------- .../src/hummock/sstable/multi_builder.rs | 50 +++++++++++---- src/storage/src/opts.rs | 5 ++ 13 files changed, 202 insertions(+), 123 deletions(-) diff --git a/proto/hummock.proto b/proto/hummock.proto index c4ac75f2fed64..89a0438fc43a7 100644 --- a/proto/hummock.proto +++ b/proto/hummock.proto @@ -340,6 +340,7 @@ message CompactTask { JOIN_HANDLE_FAILED = 11; TRACK_SST_OBJECT_ID_FAILED = 12; NO_AVAIL_CPU_RESOURCE_CANCELED = 13; + HEARTBEAT_PROGRESS_CANCELED = 14; } // SSTs to be compacted, which will be removed from LSM after compaction repeated InputLevel input_ssts = 1; diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 6cad79849153a..d8bb65e90b062 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -763,6 +763,14 @@ pub struct StorageConfig { #[serde(default = "default::storage::mem_table_spill_threshold")] pub mem_table_spill_threshold: usize, + /// The number of concurrent `SSTable` uploads of the builder #[serde(default = "default::storage::compactor_concurrent_uploading_sst_count")] pub compactor_concurrent_uploading_sst_count: Option<usize>, + + /// Object storage configuration + /// 1. General configuration + /// 2. Backend-specific configuration + /// 3.
Retry and timeout configuration #[serde(default)] pub object_store: ObjectStoreConfig, } @@ -1024,9 +1032,13 @@ pub struct ObjectStoreConfig { #[serde(default = "default::object_store_config::object_store_set_atomic_write_dir")] pub object_store_set_atomic_write_dir: bool, + /// Retry and timeout configuration + /// Describes the retry strategy, driven by exponential back-off + /// Exposes the timeout and retries of each Object store interface. Therefore, the total timeout for each interface is determined based on the interface's timeout/retry configuration and the exponential back-off policy. #[serde(default)] pub retry: ObjectStoreRetryConfig, + /// S3 backend-specific configuration #[serde(default)] pub s3: S3ObjectStoreConfig, } @@ -1080,66 +1092,89 @@ pub struct S3ObjectStoreDeveloperConfig { #[derive(Clone, Debug, Serialize, Deserialize, DefaultFromSerde)] pub struct ObjectStoreRetryConfig { + // A retry strategy driven by exponential back-off. + // The retry strategy is used for all object store operations. + /// Given a base duration for retry strategy in milliseconds. #[serde(default = "default::object_store_config::object_store_req_backoff_interval_ms")] pub req_backoff_interval_ms: u64, + + /// The max delay interval for the retry strategy. No retry delay will be longer than this `Duration`. #[serde(default = "default::object_store_config::object_store_req_backoff_max_delay_ms")] pub req_backoff_max_delay_ms: u64, + + /// A multiplicative factor that will be applied to the exponential back-off retry delay. #[serde(default = "default::object_store_config::object_store_req_backoff_factor")] pub req_backoff_factor: u64, - // upload + /// Maximum timeout for `upload` operation #[serde(default = "default::object_store_config::object_store_upload_attempt_timeout_ms")] pub upload_attempt_timeout_ms: u64, + + /// Total counts of `upload` operation retries #[serde(default = "default::object_store_config::object_store_upload_retry_attempts")] pub upload_retry_attempts: usize, - // streaming_upload_init + streaming_upload + /// Maximum timeout for `streaming_upload_init` and `streaming_upload` #[serde( default = "default::object_store_config::object_store_streaming_upload_attempt_timeout_ms" )] pub streaming_upload_attempt_timeout_ms: u64, + + /// Total counts of `streaming_upload` operation retries #[serde( default = "default::object_store_config::object_store_streaming_upload_retry_attempts" )] pub streaming_upload_retry_attempts: usize, - // read + /// Maximum timeout for `read` operation #[serde(default = "default::object_store_config::object_store_read_attempt_timeout_ms")] pub read_attempt_timeout_ms: u64, + + /// Total counts of `read` operation retries #[serde(default = "default::object_store_config::object_store_read_retry_attempts")] pub read_retry_attempts: usize, - // streaming_read_init + streaming_read + /// Maximum timeout for `streaming_read_init` and `streaming_read` operation #[serde( default = "default::object_store_config::object_store_streaming_read_attempt_timeout_ms" )] pub streaming_read_attempt_timeout_ms: u64, + + /// Total counts of `streaming_read` operation retries #[serde(default = "default::object_store_config::object_store_streaming_read_retry_attempts")] pub streaming_read_retry_attempts: usize, - // metadata + /// Maximum timeout for `metadata` operation #[serde(default = "default::object_store_config::object_store_metadata_attempt_timeout_ms")] pub metadata_attempt_timeout_ms: u64, + + /// Total counts of `metadata` operation retries #[serde(default =
"default::object_store_config::object_store_metadata_retry_attempts")] pub metadata_retry_attempts: usize, - // delete + /// Maximum timeout for `delete` operation #[serde(default = "default::object_store_config::object_store_delete_attempt_timeout_ms")] pub delete_attempt_timeout_ms: u64, + + /// Total counts of `delete` operation retries #[serde(default = "default::object_store_config::object_store_delete_retry_attempts")] pub delete_retry_attempts: usize, - // delete_object + /// Maximum timeout for `delete_object` operation #[serde( default = "default::object_store_config::object_store_delete_objects_attempt_timeout_ms" )] pub delete_objects_attempt_timeout_ms: u64, + + /// Total counts of `delete_object` operation retries #[serde(default = "default::object_store_config::object_store_delete_objects_retry_attempts")] pub delete_objects_retry_attempts: usize, - // list + /// Maximum timeout for `list` operation #[serde(default = "default::object_store_config::object_store_list_attempt_timeout_ms")] pub list_attempt_timeout_ms: u64, + + /// Total counts of `list` operation retries #[serde(default = "default::object_store_config::object_store_list_retry_attempts")] pub list_retry_attempts: usize, } @@ -1509,6 +1544,10 @@ pub mod default { pub fn max_prefetch_block_number() -> usize { 16 } + + pub fn compactor_concurrent_uploading_sst_count() -> Option { + None + } } pub mod streaming { diff --git a/src/config/docs.md b/src/config/docs.md index 540d86085b342..ab33559260162 100644 --- a/src/config/docs.md +++ b/src/config/docs.md @@ -105,6 +105,7 @@ This page is automatically generated by `./risedev generate-example-config` | cache_refill | | | | check_compaction_result | | false | | compact_iter_recreate_timeout_ms | | 600000 | +| compactor_concurrent_uploading_sst_count | The concurrent uploading number of `SSTables` of buidler | | | compactor_fast_max_compact_delete_ratio | | 40 | | compactor_fast_max_compact_task_size | | 2147483648 | | compactor_iter_max_io_retry_times | | 8 | @@ -128,7 +129,7 @@ This page is automatically generated by `./risedev generate-example-config` | meta_file_cache | | | | min_sst_size_for_streaming_upload | Whether to enable streaming upload for sstable. | 33554432 | | min_sstable_size_mb | | 32 | -| object_store | | | +| object_store | Object storage configuration 1. General configuration 2. Some special configuration of Backend 3. Retry and timeout configuration | | | prefetch_buffer_capacity_mb | max memory usage for large query | | | share_buffer_compaction_worker_threads_number | Worker threads number of dedicated tokio runtime for share buffer compaction. 0 means use tokio's default value (number of CPU core). | 4 | | share_buffer_upload_concurrency | Number of tasks shared buffer can upload in parallel. 
| 8 | diff --git a/src/meta/src/hummock/compactor_manager.rs b/src/meta/src/hummock/compactor_manager.rs index d9b5a481f420a..0876f31d211e3 100644 --- a/src/meta/src/hummock/compactor_manager.rs +++ b/src/meta/src/hummock/compactor_manager.rs @@ -101,6 +101,13 @@ impl Compactor { Ok(()) } + pub fn cancel_tasks(&self, task_ids: &Vec<u64>) -> MetaResult<()> { + for task_id in task_ids { + self.cancel_task(*task_id)?; + } + Ok(()) + } + pub fn context_id(&self) -> HummockContextId { self.context_id } diff --git a/src/meta/src/hummock/manager/compaction.rs b/src/meta/src/hummock/manager/compaction.rs index cc5c878a8baef..7a27061c8d373 100644 --- a/src/meta/src/hummock/manager/compaction.rs +++ b/src/meta/src/hummock/manager/compaction.rs @@ -105,6 +105,7 @@ static CANCEL_STATUS_SET: LazyLock<HashSet<TaskStatus>> = LazyLock::new(|| { TaskStatus::InvalidGroupCanceled, TaskStatus::NoAvailMemoryResourceCanceled, TaskStatus::NoAvailCpuResourceCanceled, + TaskStatus::HeartbeatProgressCanceled, ] .into_iter() .collect() @@ -464,40 +465,36 @@ impl HummockManager { progress, }) => { let compactor_manager = hummock_manager.compactor_manager.clone(); - let cancel_tasks = compactor_manager.update_task_heartbeats(&progress); - if let Some(compactor) = compactor_manager.get_compactor(context_id) { - // TODO: task cancellation can be batched - for task in cancel_tasks { - tracing::info!( - "Task with group_id {} task_id {} with context_id {} has expired due to lack of visible progress", - task.compaction_group_id, - task.task_id, - context_id, - ); + let cancel_tasks = compactor_manager.update_task_heartbeats(&progress).into_iter().map(|task|task.task_id).collect::<Vec<_>>(); + if !cancel_tasks.is_empty() { + tracing::info!( + "Tasks with task_ids {:?} from context_id {} have expired due to lack of visible progress", + cancel_tasks, + context_id, + ); - if let Err(e) = - hummock_manager - .cancel_compact_task(task.task_id, TaskStatus::HeartbeatCanceled) - .await - { - tracing::error!( - task_id = task.task_id, - error = %e.as_report(), - "Attempt to remove compaction task due to elapsed heartbeat failed. We will continue to track its heartbeat - until we can successfully report its status." - ); - } - - // Forcefully cancel the task so that it terminates - // early on the compactor - // node. - let _ = compactor.cancel_task(task.task_id); - tracing::info!( - "CancelTask operation for task_id {} has been sent to node with context_id {}", - context_id, - task.task_id + if let Err(e) = hummock_manager + .cancel_compact_tasks(cancel_tasks.clone(), TaskStatus::HeartbeatProgressCanceled) + .await + { + tracing::error!( + error = %e.as_report(), + "Attempt to remove compaction task due to elapsed heartbeat failed. We will continue to track its heartbeat + until we can successfully report its status." ); } + } + + if let Some(compactor) = compactor_manager.get_compactor(context_id) { + // Forcefully cancel the task so that it terminates + // early on the compactor + // node. + let _ = compactor.cancel_tasks(&cancel_tasks); + tracing::info!( + "CancelTask operation for task_id {:?} has been sent to node with context_id {}", + cancel_tasks, + context_id + ); } else { // Determine the validity of the compactor streaming rpc. When the compactor no longer exists in the manager, the stream will be removed. // Tip: Connectivity to the compactor will be determined through the `send_event` operation.
When send fails, it will be removed from the manager @@ -1004,7 +1001,7 @@ Ok(ret[0]) } - async fn cancel_compact_tasks( + pub async fn cancel_compact_tasks( &self, tasks: Vec<u64>, task_status: TaskStatus, diff --git a/src/meta/src/hummock/manager/timer_task.rs b/src/meta/src/hummock/manager/timer_task.rs index e28a2d2e9fb02..6b68950a28e3b 100644 --- a/src/meta/src/hummock/manager/timer_task.rs +++ b/src/meta/src/hummock/manager/timer_task.rs @@ -257,16 +257,25 @@ impl HummockManager { // progress (meta + compactor) // 2. meta periodically scans the task and performs a cancel on // the meta side for tasks that are not updated by heartbeat - for task in compactor_manager.get_heartbeat_expired_tasks() { + let expired_tasks: Vec<u64> = compactor_manager + .get_heartbeat_expired_tasks() + .into_iter() + .map(|task| task.task_id) + .collect(); + if !expired_tasks.is_empty() { + tracing::info!( + expired_tasks = ?expired_tasks, + "Heartbeat expired compaction tasks detected. Attempting to cancel tasks.", + ); if let Err(e) = hummock_manager - .cancel_compact_task( - task.task_id, + .cancel_compact_tasks( + expired_tasks.clone(), TaskStatus::HeartbeatCanceled, ) .await { tracing::error!( - task_id = task.task_id, + expired_tasks = ?expired_tasks, error = %e.as_report(), "Attempt to remove compaction task due to elapsed heartbeat failed. We will continue to track its heartbeat until we can successfully report its status", diff --git a/src/storage/benches/bench_multi_builder.rs b/src/storage/benches/bench_multi_builder.rs index 08318891eac8c..b67be3467c48a 100644 --- a/src/storage/benches/bench_multi_builder.rs +++ b/src/storage/benches/bench_multi_builder.rs @@ -21,8 +21,6 @@ use std::time::Duration; use criterion::{criterion_group, criterion_main, Criterion}; use foyer::HybridCacheBuilder; -use futures::future::try_join_all; -use itertools::Itertools; use rand::random; use risingwave_common::catalog::TableId; use risingwave_common::config::{MetricLevel, ObjectStoreConfig}; @@ -118,17 +116,14 @@ async fn build_tables( .await .unwrap(); } - let split_table_outputs = builder.finish().await.unwrap(); - let ssts = split_table_outputs - .iter() - .map(|handle| handle.sst_info.sst_info.clone()) - .collect_vec(); - let join_handles = split_table_outputs + + builder + .finish() + .await + .unwrap() .into_iter() - .map(|o| o.upload_join_handle) - .collect_vec(); - try_join_all(join_handles).await.unwrap(); - ssts + .map(|info| info.sst_info) + .collect() } async fn generate_sstable_store(object_store: Arc) -> Arc u64 { let mut result = 0; // When building the SstableStreamIterator, sstable_syncable will fetch the SstableMeta and seek @@ -223,12 +222,10 @@ pub fn estimate_memory_for_compact_task( // output // builder will maintain SstableInfo + block_builder(block) + writer (block to vec) let estimated_meta_size = sst_capacity * task_max_sst_meta_ratio / 100; - if support_streaming_upload { - result += estimated_meta_size + 2 * block_size - } else { - result += estimated_meta_size + sst_capacity; // Use sst_capacity to avoid BatchUploader - // memory bursts. - } + + // FIXME: sst_capacity is the upper bound of the memory usage of the streaming sstable uploader + // A more reasonable memory limit method needs to be adopted, this is just a temporary fix.
+ result += estimated_meta_size + sst_capacity; result } diff --git a/src/storage/src/hummock/compactor/compactor_runner.rs b/src/storage/src/hummock/compactor/compactor_runner.rs index 05b3d7dd182d9..41a55518158de 100644 --- a/src/storage/src/hummock/compactor/compactor_runner.rs +++ b/src/storage/src/hummock/compactor/compactor_runner.rs @@ -383,7 +383,6 @@ pub async fn compact( .object_store_recv_buffer_size .unwrap_or(6 * 1024 * 1024) as u64, capacity as u64, - context.sstable_store.store().support_streaming_upload(), ) * compact_task.splits.len() as u64; tracing::info!( diff --git a/src/storage/src/hummock/compactor/fast_compactor_runner.rs b/src/storage/src/hummock/compactor/fast_compactor_runner.rs index 7a30a6fd2ef65..81f0210d945c1 100644 --- a/src/storage/src/hummock/compactor/fast_compactor_runner.rs +++ b/src/storage/src/hummock/compactor/fast_compactor_runner.rs @@ -316,6 +316,9 @@ impl CompactorRunner { context.compactor_metrics.clone(), Some(task_progress.clone()), task_config.table_vnode_partition.clone(), + context + .storage_opts + .compactor_concurrent_uploading_sst_count, ); assert_eq!( task.input_ssts.len(), @@ -505,17 +508,19 @@ impl CompactorRunner { ); let statistic = self.executor.take_statistics(); - let outputs = self.executor.builder.finish().await?; - let ssts = Compactor::report_progress( + let output_ssts = self.executor.builder.finish().await?; + Compactor::report_progress( self.metrics.clone(), Some(self.executor.task_progress.clone()), - outputs, + &output_ssts, false, - ) - .await?; - let sst_infos = ssts.iter().map(|sst| sst.sst_info.clone()).collect_vec(); + ); + let sst_infos = output_ssts + .iter() + .map(|sst| sst.sst_info.clone()) + .collect_vec(); assert!(can_concat(&sst_infos)); - Ok((ssts, statistic)) + Ok((output_ssts, statistic)) } } diff --git a/src/storage/src/hummock/compactor/mod.rs b/src/storage/src/hummock/compactor/mod.rs index cddaffc584f66..898d91d530b22 100644 --- a/src/storage/src/hummock/compactor/mod.rs +++ b/src/storage/src/hummock/compactor/mod.rs @@ -83,11 +83,10 @@ use crate::filter_key_extractor::{ use crate::hummock::compactor::compaction_utils::calculate_task_parallelism; use crate::hummock::compactor::compactor_runner::{compact_and_build_sst, compact_done}; use crate::hummock::iterator::{Forward, HummockIterator}; -use crate::hummock::multi_builder::SplitTableOutput; use crate::hummock::vacuum::Vacuum; use crate::hummock::{ - validate_ssts, BlockedXor16FilterBuilder, FilterBuilder, HummockError, - SharedComapctorObjectIdManager, SstableWriterFactory, UnifiedSstableWriterFactory, + validate_ssts, BlockedXor16FilterBuilder, FilterBuilder, SharedComapctorObjectIdManager, + SstableWriterFactory, UnifiedSstableWriterFactory, }; use crate::monitor::CompactorMetrics; @@ -175,20 +174,19 @@ impl Compactor { compact_timer.observe_duration(); - let ssts = Self::report_progress( + Self::report_progress( self.context.compactor_metrics.clone(), task_progress, - split_table_outputs, + &split_table_outputs, self.context.is_share_buffer_compact, - ) - .await?; + ); self.context .compactor_metrics .get_table_id_total_time_duration .observe(self.get_id_time.load(Ordering::Relaxed) as f64 / 1000.0 / 1000.0); - debug_assert!(ssts + debug_assert!(split_table_outputs .iter() .all(|table_info| table_info.sst_info.get_table_ids().is_sorted())); @@ -198,33 +196,20 @@ impl Compactor { "Finish Task {:?} split_index {:?} sst count {}", task_id, split_index, - ssts.len() + split_table_outputs.len() ); } - Ok((ssts, table_stats_map)) + 
Ok((split_table_outputs, table_stats_map)) } - pub async fn report_progress( + pub fn report_progress( metrics: Arc<CompactorMetrics>, task_progress: Option<Arc<TaskProgress>>, - split_table_outputs: Vec<SplitTableOutput>, + ssts: &Vec<LocalSstableInfo>, is_share_buffer_compact: bool, - ) -> HummockResult<Vec<LocalSstableInfo>> { - let mut ssts = Vec::with_capacity(split_table_outputs.len()); - let mut rets = vec![]; - - for SplitTableOutput { - sst_info, - upload_join_handle, - } in split_table_outputs - { + ) { + for sst_info in ssts { let sst_size = sst_info.file_size(); - ssts.push(sst_info); - let ret = upload_join_handle - .verbose_instrument_await("upload") - .await - .map_err(HummockError::sstable_upload_error); - rets.push(ret); if let Some(tracker) = &task_progress { tracker.inc_ssts_uploaded(); tracker.dec_num_pending_write_io(); @@ -235,10 +220,6 @@ impl Compactor { metrics.compaction_upload_sst_counts.inc(); } } - for ret in rets { - ret??; - } - Ok(ssts) } async fn compact_key_range_impl( @@ -249,7 +230,7 @@ impl Compactor { filter_key_extractor: Arc<FilterKeyExtractorImpl>, task_progress: Option<Arc<TaskProgress>>, object_id_getter: Box<dyn GetObjectId>, - ) -> HummockResult<(Vec<SplitTableOutput>, CompactionStatistics)> { + ) -> HummockResult<(Vec<LocalSstableInfo>, CompactionStatistics)> { let builder_factory = RemoteBuilderFactory::<F, B> { object_id_getter, limiter: self.context.memory_limiter.clone(), @@ -266,6 +247,9 @@ impl Compactor { self.context.compactor_metrics.clone(), task_progress.clone(), self.task_config.table_vnode_partition.clone(), + self.context + .storage_opts + .compactor_concurrent_uploading_sst_count, ); let compaction_statistics = compact_and_build_sst( &mut sst_builder, diff --git a/src/storage/src/hummock/sstable/multi_builder.rs b/src/storage/src/hummock/sstable/multi_builder.rs index 7354cb89c81fc..4e364ce9f94f5 100644 --- a/src/storage/src/hummock/sstable/multi_builder.rs +++ b/src/storage/src/hummock/sstable/multi_builder.rs @@ -18,6 +18,8 @@ use std::sync::atomic::Ordering::SeqCst; use std::sync::Arc; use bytes::Bytes; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use num_integer::Integer; use risingwave_common::hash::VirtualNode; use risingwave_hummock_sdk::key::{FullKey, UserKey}; @@ -29,8 +31,8 @@ use crate::hummock::sstable::filter::FilterBuilder; use crate::hummock::sstable_store::SstableStoreRef; use crate::hummock::value::HummockValue; use crate::hummock::{ - BatchUploadWriter, BlockMeta, CachePolicy, HummockResult, MemoryLimiter, SstableBuilder, - SstableBuilderOptions, SstableWriter, SstableWriterOptions, Xor16FilterBuilder, + BatchUploadWriter, BlockMeta, CachePolicy, HummockError, HummockResult, MemoryLimiter, + SstableBuilder, SstableBuilderOptions, SstableWriter, SstableWriterOptions, Xor16FilterBuilder, }; use crate::monitor::CompactorMetrics; @@ -43,11 +45,6 @@ pub trait TableBuilderFactory { async fn open_builder(&mut self) -> HummockResult<SstableBuilder<Self::Writer, Self::Filter>>; } -pub struct SplitTableOutput { - pub sst_info: LocalSstableInfo, - pub upload_join_handle: UploadJoinHandle, -} - /// A wrapper for [`SstableBuilder`] which automatically split key-value pairs into multiple tables, /// based on their target capacity set in options. /// pub struct CapacitySplitTableBuilder<F> where F: TableBuilderFactory, { /// When creating a new [`SstableBuilder`], caller use this factory to generate it. builder_factory: F, - sst_outputs: Vec<SplitTableOutput>, + sst_outputs: Vec<LocalSstableInfo>, current_builder: Option<SstableBuilder<F::Writer, F::Filter>>, /// When vnode of the coming key is greater than `largest_vnode_in_current_partition`, we will /// switch SST.
largest_vnode_in_current_partition: usize, + + concurrent_upload_join_handle: FuturesUnordered<UploadJoinHandle>, + + concurrent_uploading_sst_count: Option<usize>, } impl<F> CapacitySplitTableBuilder<F> where F: TableBuilderFactory, { pub fn new( builder_factory: F, @@ -88,6 +89,7 @@ where compactor_metrics: Arc<CompactorMetrics>, task_progress: Option<Arc<TaskProgress>>, table_partition_vnode: BTreeMap<u32, u32>, + concurrent_uploading_sst_count: Option<usize>, ) -> Self { Self { builder_factory, @@ -99,6 +101,8 @@ where table_partition_vnode, split_weight_by_vnode: 0, largest_vnode_in_current_partition: VirtualNode::MAX.to_index(), + concurrent_upload_join_handle: FuturesUnordered::new(), + concurrent_uploading_sst_count, } } @@ -113,6 +117,8 @@ table_partition_vnode: BTreeMap::default(), split_weight_by_vnode: 0, largest_vnode_in_current_partition: VirtualNode::MAX.to_index(), + concurrent_upload_join_handle: FuturesUnordered::new(), + concurrent_uploading_sst_count: None, } } @@ -264,6 +270,7 @@ where /// If there's no builder created, or current one is already sealed before, then this function /// will be no-op. pub async fn seal_current(&mut self) -> HummockResult<()> { + use await_tree::InstrumentAwait; if let Some(builder) = self.current_builder.take() { let builder_output = builder.finish().await?; { @@ -302,17 +309,33 @@ where .observe(builder_output.epoch_count as _); } } - self.sst_outputs.push(SplitTableOutput { - upload_join_handle: builder_output.writer_output, - sst_info: builder_output.sst_info, - }); + + self.concurrent_upload_join_handle + .push(builder_output.writer_output); + + self.sst_outputs.push(builder_output.sst_info); + + if let Some(concurrent_uploading_sst_count) = self.concurrent_uploading_sst_count + && self.concurrent_upload_join_handle.len() >= concurrent_uploading_sst_count + { + self.concurrent_upload_join_handle + .next() + .verbose_instrument_await("upload") + .await + .unwrap() + .map_err(HummockError::sstable_upload_error)??; + } } Ok(()) } /// Finalizes all the tables to be ids, blocks and metadata.
- pub async fn finish(mut self) -> HummockResult<Vec<SplitTableOutput>> { + pub async fn finish(mut self) -> HummockResult<Vec<LocalSstableInfo>> { + use futures::future::try_join_all; self.seal_current().await?; + try_join_all(self.concurrent_upload_join_handle.into_iter()) + .await + .map_err(HummockError::sstable_upload_error)?; Ok(self.sst_outputs) } } @@ -504,6 +527,7 @@ mod tests { Arc::new(CompactorMetrics::unused()), None, table_partition_vnode, + None, ); let mut table_key = VirtualNode::from_index(0).to_be_bytes().to_vec(); diff --git a/src/storage/src/opts.rs b/src/storage/src/opts.rs index e9792c5d5e4db..143b6bba37981 100644 --- a/src/storage/src/opts.rs +++ b/src/storage/src/opts.rs @@ -128,6 +128,8 @@ pub struct StorageOpts { pub mem_table_spill_threshold: usize, + pub compactor_concurrent_uploading_sst_count: Option<usize>, + pub object_store_config: ObjectStoreConfig, } @@ -218,6 +220,9 @@ impl From<(&RwConfig, &SystemParamsReader, &StorageMemoryConfig)> for StorageOpt .compactor_fast_max_compact_delete_ratio, compactor_fast_max_compact_task_size: c.storage.compactor_fast_max_compact_task_size, compactor_iter_max_io_retry_times: c.storage.compactor_iter_max_io_retry_times, + compactor_concurrent_uploading_sst_count: c + .storage + .compactor_concurrent_uploading_sst_count, } } } From ef6d6c6f48429bb410265acfffdbb42676ae9c69 Mon Sep 17 00:00:00 2001 From: xxchan Date: Thu, 13 Jun 2024 14:38:20 +0800 Subject: [PATCH 06/19] feat(source): support JSON schema additionalProperties (map) (#17110) Signed-off-by: xxchan --- e2e_test/source/basic/nosim_kafka.slt | 20 ++++++++++++++-- scripts/source/test_data/kafka_json_schema.1 | 4 ++-- src/connector/src/parser/json_parser.rs | 21 +++++++++++++---- src/frontend/src/handler/create_source.rs | 24 ++++++++++++-------- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/e2e_test/source/basic/nosim_kafka.slt b/e2e_test/source/basic/nosim_kafka.slt index 0cef4889c3fe7..b773126c9a7c1 100644 --- a/e2e_test/source/basic/nosim_kafka.slt +++ b/e2e_test/source/basic/nosim_kafka.slt @@ -39,6 +39,22 @@ CREATE TABLE kafka_json_schema_plain with ( kafka.scan.startup.mode = 'earliest' ) FORMAT PLAIN ENCODE JSON (schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}'); + +query +describe kafka_json_schema_plain; +---- +dimensions (empty) false NULL +map jsonb false NULL +notMap (empty) false NULL +price double precision false NULL +productId bigint false NULL +productName character varying false NULL +tags character varying[] false NULL +_row_id serial true NULL +primary key _row_id NULL NULL +distribution key _row_id NULL NULL +table description kafka_json_schema_plain NULL NULL + statement ok CREATE TABLE kafka_json_schema_upsert (PRIMARY KEY(rw_key)) INCLUDE KEY AS rw_key @@ -83,10 +99,10 @@ select count(*) from debezium_compact; query TFITT select - "dimensions", "price", "productId", "productName", "tags" + * from kafka_json_schema_plain ---- -(9.5,7,12) 12.5 1 An ice sculpture {cold,ice} +(9.5,7,12) {"foo": "bar"} (b) 12.5 1 An ice sculpture {cold,ice} query TFITT select diff --git a/scripts/source/test_data/kafka_json_schema.1 b/scripts/source/test_data/kafka_json_schema.1 index e71485885a654..afa96107864cb 100644 --- a/scripts/source/test_data/kafka_json_schema.1 +++ b/scripts/source/test_data/kafka_json_schema.1 @@ -1,2 +1,2 @@ -{"$schema":"https://json-schema.org/draft/2020-12/schema","$id":"https://example.com/product.schema.json","title":"Product","description":"A product from Acme's catalog","type":"object","properties":{"productId":{"description":"The unique identifier for a
product","type":"integer"},"productName":{"description":"Name of the product","type":"string"},"price":{"description":"The price of the product","type":"number","exclusiveMinimum":0},"tags":{"description":"Tags for the product","type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]}},"required":["productId","productName","price"]} -{"productId":1,"productName":"An ice sculpture","price":12.5,"tags":["cold","ice"],"dimensions":{"length":7,"width":12,"height":9.5}} \ No newline at end of file +{"$schema":"https://json-schema.org/draft/2020-12/schema","$id":"https://example.com/product.schema.json","title":"Product","description":"A product from Acme's catalog","type":"object","properties":{"productId":{"description":"The unique identifier for a product","type":"integer"},"productName":{"description":"Name of the product","type":"string"},"price":{"description":"The price of the product","type":"number","exclusiveMinimum":0},"tags":{"description":"Tags for the product","type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"map":{"type":"object","additionalProperties":{"type":"string"}},"notMap":{"type":"object","additionalProperties":{"type":"string"},"properties":{"a":{"type":"string"}}}},"required":["productId","productName","price"]} +{"productId":1,"productName":"An ice sculpture","price":12.5,"tags":["cold","ice"],"dimensions":{"length":7,"width":12,"height":9.5},"map":{"foo":"bar"},"notMap":{"a":"b","ignored":"c"}} \ No newline at end of file diff --git a/src/connector/src/parser/json_parser.rs b/src/connector/src/parser/json_parser.rs index 9d62d76eff7a7..89f118eb1022f 100644 --- a/src/connector/src/parser/json_parser.rs +++ b/src/connector/src/parser/json_parser.rs @@ -26,6 +26,7 @@ use std::collections::BTreeMap; use anyhow::Context as _; use apache_avro::Schema; use jst::{convert_avro, Context}; +use risingwave_connector_codec::decoder::avro::MapHandling; use risingwave_pb::plan_common::ColumnDesc; use super::util::{bytes_from_url, get_kafka_topic}; @@ -80,7 +81,7 @@ impl JsonAccessBuilder { } } -pub async fn schema_to_columns( +pub async fn fetch_json_schema_and_map_to_columns( schema_location: &str, schema_registry_auth: Option, props: &BTreeMap, @@ -98,11 +99,21 @@ pub async fn schema_to_columns( let bytes = bytes_from_url(url, None).await?; serde_json::from_slice(&bytes)? }; - let context = Context::default(); - let avro_schema = convert_avro(&json_schema, context).to_string(); + json_schema_to_columns(&json_schema) +} + +/// FIXME: when the JSON schema is invalid, it will panic. +/// +/// ## Notes on type conversion +/// Map will be used when an object doesn't have `properties` but has `additionalProperties`. +/// When an object has `properties` and `additionalProperties`, the latter will be ignored. +/// +/// +/// TODO: examine other stuff like `oneOf`, `patternProperties`, etc. +fn json_schema_to_columns(json_schema: &serde_json::Value) -> ConnectorResult> { + let avro_schema = convert_avro(json_schema, Context::default()).to_string(); let schema = Schema::parse_str(&avro_schema).context("failed to parse avro schema")?; - // TODO: do we need to support map type here? 
- avro_schema_to_column_descs(&schema, None).map_err(Into::into) + avro_schema_to_column_descs(&schema, Some(MapHandling::Jsonb)).map_err(Into::into) } #[cfg(test)] diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index f10f951665208..a29aa86907e0f 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -32,8 +32,8 @@ use risingwave_connector::parser::additional_columns::{ build_additional_column_catalog, get_supported_additional_columns, }; use risingwave_connector::parser::{ - schema_to_columns, AvroParserConfig, DebeziumAvroParserConfig, ProtobufParserConfig, - SpecificParserConfig, TimestamptzHandling, DEBEZIUM_IGNORE_KEY, + fetch_json_schema_and_map_to_columns, AvroParserConfig, DebeziumAvroParserConfig, + ProtobufParserConfig, SpecificParserConfig, TimestamptzHandling, DEBEZIUM_IGNORE_KEY, }; use risingwave_connector::schema::schema_registry::{ name_strategy_from_str, SchemaRegistryAuth, SCHEMA_REGISTRY_PASSWORD, SCHEMA_REGISTRY_USERNAME, @@ -102,14 +102,18 @@ async fn extract_json_table_schema( auth }); Ok(Some( - schema_to_columns(&schema_location.0, schema_registry_auth, with_properties) - .await? - .into_iter() - .map(|col| ColumnCatalog { - column_desc: col.into(), - is_hidden: false, - }) - .collect_vec(), + fetch_json_schema_and_map_to_columns( + &schema_location.0, + schema_registry_auth, + with_properties, + ) + .await? + .into_iter() + .map(|col| ColumnCatalog { + column_desc: col.into(), + is_hidden: false, + }) + .collect_vec(), )) } } From a59356f6f8ea3c75bad79b05cf7c39240cb79753 Mon Sep 17 00:00:00 2001 From: Noel Kwan <47273164+kwannoel@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:00:51 +0800 Subject: [PATCH 07/19] feat(catalog): add `rw_catalog.actor_id_to_ddl`, `rw_catalog.fragment_id_to_ddl` (#17229) --- .../catalog/system_catalog/rw_catalog/mod.rs | 3 ++ .../rw_catalog/rw_actor_id_to_ddl.rs | 42 +++++++++++++++++++ .../rw_catalog/rw_fragment_id_to_ddl.rs | 40 ++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 src/frontend/src/catalog/system_catalog/rw_catalog/rw_actor_id_to_ddl.rs create mode 100644 src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragment_id_to_ddl.rs diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs index 502c41561b65f..4e0e165a6d524 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs @@ -56,3 +56,6 @@ mod rw_user_secrets; mod rw_users; mod rw_views; mod rw_worker_nodes; + +mod rw_actor_id_to_ddl; +mod rw_fragment_id_to_ddl; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_actor_id_to_ddl.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_actor_id_to_ddl.rs new file mode 100644 index 0000000000000..95e269f0d5d5c --- /dev/null +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_actor_id_to_ddl.rs @@ -0,0 +1,42 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use risingwave_common::types::Fields;
+use risingwave_frontend_macro::system_catalog;
+
+/// Provides a mapping from `actor_id` to its ddl info.
+#[system_catalog(
+view,
+"rw_catalog.rw_actor_id_to_ddl",
+"with
+    actor_to_job_id as (select actor_id, a.fragment_id, table_id from rw_fragments f join rw_actors a on f.fragment_id = a.fragment_id),
+    job_id_to_mv as (select actor_id, fragment_id, d.id as job_id, schema_id, 'mv' as ddl_type, name from rw_materialized_views d join actor_to_job_id a on d.id = a.table_id),
+    job_id_to_sink as (select actor_id, fragment_id, d.id as job_id, schema_id, 'sink' as ddl_type, name from rw_sinks d join actor_to_job_id a on d.id = a.table_id),
+    job_id_to_source as (select actor_id, fragment_id, d.id as job_id, schema_id, 'source' as ddl_type, name from rw_sources d join actor_to_job_id a on d.id = a.table_id),
+    job_id_to_table as (select actor_id, fragment_id, d.id as job_id, schema_id, 'table' as ddl_type, name from rw_tables d join actor_to_job_id a on d.id = a.table_id)
+    select * from job_id_to_mv
+    union all select * from job_id_to_sink
+    union all select * from job_id_to_source
+    union all select * from job_id_to_table"
+)]
+#[derive(Fields)]
+struct RwActorIdToDdl {
+    #[primary_key]
+    actor_id: i32,
+    fragment_id: i32,
+    job_id: i32,
+    schema_id: i32,
+    ddl_type: String,
+    name: String,
+}
diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragment_id_to_ddl.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragment_id_to_ddl.rs
new file mode 100644
index 0000000000000..094e85903a31a
--- /dev/null
+++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragment_id_to_ddl.rs
@@ -0,0 +1,40 @@
+// Copyright 2024 RisingWave Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use risingwave_common::types::Fields;
+use risingwave_frontend_macro::system_catalog;
+
+/// Provides a mapping from `fragment_id` to its ddl info.
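+/// A typical lookup, with an illustrative (not real) fragment id:
+/// `select ddl_type, name from rw_catalog.rw_fragment_id_to_ddl where fragment_id = 2001;`.
+/// The companion `rw_actor_id_to_ddl` view above is keyed by `actor_id` in the same way.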
+#[system_catalog( +view, +"rw_catalog.rw_fragment_id_to_ddl", +"with + job_id_to_mv as (select fragment_id, d.id as job_id, schema_id, 'mv' as ddl_type, name from rw_materialized_views d join rw_fragments f on d.id = f.table_id), + job_id_to_sink as (select fragment_id, d.id as job_id, schema_id, 'sink' as ddl_type, name from rw_sinks d join rw_fragments f on d.id = f.table_id), + job_id_to_source as (select fragment_id, d.id as job_id, schema_id, 'source' as ddl_type, name from rw_sources d join rw_fragments f on d.id = f.table_id), + job_id_to_table as (select fragment_id, d.id as job_id, schema_id, 'table' as ddl_type, name from rw_tables d join rw_fragments f on d.id = f.table_id) + select * from job_id_to_mv + union all select * from job_id_to_sink + union all select * from job_id_to_source + union all select * from job_id_to_table" +)] +#[derive(Fields)] +struct RwFragmentIdToDdl { + #[primary_key] + fragment_id: i32, + job_id: i32, + schema_id: i32, + ddl_type: String, + name: String, +} From 759b2228fa4f556d281814afc4842cdd8ba9c23c Mon Sep 17 00:00:00 2001 From: Li0k Date: Thu, 13 Jun 2024 15:15:48 +0800 Subject: [PATCH 08/19] refactor(storage): refactor compaction group manager txn (#17070) --- src/meta/node/src/server.rs | 2 +- src/meta/src/controller/session_params.rs | 2 +- src/meta/src/hummock/compactor_manager.rs | 2 +- src/meta/src/hummock/manager/compaction.rs | 16 +- .../manager/compaction_group_manager.rs | 486 ++++++++---------- src/meta/src/hummock/manager/context.rs | 4 +- src/meta/src/hummock/manager/mod.rs | 14 +- src/meta/src/hummock/manager/tests.rs | 4 +- src/meta/src/hummock/manager/timer_task.rs | 9 +- src/meta/src/manager/cluster.rs | 2 +- src/meta/src/manager/env.rs | 4 +- src/meta/src/model/catalog.rs | 2 +- src/storage/hummock_sdk/src/version.rs | 4 +- 13 files changed, 255 insertions(+), 296 deletions(-) diff --git a/src/meta/node/src/server.rs b/src/meta/node/src/server.rs index 8802ebb37951e..e8b738305dce7 100644 --- a/src/meta/node/src/server.rs +++ b/src/meta/node/src/server.rs @@ -659,7 +659,7 @@ pub async fn start_service_as_election_leader( ); let health_srv = HealthServiceImpl::new(); let backup_srv = BackupServiceImpl::new(backup_manager); - let telemetry_srv = TelemetryInfoServiceImpl::new(env.meta_store_ref()); + let telemetry_srv = TelemetryInfoServiceImpl::new(env.meta_store()); let system_params_srv = SystemParamsServiceImpl::new(env.system_params_manager_impl_ref()); let session_params_srv = SessionParamsServiceImpl::new(env.session_params_manager_impl_ref()); let serving_srv = diff --git a/src/meta/src/controller/session_params.rs b/src/meta/src/controller/session_params.rs index 4a27967fa2b0a..566170a0ef4d2 100644 --- a/src/meta/src/controller/session_params.rs +++ b/src/meta/src/controller/session_params.rs @@ -151,7 +151,7 @@ mod tests { use sea_orm::QueryFilter; let env = MetaSrvEnv::for_test_with_sql_meta_store().await; - let meta_store = env.meta_store().as_sql(); + let meta_store = env.meta_store_ref().as_sql(); let init_params = SessionConfig::default(); // init system parameter controller as first launch. diff --git a/src/meta/src/hummock/compactor_manager.rs b/src/meta/src/hummock/compactor_manager.rs index 0876f31d211e3..ab0c868f703c4 100644 --- a/src/meta/src/hummock/compactor_manager.rs +++ b/src/meta/src/hummock/compactor_manager.rs @@ -138,7 +138,7 @@ impl CompactorManagerInner { use risingwave_meta_model_v2::compaction_task; use sea_orm::EntityTrait; // Retrieve the existing task assignments from metastore. 
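        // `env.meta_store_ref()` returns `&MetaStoreImpl`, an enum over the KV
        // and SQL meta-store backends, so the listing below dispatches per
        // backend.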
-        let task_assignment: Vec<CompactTaskAssignment> = match env.meta_store() {
+        let task_assignment: Vec<CompactTaskAssignment> = match env.meta_store_ref() {
             MetaStoreImpl::Kv(meta_store) => CompactTaskAssignment::list(meta_store).await?,
             MetaStoreImpl::Sql(sql_meta_store) => compaction_task::Entity::find()
                 .all(&sql_meta_store.conn)
                 .await
diff --git a/src/meta/src/hummock/manager/compaction.rs b/src/meta/src/hummock/manager/compaction.rs
index 7a27061c8d373..1969fc8ffc348 100644
--- a/src/meta/src/hummock/manager/compaction.rs
+++ b/src/meta/src/hummock/manager/compaction.rs
@@ -704,15 +704,15 @@ impl HummockManager {
             // When the last table of a compaction group is deleted, the compaction group (and its
             // config) is destroyed as well. Then a compaction task for this group may come later and
             // cannot find its config.
-            let group_config = match self
-                .compaction_group_manager
-                .read()
-                .await
-                .try_get_compaction_group_config(compaction_group_id)
-            {
-                Some(config) => config,
-                None => continue,
+            let group_config = {
+                let config_manager = self.compaction_group_manager.read().await;
+
+                match config_manager.try_get_compaction_group_config(compaction_group_id) {
+                    Some(config) => config,
+                    None => continue,
+                }
             };
+
             // StoredIdGenerator already implements ids pre-allocation by ID_PREALLOCATE_INTERVAL.
             let task_id = next_compaction_task_id(&self.env).await?;
diff --git a/src/meta/src/hummock/manager/compaction_group_manager.rs b/src/meta/src/hummock/manager/compaction_group_manager.rs
index 063ce4e0a58b1..3a3d596844e95 100644
--- a/src/meta/src/hummock/manager/compaction_group_manager.rs
+++ b/src/meta/src/hummock/manager/compaction_group_manager.rs
@@ -42,17 +42,15 @@ use crate::hummock::compaction::compaction_config::{
 };
 use crate::hummock::error::{Error, Result};
 use crate::hummock::manager::transaction::HummockVersionTransaction;
-use crate::hummock::manager::versioning::{calc_new_write_limits, Versioning};
+use crate::hummock::manager::versioning::Versioning;
 use crate::hummock::manager::{commit_multi_var, HummockManager};
-use crate::hummock::metrics_utils::{
-    remove_compaction_group_in_sst_stat, trigger_write_stop_stats,
-};
+use crate::hummock::metrics_utils::remove_compaction_group_in_sst_stat;
 use crate::hummock::model::CompactionGroup;
 use crate::hummock::sequence::{next_compaction_group_id, next_sstable_object_id};
 use crate::manager::{MetaSrvEnv, MetaStoreImpl};
-use crate::model::{
-    BTreeMapEntryTransaction, BTreeMapTransaction, MetadataModel, MetadataModelError,
-};
+use crate::model::{BTreeMapTransaction, MetadataModel, MetadataModelError};
+
+type CompactionGroupTransaction<'a> = BTreeMapTransaction<'a, CompactionGroupId, CompactionGroup>;
 
 impl CompactionGroupManager {
     pub(super) async fn new(env: &MetaSrvEnv) -> Result<CompactionGroupManager> {
@@ -69,13 +67,38 @@ impl CompactionGroupManager {
     ) -> Result<CompactionGroupManager> {
         let mut compaction_group_manager = CompactionGroupManager {
             compaction_groups: BTreeMap::new(),
-            default_config,
+            default_config: Arc::new(default_config),
             write_limit: Default::default(),
-            meta_store_impl: env.meta_store_ref(),
         };
-        compaction_group_manager.init().await?;
+
+        let loaded_compaction_groups: BTreeMap<CompactionGroupId, CompactionGroup> =
+            match env.meta_store_ref() {
+                MetaStoreImpl::Kv(meta_store) => CompactionGroup::list(meta_store)
+                    .await?
+                    .into_iter()
+                    .map(|cg| (cg.group_id(), cg))
+                    .collect(),
+                MetaStoreImpl::Sql(sql_meta_store) => {
+                    use sea_orm::EntityTrait;
+                    compaction_config::Entity::find()
+                        .all(&sql_meta_store.conn)
+                        .await
+                        .map_err(MetadataModelError::from)?
+ .into_iter() + .map(|m| (m.compaction_group_id as CompactionGroupId, m.into())) + .collect() + } + }; + + compaction_group_manager.init(loaded_compaction_groups); Ok(compaction_group_manager) } + + fn init(&mut self, loaded_compaction_groups: BTreeMap) { + if !loaded_compaction_groups.is_empty() { + self.compaction_groups = loaded_compaction_groups; + } + } } impl HummockManager { @@ -167,7 +190,10 @@ impl HummockManager { } let mut versioning_guard = self.versioning.write().await; let versioning = versioning_guard.deref_mut(); + let mut compaction_group_manager = self.compaction_group_manager.write().await; let current_version = &versioning.current_version; + let default_config = compaction_group_manager.default_compaction_config(); + let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); for (table_id, _) in pairs { if let Some(old_group) = @@ -210,15 +236,17 @@ impl HummockManager { .entry(group_id) .or_default() .group_deltas; - let config = self - .compaction_group_manager - .write() - .await - .get_or_insert_compaction_group_config(group_id) - .await? - .compaction_config - .as_ref() - .clone(); + + let config = + match compaction_groups_txn.try_get_compaction_group_config(group_id) { + Some(config) => config.compaction_config.as_ref().clone(), + None => { + compaction_groups_txn + .create_compaction_groups(group_id, default_config.clone()); + default_config.as_ref().clone() + } + }; + group_deltas.push(GroupDelta { delta_type: Some(DeltaType::GroupConstruct(GroupConstruct { group_config: Some(config), @@ -233,6 +261,7 @@ impl HummockManager { .entry(group_id) .or_default() .group_deltas; + group_deltas.push(GroupDelta { delta_type: Some(DeltaType::GroupMetaChange(GroupMetaChange { table_ids_add: vec![*table_id], @@ -251,7 +280,7 @@ impl HummockManager { .is_none()); } new_version_delta.pre_apply(); - commit_multi_var!(self.meta_store_ref(), version)?; + commit_multi_var!(self.meta_store_ref(), version, compaction_groups_txn)?; Ok(()) } @@ -334,22 +363,23 @@ impl HummockManager { delta_type: Some(DeltaType::GroupDestroy(GroupDestroy {})), }); } - new_version_delta.pre_apply(); - commit_multi_var!(self.meta_store_ref(), version)?; for (group_id, max_level) in groups_to_remove { remove_compaction_group_in_sst_stat(&self.metrics, group_id, max_level); } + new_version_delta.pre_apply(); + // Purge may cause write to meta store. If it hurts performance while holding versioning // lock, consider to make it in batch. 
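// A minimal, self-contained sketch of the copy-on-write pattern behind
// `CompactionGroupTransaction` (an alias of `BTreeMapTransaction`): mutations
// are staged against a base map and applied only on commit, which is what lets
// `commit_multi_var!` make the version delta and the config changes atomic.
// This is a simplified stand-in, not the real implementation.
use std::collections::BTreeMap;

struct MapTxnSketch<'a, K: Ord, V> {
    base: &'a mut BTreeMap<K, V>,
    // `None` marks a staged deletion.
    staged: BTreeMap<K, Option<V>>,
}

impl<'a, K: Ord, V> MapTxnSketch<'a, K, V> {
    fn new(base: &'a mut BTreeMap<K, V>) -> Self {
        Self { base, staged: BTreeMap::new() }
    }

    fn insert(&mut self, k: K, v: V) {
        self.staged.insert(k, Some(v));
    }

    fn remove(&mut self, k: K) {
        self.staged.insert(k, None);
    }

    // Dropping the transaction without calling `commit` discards staged changes.
    fn commit(self) {
        let Self { base, staged } = self;
        for (k, v) in staged {
            match v {
                Some(v) => {
                    base.insert(k, v);
                }
                None => {
                    base.remove(&k);
                }
            }
        }
    }
}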
-        self.compaction_group_manager
-            .write()
-            .await
-            .purge(HashSet::from_iter(get_compaction_group_ids(
-                &versioning.current_version,
-            )))
-            .await?;
+        let mut compaction_group_manager = self.compaction_group_manager.write().await;
+        let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn();
+
+        compaction_groups_txn.purge(HashSet::from_iter(get_compaction_group_ids(
+            version.latest_version(),
+        )));
+        commit_multi_var!(self.meta_store_ref(), version, compaction_groups_txn)?;
+
         Ok(())
     }
@@ -357,21 +387,25 @@
         &self,
         compaction_group_ids: &[CompactionGroupId],
         config_to_update: &[MutableConfig],
-    ) -> Result<Vec<CompactionGroup>> {
-        let result = self
-            .compaction_group_manager
-            .write()
-            .await
-            .update_compaction_config(compaction_group_ids, config_to_update)
-            .await?;
+    ) -> Result<()> {
+        {
+            // Avoid lock conflicts with `try_update_write_limits`.
+            let mut compaction_group_manager = self.compaction_group_manager.write().await;
+            let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn();
+            compaction_groups_txn
+                .update_compaction_config(compaction_group_ids, config_to_update)?;
+            commit_multi_var!(self.meta_store_ref(), compaction_groups_txn)?;
+        }
+
         if config_to_update
             .iter()
             .any(|c| matches!(c, MutableConfig::Level0StopWriteThresholdSubLevelNumber(_)))
         {
+            // Update write limits with lock
             self.try_update_write_limits(compaction_group_ids).await;
         }
-        Ok(result)
+
+        Ok(())
     }
 
     /// Gets complete compaction group info.
@@ -380,24 +414,25 @@
         let mut versioning_guard = self.versioning.write().await;
         let versioning = versioning_guard.deref_mut();
         let current_version = &versioning.current_version;
-        let mut compaction_groups = vec![];
+        let mut results = vec![];
+        let compaction_group_manager = self.compaction_group_manager.read().await;
+
         for levels in current_version.levels.values() {
-            let config = self
-                .compaction_group_manager
-                .read()
-                .await
+            let compaction_config = compaction_group_manager
                 .try_get_compaction_group_config(levels.group_id)
                 .unwrap()
-                .compaction_config;
+                .compaction_config
+                .as_ref()
+                .clone();
             let group = CompactionGroupInfo {
                 id: levels.group_id,
                 parent_id: levels.parent_group_id,
                 member_table_ids: levels.member_table_ids.clone(),
-                compaction_config: Some(config.as_ref().clone()),
+                compaction_config: Some(compaction_config),
             };
-            compaction_groups.push(group);
+            results.push(group);
         }
-        compaction_groups
+        results
     }
 
     /// Splits a compaction group into two. The new one will contain `table_ids`.
@@ -415,21 +450,19 @@
             )
             .await?;
 
-        Ok(result.0)
+        Ok(result)
     }
 
     /// Moves some tables to another compaction group. Creates a new compaction group if it does not
     /// exist.
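    /// Returns the id of the compaction group the given tables end up in.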
-    /// TODO: Move `table_to_partition` in result to compaction group
     pub async fn move_state_table_to_compaction_group(
         &self,
         parent_group_id: CompactionGroupId,
         table_ids: &[StateTableId],
         partition_vnode_count: u32,
-    ) -> Result<(CompactionGroupId, BTreeMap<StateTableId, u32>)> {
-        let mut table_to_partition = BTreeMap::default();
+    ) -> Result<CompactionGroupId> {
         if table_ids.is_empty() {
-            return Ok((parent_group_id, table_to_partition));
+            return Ok(parent_group_id);
         }
         let table_ids = table_ids.iter().cloned().unique().collect_vec();
         let compaction_guard = self.compaction.write().await;
@@ -483,7 +516,9 @@ impl HummockManager {
             .compaction_group_manager
             .read()
             .await
-            .default_compaction_config();
+            .default_compaction_config()
+            .as_ref()
+            .clone();
         config.split_weight_by_vnode = partition_vnode_count;
 
         new_version_delta.group_deltas.insert(
@@ -520,20 +555,12 @@ impl HummockManager {
         let (new_compaction_group_id, config) = new_group;
         {
             let mut compaction_group_manager = self.compaction_group_manager.write().await;
-            let insert = BTreeMapEntryTransaction::new_insert(
-                &mut compaction_group_manager.compaction_groups,
-                new_compaction_group_id,
-                CompactionGroup {
-                    group_id: new_compaction_group_id,
-                    compaction_config: Arc::new(config),
-                },
-            );
+            let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn();
+            compaction_groups_txn
+                .create_compaction_groups(new_compaction_group_id, Arc::new(config));
+
             new_version_delta.pre_apply();
-            commit_multi_var!(self.meta_store_ref(), version, insert)?;
-            // Currently, only splitting out a single table_id is supported.
-            for table_id in table_ids {
-                table_to_partition.insert(table_id, partition_vnode_count);
-            }
+            commit_multi_var!(self.meta_store_ref(), version, compaction_groups_txn)?;
         }
 
         let mut canceled_tasks = vec![];
@@ -571,7 +598,7 @@
             .with_label_values(&[&parent_group_id.to_string()])
             .inc();
 
-        Ok((target_compaction_group_id, table_to_partition))
+        Ok(target_compaction_group_id)
     }
 
     pub async fn calculate_compaction_group_statistic(&self) -> Vec<TableGroupInfo> {
@@ -614,71 +641,11 @@
     ) -> Result<()> {
         // 1. Due to version compatibility, we fix some of the configuration of older versions after hummock starts.
         let current_version = &versioning_guard.current_version;
-        let all_group_ids = get_compaction_group_ids(current_version);
-        let mut configs = compaction_group_manager
-            .get_or_insert_compaction_group_configs(&all_group_ids.collect_vec())
-            .await?;
-
-        // We've already lowered the default limit for write limit in PR-12183, and to prevent older clusters from continuing to use the outdated configuration, we've introduced a new logic to rewrite it in a uniform way.
- let mut rewrite_cg_ids = vec![]; - let mut restore_cg_to_partition_vnode: HashMap> = - HashMap::default(); - for (cg_id, compaction_group_config) in &mut configs { - // update write limit - let relaxed_default_write_stop_level_count = 1000; - if compaction_group_config - .compaction_config - .level0_sub_level_compact_level_count - == relaxed_default_write_stop_level_count - { - rewrite_cg_ids.push(*cg_id); - } - - if let Some(levels) = current_version.levels.get(cg_id) { - if levels.member_table_ids.len() == 1 { - restore_cg_to_partition_vnode.insert( - *cg_id, - vec![( - levels.member_table_ids[0], - compaction_group_config - .compaction_config - .split_weight_by_vnode, - )] - .into_iter() - .collect(), - ); - } - } - } - - if !rewrite_cg_ids.is_empty() { - tracing::info!("Compaction group {:?} configs rewrite ", rewrite_cg_ids); - - // update meta store - let result = compaction_group_manager - .update_compaction_config( - &rewrite_cg_ids, - &[ - MutableConfig::Level0StopWriteThresholdSubLevelNumber( - risingwave_common::config::default::compaction_config::level0_stop_write_threshold_sub_level_number(), - ), - ], - ) - .await?; - - // update memory - for new_config in result { - configs.insert(new_config.group_id(), new_config); - } - } - - compaction_group_manager.write_limit = - calc_new_write_limits(configs, HashMap::new(), &versioning_guard.current_version); - trigger_write_stop_stats(&self.metrics, &compaction_group_manager.write_limit); - tracing::debug!( - "Hummock stopped write: {:#?}", - compaction_group_manager.write_limit - ); + let all_group_ids = get_compaction_group_ids(current_version).collect_vec(); + let default_config = compaction_group_manager.default_compaction_config(); + let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); + compaction_groups_txn.try_create_compaction_groups(&all_group_ids, default_config); + commit_multi_var!(self.meta_store_ref(), compaction_groups_txn)?; Ok(()) } @@ -693,91 +660,15 @@ impl HummockManager { /// 3. move existent table to new compaction group. pub(super) struct CompactionGroupManager { compaction_groups: BTreeMap, - default_config: CompactionConfig, + default_config: Arc, /// Tables that write limit is trigger for. pub write_limit: HashMap, - meta_store_impl: MetaStoreImpl, -} - -// init method -impl CompactionGroupManager { - async fn init(&mut self) -> Result<()> { - let loaded_compaction_groups: BTreeMap = - match &self.meta_store_impl { - MetaStoreImpl::Kv(meta_store) => CompactionGroup::list(meta_store) - .await? - .into_iter() - .map(|cg| (cg.group_id(), cg)) - .collect(), - MetaStoreImpl::Sql(sql_meta_store) => { - use sea_orm::EntityTrait; - compaction_config::Entity::find() - .all(&sql_meta_store.conn) - .await - .map_err(MetadataModelError::from)? - .into_iter() - .map(|m| (m.compaction_group_id as CompactionGroupId, m.into())) - .collect() - } - }; - if !loaded_compaction_groups.is_empty() { - self.compaction_groups = loaded_compaction_groups; - } - Ok(()) - } - - /// Initializes the config for a group. - /// Should only be used by compaction test. 
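    // Replay initialization is now done through `start_compaction_groups_txn()`
    // and `create_compaction_groups(...)`, committed via `commit_multi_var!`
    // (see `init_metadata_for_version_replay` in manager/mod.rs below).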
-    pub(super) async fn init_compaction_config_for_replay(
-        &mut self,
-        group_id: CompactionGroupId,
-        config: CompactionConfig,
-    ) -> Result<()> {
-        let insert = BTreeMapEntryTransaction::new_insert(
-            &mut self.compaction_groups,
-            group_id,
-            CompactionGroup {
-                group_id,
-                compaction_config: Arc::new(config),
-            },
-        );
-        commit_multi_var!(self.meta_store_impl, insert)?;
-        Ok(())
-    }
 }
 
 impl CompactionGroupManager {
-    /// Gets compaction group config for `compaction_group_id`, inserts default one if missing.
-    async fn get_or_insert_compaction_group_config(
-        &mut self,
-        compaction_group_id: CompactionGroupId,
-    ) -> Result<CompactionGroup> {
-        let r = self
-            .get_or_insert_compaction_group_configs(&[compaction_group_id])
-            .await?;
-        Ok(r.into_values().next().unwrap())
-    }
-
-    /// Gets compaction group configs for `compaction_group_ids`, inserts default one if missing.
-    async fn get_or_insert_compaction_group_configs(
-        &mut self,
-        compaction_group_ids: &[CompactionGroupId],
-    ) -> Result<BTreeMap<CompactionGroupId, CompactionGroup>> {
-        let mut compaction_groups = BTreeMapTransaction::new(&mut self.compaction_groups);
-        for id in compaction_group_ids {
-            if compaction_groups.contains_key(id) {
-                continue;
-            }
-            let new_entry = CompactionGroup::new(*id, self.default_config.clone());
-            compaction_groups.insert(*id, new_entry);
-        }
-        commit_multi_var!(self.meta_store_impl, compaction_groups)?;
-
-        let r = compaction_group_ids
-            .iter()
-            .map(|id| (*id, self.compaction_groups[id].clone()))
-            .collect();
-        Ok(r)
+    /// Starts a transaction to update compaction group configs.
+    pub fn start_compaction_groups_txn(&mut self) -> CompactionGroupTransaction<'_> {
+        CompactionGroupTransaction::new(&mut self.compaction_groups)
     }
 
     /// Tries to get compaction group config for `compaction_group_id`.
@@ -788,53 +679,10 @@ impl CompactionGroupManager {
         self.compaction_groups.get(&compaction_group_id).cloned()
     }
 
-    pub(super) fn default_compaction_config(&self) -> CompactionConfig {
+    /// Returns the default compaction config.
+    pub(super) fn default_compaction_config(&self) -> Arc<CompactionConfig> {
         self.default_config.clone()
     }
-
-    pub(super) async fn update_compaction_config(
-        &mut self,
-        compaction_group_ids: &[CompactionGroupId],
-        config_to_update: &[MutableConfig],
-    ) -> Result<Vec<CompactionGroup>> {
-        let mut compaction_groups = BTreeMapTransaction::new(&mut self.compaction_groups);
-        let mut result = Vec::with_capacity(compaction_group_ids.len());
-        for compaction_group_id in compaction_group_ids.iter().unique() {
-            let group = compaction_groups.get(compaction_group_id).ok_or_else(|| {
-                Error::CompactionGroup(format!("invalid group {}", *compaction_group_id))
-            })?;
-            let mut config = group.compaction_config.as_ref().clone();
-            update_compaction_config(&mut config, config_to_update);
-            if let Err(reason) = validate_compaction_config(&config) {
-                return Err(Error::CompactionGroup(reason));
-            }
-            let mut new_group = group.clone();
-            new_group.compaction_config = Arc::new(config);
-            compaction_groups.insert(*compaction_group_id, new_group.clone());
-            result.push(new_group);
-        }
-        commit_multi_var!(self.meta_store_impl, compaction_groups)?;
-        Ok(result)
-    }
-
-    /// Removes stale group configs.
- async fn purge(&mut self, existing_groups: HashSet) -> Result<()> { - let mut compaction_groups = BTreeMapTransaction::new(&mut self.compaction_groups); - let stale_group = compaction_groups - .tree_ref() - .keys() - .cloned() - .filter(|k| !existing_groups.contains(k)) - .collect_vec(); - if stale_group.is_empty() { - return Ok(()); - } - for group in stale_group { - compaction_groups.remove(group); - } - commit_multi_var!(self.meta_store_impl, compaction_groups)?; - Ok(()) - } } fn update_compaction_config(target: &mut CompactionConfig, items: &[MutableConfig]) { @@ -896,6 +744,84 @@ fn update_compaction_config(target: &mut CompactionConfig, items: &[MutableConfi } } +impl<'a> CompactionGroupTransaction<'a> { + /// Inserts compaction group configs if they do not exist. + pub fn try_create_compaction_groups( + &mut self, + compaction_group_ids: &[CompactionGroupId], + config: Arc, + ) -> bool { + let mut trivial = true; + for id in compaction_group_ids { + if self.contains_key(id) { + continue; + } + let new_entry = CompactionGroup::new(*id, config.as_ref().clone()); + self.insert(*id, new_entry); + + trivial = false; + } + + !trivial + } + + pub fn create_compaction_groups( + &mut self, + compaction_group_id: CompactionGroupId, + config: Arc, + ) { + self.try_create_compaction_groups(&[compaction_group_id], config); + } + + /// Tries to get compaction group config for `compaction_group_id`. + pub(super) fn try_get_compaction_group_config( + &self, + compaction_group_id: CompactionGroupId, + ) -> Option<&CompactionGroup> { + self.get(&compaction_group_id) + } + + /// Removes stale group configs. + fn purge(&mut self, existing_groups: HashSet) { + let stale_group = self + .tree_ref() + .keys() + .cloned() + .filter(|k| !existing_groups.contains(k)) + .collect_vec(); + if stale_group.is_empty() { + return; + } + for group in stale_group { + self.remove(group); + } + } + + pub(super) fn update_compaction_config( + &mut self, + compaction_group_ids: &[CompactionGroupId], + config_to_update: &[MutableConfig], + ) -> Result> { + let mut results = HashMap::default(); + for compaction_group_id in compaction_group_ids.iter().unique() { + let group = self.get(compaction_group_id).ok_or_else(|| { + Error::CompactionGroup(format!("invalid group {}", *compaction_group_id)) + })?; + let mut config = group.compaction_config.as_ref().clone(); + update_compaction_config(&mut config, config_to_update); + if let Err(reason) = validate_compaction_config(&config) { + return Err(Error::CompactionGroup(reason)); + } + let mut new_group = group.clone(); + new_group.compaction_config = Arc::new(config); + self.insert(*compaction_group_id, new_group.clone()); + results.insert(new_group.group_id(), new_group); + } + + Ok(results) + } +} + #[cfg(test)] mod tests { use std::collections::BTreeMap; @@ -905,8 +831,11 @@ mod tests { use risingwave_pb::hummock::rise_ctl_update_compaction_config_request::mutable_config::MutableConfig; use risingwave_pb::meta::table_fragments::Fragment; + use crate::hummock::commit_multi_var; + use crate::hummock::error::Result; use crate::hummock::manager::compaction_group_manager::CompactionGroupManager; use crate::hummock::test_utils::setup_compute_env; + use crate::manager::MetaStoreImpl; use crate::model::TableFragments; #[tokio::test] @@ -914,21 +843,46 @@ mod tests { let (env, ..) 
= setup_compute_env(8080).await; let mut inner = CompactionGroupManager::new(&env).await.unwrap(); assert_eq!(inner.compaction_groups.len(), 2); - inner - .update_compaction_config(&[100, 200], &[]) + + async fn update_compaction_config( + meta: &MetaStoreImpl, + inner: &mut CompactionGroupManager, + cg_ids: &[u64], + config_to_update: &[MutableConfig], + ) -> Result<()> { + let mut compaction_groups_txn = inner.start_compaction_groups_txn(); + compaction_groups_txn.update_compaction_config(cg_ids, config_to_update)?; + commit_multi_var!(meta, compaction_groups_txn) + } + + async fn insert_compaction_group_configs( + meta: &MetaStoreImpl, + inner: &mut CompactionGroupManager, + cg_ids: &[u64], + ) { + let default_config = inner.default_compaction_config(); + let mut compaction_groups_txn = inner.start_compaction_groups_txn(); + if compaction_groups_txn.try_create_compaction_groups(cg_ids, default_config) { + commit_multi_var!(meta, compaction_groups_txn).unwrap(); + } + } + + update_compaction_config(env.meta_store_ref(), &mut inner, &[100, 200], &[]) .await .unwrap_err(); - inner - .get_or_insert_compaction_group_configs(&[100, 200]) - .await - .unwrap(); + insert_compaction_group_configs(env.meta_store_ref(), &mut inner, &[100, 200]).await; assert_eq!(inner.compaction_groups.len(), 4); let mut inner = CompactionGroupManager::new(&env).await.unwrap(); assert_eq!(inner.compaction_groups.len(), 4); - inner - .update_compaction_config(&[100, 200], &[MutableConfig::MaxSubCompaction(123)]) - .await - .unwrap(); + + update_compaction_config( + env.meta_store_ref(), + &mut inner, + &[100, 200], + &[MutableConfig::MaxSubCompaction(123)], + ) + .await + .unwrap(); assert_eq!(inner.compaction_groups.len(), 4); assert_eq!( inner diff --git a/src/meta/src/hummock/manager/context.rs b/src/meta/src/hummock/manager/context.rs index 155dd362c0909..982a94fd5f9db 100644 --- a/src/meta/src/hummock/manager/context.rs +++ b/src/meta/src/hummock/manager/context.rs @@ -99,7 +99,7 @@ impl HummockManager { ) -> Result<()> { let mut context_info = self.context_info.write().await; context_info - .release_contexts(context_ids, self.meta_store_ref()) + .release_contexts(context_ids, self.env.meta_store()) .await?; #[cfg(test)] { @@ -188,7 +188,7 @@ impl HummockManager { } context_info - .release_contexts(&invalid_context_ids, self.meta_store_ref()) + .release_contexts(&invalid_context_ids, self.env.meta_store()) .await?; Ok(invalid_context_ids) diff --git a/src/meta/src/hummock/manager/mod.rs b/src/meta/src/hummock/manager/mod.rs index 47209ddf1fff2..8a49d91a55fc3 100644 --- a/src/meta/src/hummock/manager/mod.rs +++ b/src/meta/src/hummock/manager/mod.rs @@ -297,7 +297,7 @@ impl HummockManager { Ok(instance) } - fn meta_store_ref(&self) -> MetaStoreImpl { + fn meta_store_ref(&self) -> &MetaStoreImpl { self.env.meta_store_ref() } @@ -495,22 +495,22 @@ impl HummockManager { ); } + let mut compaction_group_manager = self.compaction_group_manager.write().await; + let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); for group in &compaction_groups { let mut pairs = vec![]; for table_id in group.member_table_ids.clone() { pairs.push((table_id as StateTableId, group.id)); } let group_config = group.compaction_config.clone().unwrap(); - self.compaction_group_manager - .write() - .await - .init_compaction_config_for_replay(group.id, group_config) - .await - .unwrap(); + compaction_groups_txn.create_compaction_groups(group.id, Arc::new(group_config)); + 
self.register_table_ids_for_test(&pairs).await?; tracing::info!("Registered table ids {:?}", pairs); } + commit_multi_var!(self.meta_store_ref(), compaction_groups_txn)?; + // Notify that tables have created for table in table_catalogs { self.env diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index 90ba6db61de4e..5b62e5f0694b7 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -100,7 +100,7 @@ fn get_compaction_group_object_ids( } async fn list_pinned_snapshot_from_meta_store(env: &MetaSrvEnv) -> Vec { - match env.meta_store() { + match env.meta_store_ref() { MetaStoreImpl::Kv(meta_store) => HummockPinnedSnapshot::list(meta_store).await.unwrap(), MetaStoreImpl::Sql(sql_meta_store) => { use risingwave_meta_model_v2::hummock_pinned_snapshot; @@ -117,7 +117,7 @@ async fn list_pinned_snapshot_from_meta_store(env: &MetaSrvEnv) -> Vec Vec { - match env.meta_store() { + match env.meta_store_ref() { MetaStoreImpl::Kv(meta_store) => HummockPinnedVersion::list(meta_store).await.unwrap(), MetaStoreImpl::Sql(sql_meta_store) => { use risingwave_meta_model_v2::hummock_pinned_version; diff --git a/src/meta/src/hummock/manager/timer_task.rs b/src/meta/src/hummock/manager/timer_task.rs index 6b68950a28e3b..b7c8cae4b260e 100644 --- a/src/meta/src/hummock/manager/timer_task.rs +++ b/src/meta/src/hummock/manager/timer_task.rs @@ -565,8 +565,13 @@ impl HummockManager { ) .await; match ret { - Ok((new_group_id, table_vnode_partition_count)) => { - tracing::info!("move state table [{}] from group-{} to group-{} success table_vnode_partition_count {:?}", table_id, parent_group_id, new_group_id, table_vnode_partition_count); + Ok(new_group_id) => { + tracing::info!( + "move state table [{}] from group-{} to group-{} success", + table_id, + parent_group_id, + new_group_id + ); } Err(e) => { tracing::info!( diff --git a/src/meta/src/manager/cluster.rs b/src/meta/src/manager/cluster.rs index 4dd4a257c5b45..876050c36ae6c 100644 --- a/src/meta/src/manager/cluster.rs +++ b/src/meta/src/manager/cluster.rs @@ -568,7 +568,7 @@ impl ClusterManagerCore { pub const MAX_WORKER_REUSABLE_ID_COUNT: usize = 1 << Self::MAX_WORKER_REUSABLE_ID_BITS; async fn new(env: MetaSrvEnv) -> MetaResult { - let meta_store = env.meta_store().as_kv(); + let meta_store = env.meta_store_ref().as_kv(); let mut workers = Worker::list(meta_store).await?; let used_transactional_ids: HashSet<_> = workers diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs index 0b0c2ed2a3ced..b623e441c0c22 100644 --- a/src/meta/src/manager/env.rs +++ b/src/meta/src/manager/env.rs @@ -462,11 +462,11 @@ impl MetaSrvEnv { Ok(env) } - pub fn meta_store_ref(&self) -> MetaStoreImpl { + pub fn meta_store(&self) -> MetaStoreImpl { self.meta_store_impl.clone() } - pub fn meta_store(&self) -> &MetaStoreImpl { + pub fn meta_store_ref(&self) -> &MetaStoreImpl { &self.meta_store_impl } diff --git a/src/meta/src/model/catalog.rs b/src/meta/src/model/catalog.rs index c11be01d1a599..8d89080ae2462 100644 --- a/src/meta/src/model/catalog.rs +++ b/src/meta/src/model/catalog.rs @@ -96,7 +96,7 @@ mod tests { #[tokio::test] async fn test_database() -> MetadataModelResult<()> { let env = MetaSrvEnv::for_test().await; - let store = env.meta_store().as_kv(); + let store = env.meta_store_ref().as_kv(); let databases = Database::list(store).await?; assert!(databases.is_empty()); assert!(Database::select(store, &0).await.unwrap().is_none()); diff --git 
a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index 51780b1bc7334..b30c1ba0e5191 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -266,7 +266,7 @@ impl HummockVersion { self.safe_epoch } - pub fn create_init_version(default_compaction_config: CompactionConfig) -> HummockVersion { + pub fn create_init_version(default_compaction_config: Arc) -> HummockVersion { let mut init_version = HummockVersion { id: FIRST_VERSION_ID, levels: Default::default(), @@ -282,7 +282,7 @@ impl HummockVersion { ] { init_version.levels.insert( group_id, - build_initial_compaction_group_levels(group_id, &default_compaction_config), + build_initial_compaction_group_levels(group_id, default_compaction_config.as_ref()), ); } init_version From c2b0ead320066e78af8aadfff94c3ccf40c42be2 Mon Sep 17 00:00:00 2001 From: StrikeW Date: Thu, 13 Jun 2024 15:37:20 +0800 Subject: [PATCH 09/19] chore: bump mysql java library (#17234) --- java/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index f1ee457ef3b84..5f0327bf8ffc9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -83,7 +83,7 @@ 1.18.0 1.17.6 42.5.5 - 8.0.33 + 8.3.0 4.11.1 3.45.0.0 2.21.42 @@ -570,4 +570,4 @@ https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/ - \ No newline at end of file + From 4d32c189e785b00c058d5ff89b7e728445a2af26 Mon Sep 17 00:00:00 2001 From: Tao Wu Date: Thu, 13 Jun 2024 15:45:39 +0800 Subject: [PATCH 10/19] chore: update tokio 0.2.24 to 0.2 (#17233) --- src/tests/e2e_extended_mode/Cargo.toml | 2 +- src/tests/simulation/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/e2e_extended_mode/Cargo.toml b/src/tests/e2e_extended_mode/Cargo.toml index ecbf751fe1004..cbc7831ea7a03 100644 --- a/src/tests/e2e_extended_mode/Cargo.toml +++ b/src/tests/e2e_extended_mode/Cargo.toml @@ -19,7 +19,7 @@ chrono = { version = "0.4", features = ['serde'] } clap = { workspace = true } pg_interval = "0.4" rust_decimal = { version = "1.35", features = ["db-postgres"] } -tokio = { version = "0.2.24", package = "madsim-tokio", features = [ +tokio = { version = "0.2", package = "madsim-tokio", features = [ "rt", "macros", "rt-multi-thread", diff --git a/src/tests/simulation/Cargo.toml b/src/tests/simulation/Cargo.toml index 57768643eb7dc..5fbfc0e19f6fd 100644 --- a/src/tests/simulation/Cargo.toml +++ b/src/tests/simulation/Cargo.toml @@ -53,7 +53,7 @@ serde_json = "1.0.107" sqllogictest = "0.20" tempfile = "3" tikv-jemallocator = { workspace = true } -tokio = { version = "0.2.24", package = "madsim-tokio" } +tokio = { version = "0.2", package = "madsim-tokio" } tokio-postgres = "0.7" tokio-stream = "0.1" tracing = "0.1" From 3b823d0175080be867751b486dc7c572dd51188a Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:47:27 +0800 Subject: [PATCH 11/19] fix(meta): use per table mce and safe epoch in metadata backup (#17227) --- proto/backup_service.proto | 3 ++ .../rw_catalog/rw_meta_snapshot.rs | 31 +++--------- .../meta_snapshot_builder_v2.rs | 2 +- .../src/backup_restore/restore_impl/v2.rs | 46 ++++++++++------- .../backup/integration_tests/common.sh | 14 ++---- .../integration_tests/test_query_backup.sh | 10 ++-- src/storage/backup/src/lib.rs | 49 ++++++++++++++++++- src/storage/src/hummock/backup_reader.rs | 12 ++++- .../src/hummock/store/hummock_storage.rs | 6 ++- 9 files changed, 113 insertions(+), 60 deletions(-) diff 
--git a/proto/backup_service.proto b/proto/backup_service.proto index 48fe46ed7eac2..24d410b38f115 100644 --- a/proto/backup_service.proto +++ b/proto/backup_service.proto @@ -2,6 +2,8 @@ syntax = "proto3"; package backup_service; +import "hummock.proto"; + option java_package = "com.risingwave.proto"; option optimize_for = SPEED; @@ -50,6 +52,7 @@ message MetaSnapshotMetadata { optional uint32 format_version = 5; optional string remarks = 6; optional string rw_version = 7; + map state_table_info = 8; } service BackupService { diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_meta_snapshot.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_meta_snapshot.rs index f31b1f7c67c5c..51df244f9539c 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_meta_snapshot.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_meta_snapshot.rs @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -use risingwave_common::types::{Fields, Timestamp}; -use risingwave_common::util::epoch::Epoch; +use risingwave_common::types::{Fields, JsonbVal}; use risingwave_frontend_macro::system_catalog; +use serde_json::json; use crate::catalog::system_catalog::SysCatalogReaderImpl; use crate::error::Result; @@ -24,30 +24,13 @@ struct RwMetaSnapshot { #[primary_key] meta_snapshot_id: i64, hummock_version_id: i64, - // the smallest epoch this meta snapshot includes - safe_epoch: i64, - // human-readable timestamp of safe_epoch - safe_epoch_ts: Option, - // the largest epoch this meta snapshot includes - max_committed_epoch: i64, - // human-readable timestamp of max_committed_epoch - max_committed_epoch_ts: Option, remarks: Option, + state_table_info: Option, + rw_version: Option, } #[system_catalog(table, "rw_catalog.rw_meta_snapshot")] async fn read_meta_snapshot(reader: &SysCatalogReaderImpl) -> Result> { - let try_get_date_time = |epoch: u64| { - if epoch == 0 { - return None; - } - let time_millis = Epoch::from(epoch).as_unix_millis(); - Timestamp::with_secs_nsecs( - (time_millis / 1000) as i64, - (time_millis % 1000 * 1_000_000) as u32, - ) - .ok() - }; let meta_snapshots = reader .meta_client .list_meta_snapshots() @@ -56,11 +39,9 @@ async fn read_meta_snapshot(reader: &SysCatalogReaderImpl) -> Result BackupError { macro_rules! define_set_metadata { ($( {$name:ident, $mod_path:ident::$mod_name:ident} ),*) => { - pub async fn set_metadata( + async fn set_metadata( metadata: &mut MetadataV2, txn: &sea_orm::DatabaseTransaction, ) -> BackupResult<()> { diff --git a/src/meta/src/backup_restore/restore_impl/v2.rs b/src/meta/src/backup_restore/restore_impl/v2.rs index 13492c56316a2..a431b455063bb 100644 --- a/src/meta/src/backup_restore/restore_impl/v2.rs +++ b/src/meta/src/backup_restore/restore_impl/v2.rs @@ -93,28 +93,40 @@ impl WriterModelV2ToMetaStoreV2 { } } -macro_rules! 
define_write_model_v2_to_meta_store_v2 { - ($( {$name:ident, $mod_path:ident::$mod_name:ident} ),*) => { - async fn write_model_v2_to_meta_store_v2( - metadata: &risingwave_backup::meta_snapshot_v2::MetadataV2, - db: &sea_orm::DatabaseConnection, - ) -> BackupResult<()> { - $( - insert_models(metadata.$name.clone(), db).await?; - )* - Ok(()) - } - }; -} - -risingwave_backup::for_all_metadata_models_v2!(define_write_model_v2_to_meta_store_v2); - #[async_trait::async_trait] impl Writer for WriterModelV2ToMetaStoreV2 { async fn write(&self, target_snapshot: MetaSnapshot) -> BackupResult<()> { let metadata = target_snapshot.metadata; let db = &self.meta_store.conn; - write_model_v2_to_meta_store_v2(&metadata, db).await?; + insert_models(metadata.seaql_migrations.clone(), db).await?; + insert_models(metadata.clusters.clone(), db).await?; + insert_models(metadata.version_stats.clone(), db).await?; + insert_models(metadata.compaction_configs.clone(), db).await?; + insert_models(metadata.hummock_sequences.clone(), db).await?; + insert_models(metadata.workers.clone(), db).await?; + insert_models(metadata.worker_properties.clone(), db).await?; + insert_models(metadata.users.clone(), db).await?; + insert_models(metadata.user_privileges.clone(), db).await?; + insert_models(metadata.objects.clone(), db).await?; + insert_models(metadata.object_dependencies.clone(), db).await?; + insert_models(metadata.databases.clone(), db).await?; + insert_models(metadata.schemas.clone(), db).await?; + insert_models(metadata.streaming_jobs.clone(), db).await?; + insert_models(metadata.fragments.clone(), db).await?; + insert_models(metadata.actors.clone(), db).await?; + insert_models(metadata.actor_dispatchers.clone(), db).await?; + insert_models(metadata.connections.clone(), db).await?; + insert_models(metadata.sources.clone(), db).await?; + insert_models(metadata.tables.clone(), db).await?; + insert_models(metadata.sinks.clone(), db).await?; + insert_models(metadata.views.clone(), db).await?; + insert_models(metadata.indexes.clone(), db).await?; + insert_models(metadata.functions.clone(), db).await?; + insert_models(metadata.system_parameters.clone(), db).await?; + insert_models(metadata.catalog_versions.clone(), db).await?; + insert_models(metadata.subscriptions.clone(), db).await?; + insert_models(metadata.session_parameters.clone(), db).await?; + insert_models(metadata.secrets.clone(), db).await?; // update_auto_inc must be called last. 
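        // The explicit insert order above appears to follow object-dependency
        // order (e.g. `objects` and `object_dependencies` before `databases`,
        // `schemas`, and the relation catalogs); `update_auto_inc` runs last so
        // auto-increment counters account for every restored row.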
update_auto_inc(&metadata, db).await?; Ok(()) diff --git a/src/storage/backup/integration_tests/common.sh b/src/storage/backup/integration_tests/common.sh index d37ea0e7b3c6a..68f37e5d67249 100644 --- a/src/storage/backup/integration_tests/common.sh +++ b/src/storage/backup/integration_tests/common.sh @@ -156,15 +156,15 @@ function execute_sql_and_expect() { } function get_max_committed_epoch() { - mce=$(${BACKUP_TEST_RW_ALL_IN_ONE} risectl hummock list-version --verbose 2>&1 | grep max_committed_epoch | sed -n 's/^.*max_committed_epoch: \(.*\),/\1/p') + mce=$(${BACKUP_TEST_RW_ALL_IN_ONE} risectl hummock list-version --verbose 2>&1 | grep committed_epoch | sed -n 's/^.*committed_epoch: \(.*\),/\1/p') # always take the smallest one echo "${mce}"|sort -n |head -n 1 } function get_safe_epoch() { safe_epoch=$(${BACKUP_TEST_RW_ALL_IN_ONE} risectl hummock list-version --verbose 2>&1 | grep safe_epoch | sed -n 's/^.*safe_epoch: \(.*\),/\1/p') - # always take the smallest one - echo "${safe_epoch}"|sort -n |head -n 1 + # always take the largest one + echo "${safe_epoch}"|sort -n -r |head -n 1 } function get_total_sst_count() { @@ -173,17 +173,13 @@ function get_total_sst_count() { } function get_max_committed_epoch_in_backup() { - local id - id=$1 - sed_str="s/.*{\"id\":${id},\"hummock_version_id\":.*,\"ssts\":\[.*\],\"max_committed_epoch\":\([[:digit:]]*\),\"safe_epoch\":.*}.*/\1/p" + sed_str="s/.*\"state_table_info\":{\"[[:digit:]]*\":{\"committedEpoch\":\"\([[:digit:]]*\)\",\"safeEpoch\":\"\([[:digit:]]*\)\"}.*/\1/p" ${BACKUP_TEST_MCLI} -C "${BACKUP_TEST_MCLI_CONFIG}" \ cat "hummock-minio/hummock001/backup/manifest.json" | sed -n "${sed_str}" } function get_safe_epoch_in_backup() { - local id - id=$1 - sed_str="s/.*{\"id\":${id},\"hummock_version_id\":.*,\"ssts\":\[.*\],\"max_committed_epoch\":.*,\"safe_epoch\":\([[:digit:]]*\).*}.*/\1/p" + sed_str="s/.*\"state_table_info\":{\"[[:digit:]]*\":{\"committedEpoch\":\"\([[:digit:]]*\)\",\"safeEpoch\":\"\([[:digit:]]*\)\"}.*/\2/p" ${BACKUP_TEST_MCLI} -C "${BACKUP_TEST_MCLI_CONFIG}" \ cat "hummock-minio/hummock001/backup/manifest.json" | sed -n "${sed_str}" } diff --git a/src/storage/backup/integration_tests/test_query_backup.sh b/src/storage/backup/integration_tests/test_query_backup.sh index b08216267f27d..dbba68c7e8564 100644 --- a/src/storage/backup/integration_tests/test_query_backup.sh +++ b/src/storage/backup/integration_tests/test_query_backup.sh @@ -24,8 +24,8 @@ select * from t1; job_id=$(backup) echo "${job_id}" -backup_mce=$(get_max_committed_epoch_in_backup "${job_id}") -backup_safe_epoch=$(get_safe_epoch_in_backup "${job_id}") +backup_mce=$(get_max_committed_epoch_in_backup) +backup_safe_epoch=$(get_safe_epoch_in_backup) echo "backup MCE: ${backup_mce}" echo "backup safe_epoch: ${backup_safe_epoch}" @@ -55,10 +55,10 @@ do sleep 5 min_pinned_snapshot=$(get_min_pinned_snapshot) done -# safe epoch equals to 0 because no compaction has been done +# safe epoch equals to backup_safe_epoch because no compaction has been done safe_epoch=$(get_safe_epoch) echo "safe epoch after unpin: ${safe_epoch}" -[ "${safe_epoch}" -eq 0 ] +[ "${safe_epoch}" -eq "${backup_safe_epoch}" ] # trigger a compaction to increase safe_epoch manual_compaction -c 3 -l 0 # wait until compaction is done @@ -68,6 +68,7 @@ do sleep 5 done echo "safe epoch after compaction: ${safe_epoch}" +[ "${safe_epoch}" -gt "${backup_safe_epoch}" ] echo "QUERY_EPOCH=safe_epoch. 
It should fail because it's not covered by any backup"
execute_sql_and_expect \
"SET QUERY_EPOCH TO ${safe_epoch};
select * from t1;" \

echo "QUERY_EPOCH=backup_safe_epoch + 1<<16 + 1, it's < safe_epoch but covered by backup"
epoch=$((backup_safe_epoch + (1<<16) + 1))
-[ ${epoch} -eq 65537 ]
execute_sql_and_expect \
"SET QUERY_EPOCH TO ${epoch};
select * from t1;" \
diff --git a/src/storage/backup/src/lib.rs b/src/storage/backup/src/lib.rs
index ed8dccf8d1e49..6b569277c54dd 100644
--- a/src/storage/backup/src/lib.rs
+++ b/src/storage/backup/src/lib.rs
@@ -32,14 +32,16 @@ pub mod meta_snapshot_v1;
 pub mod meta_snapshot_v2;
 pub mod storage;
 
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::hash::Hasher;
 
 use itertools::Itertools;
+use risingwave_common::catalog::TableId;
 use risingwave_common::RW_VERSION;
 use risingwave_hummock_sdk::version::HummockVersion;
 use risingwave_hummock_sdk::{HummockSstableObjectId, HummockVersionId};
 use risingwave_pb::backup_service::{PbMetaSnapshotManifest, PbMetaSnapshotMetadata};
+use risingwave_pb::hummock::PbStateTableInfo;
 use serde::{Deserialize, Serialize};
 
 use crate::error::{BackupError, BackupResult};
@@ -58,6 +60,8 @@ pub struct MetaSnapshotMetadata {
     #[serde(default)]
     pub format_version: u32,
     pub remarks: Option<String>,
+    #[serde(with = "table_id_key_map")]
+    pub state_table_info: HashMap<TableId, PbStateTableInfo>,
     pub rw_version: Option<String>,
 }
@@ -76,6 +80,7 @@ impl MetaSnapshotMetadata {
             safe_epoch: v.visible_table_safe_epoch(),
             format_version,
             remarks,
+            state_table_info: v.state_table_info.info().clone(),
             rw_version: Some(RW_VERSION.to_owned()),
         }
     }
@@ -115,6 +120,11 @@ impl From<&MetaSnapshotMetadata> for PbMetaSnapshotMetadata {
             safe_epoch: m.safe_epoch,
             format_version: Some(m.format_version),
             remarks: m.remarks.clone(),
+            state_table_info: m
+                .state_table_info
+                .iter()
+                .map(|(t, i)| (t.table_id, i.clone()))
+                .collect(),
             rw_version: m.rw_version.clone(),
         }
     }
 }
@@ -128,3 +138,40 @@ impl From<&MetaSnapshotManifest> for PbMetaSnapshotManifest {
         }
     }
 }
+
+mod table_id_key_map {
+    use std::collections::HashMap;
+    use std::str::FromStr;
+
+    use risingwave_common::catalog::TableId;
+    use risingwave_pb::hummock::PbStateTableInfo;
+    use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+    pub fn serialize<S>(
+        map: &HashMap<TableId, PbStateTableInfo>,
+        serializer: S,
+    ) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let map_as_str: HashMap<String, &PbStateTableInfo> =
+            map.iter().map(|(k, v)| (k.to_string(), v)).collect();
+        map_as_str.serialize(serializer)
+    }
+
+    pub fn deserialize<'de, D>(
+        deserializer: D,
+    ) -> Result<HashMap<TableId, PbStateTableInfo>, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let map_as_str: HashMap<String, PbStateTableInfo> = HashMap::deserialize(deserializer)?;
+        map_as_str
+            .into_iter()
+            .map(|(k, v)| {
+                let key = u32::from_str(&k).map_err(serde::de::Error::custom)?;
+                Ok((TableId::new(key), v))
+            })
+            .collect()
+    }
+}
diff --git a/src/storage/src/hummock/backup_reader.rs b/src/storage/src/hummock/backup_reader.rs
index da1707db3ade1..3fa0748b21737 100644
--- a/src/storage/src/hummock/backup_reader.rs
+++ b/src/storage/src/hummock/backup_reader.rs
@@ -25,6 +25,7 @@ use risingwave_backup::error::BackupError;
 use risingwave_backup::meta_snapshot::{MetaSnapshot, Metadata};
 use risingwave_backup::storage::{MetaSnapshotStorage, ObjectStoreMetaSnapshotStorage};
 use risingwave_backup::{meta_snapshot_v1, meta_snapshot_v2, MetaSnapshotId};
+use risingwave_common::catalog::TableId;
 use risingwave_common::config::ObjectStoreConfig;
 use risingwave_common::system_param::local_manager::SystemParamsReaderRef;
 use
risingwave_common::system_param::reader::SystemParamsRead; @@ -182,6 +183,7 @@ impl BackupReader { /// Otherwise, reading the version may encounter object store error, due to SST absence. pub async fn try_get_hummock_version( self: &BackupReaderRef, + table_id: TableId, epoch: u64, ) -> StorageResult> { // Use the same store throughout the call. @@ -192,7 +194,15 @@ impl BackupReader { .manifest() .snapshot_metadata .iter() - .find(|v| epoch >= v.safe_epoch && epoch <= v.max_committed_epoch) + .find(|v| { + if v.state_table_info.is_empty() { + return epoch >= v.safe_epoch && epoch <= v.max_committed_epoch; + } + if let Some(m) = v.state_table_info.get(&table_id) { + return epoch >= m.safe_epoch && epoch <= m.committed_epoch; + } + false + }) .cloned() else { return Ok(None); diff --git a/src/storage/src/hummock/store/hummock_storage.rs b/src/storage/src/hummock/store/hummock_storage.rs index 2d89fdd401fa2..7f9f956b62ddf 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -311,7 +311,11 @@ impl HummockStorage { table_id: TableId, key_range: TableKeyRange, ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { - match self.backup_reader.try_get_hummock_version(epoch).await { + match self + .backup_reader + .try_get_hummock_version(table_id, epoch) + .await + { Ok(Some(backup_version)) => { validate_safe_epoch(backup_version.version(), table_id, epoch)?; From 2c6f7c3d420f5d7a4938d4bb78b33dd845e72989 Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 13 Jun 2024 16:14:47 +0800 Subject: [PATCH 12/19] fix(stream): fix temporal join append only property (#17239) --- .../tests/testdata/output/temporal_join.yaml | 6 +-- .../plan_node/stream_temporal_join.rs | 2 +- src/stream/src/executor/temporal_join.rs | 41 ++++++++++--------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml b/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml index aa8887e98bef6..5cdfdf6cf45ea 100644 --- a/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml +++ b/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml @@ -111,7 +111,7 @@ stream_plan: |- StreamMaterialize { columns: [k, x1, x2, a1, b1, stream._row_id(hidden), version2.k(hidden)], stream_key: [stream._row_id, k], pk_columns: [stream._row_id, k], pk_conflict: NoCheck } └─StreamExchange { dist: HashShard(stream.k, stream._row_id) } - └─StreamTemporalJoin { type: Inner, append_only: true, predicate: stream.k = version2.k, output: [stream.k, version1.x1, version2.x2, stream.a1, stream.b1, stream._row_id, version2.k] } + └─StreamTemporalJoin { type: Inner, append_only: false, predicate: stream.k = version2.k, output: [stream.k, version1.x1, version2.x2, stream.a1, stream.b1, stream._row_id, version2.k] } ├─StreamExchange { dist: HashShard(stream.k) } │ └─StreamTemporalJoin { type: Inner, append_only: false, predicate: stream.k = version1.k, output: [stream.k, stream.a1, stream.b1, version1.x1, stream._row_id, version1.k] } │ ├─StreamExchange { dist: HashShard(stream.k) } @@ -133,7 +133,7 @@ stream_plan: |- StreamMaterialize { columns: [id1, x1, id2, x2, a1, b1, stream._row_id(hidden), version2.id2(hidden)], stream_key: [stream._row_id, id1, id2], pk_columns: [stream._row_id, id1, id2], pk_conflict: NoCheck } └─StreamExchange { dist: HashShard(stream.id1, stream.id2, stream._row_id) } - └─StreamTemporalJoin { type: Inner, append_only: true, predicate: stream.id2 = version2.id2, 
output: [stream.id1, version1.x1, stream.id2, version2.x2, stream.a1, stream.b1, stream._row_id, version2.id2] }
       ├─StreamExchange { dist: HashShard(stream.id2) }
       │ └─StreamTemporalJoin { type: Inner, append_only: false, predicate: stream.id1 = version1.id1, output: [stream.id1, stream.id2, stream.a1, stream.b1, version1.x1, stream._row_id, version1.id1] }
       │ ├─StreamExchange { dist: HashShard(stream.id1) }
@@ -155,7 +155,7 @@
   stream_plan: |-
     StreamMaterialize { columns: [id1, x1, id2, x2, a1, b1, stream._row_id(hidden), version2.id2(hidden)], stream_key: [stream._row_id, id1, id2], pk_columns: [stream._row_id, id1, id2], pk_conflict: NoCheck }
     └─StreamExchange { dist: HashShard(stream.id1, stream.id2, stream._row_id) }
-      └─StreamTemporalJoin { type: Inner, append_only: true, predicate: stream.id2 = version2.id2, output: [stream.id1, version1.x1, stream.id2, version2.x2, stream.a1, stream.b1, stream._row_id, version2.id2] }
+      └─StreamTemporalJoin { type: Inner, append_only: false, predicate: stream.id2 = version2.id2, output: [stream.id1, version1.x1, stream.id2, version2.x2, stream.a1, stream.b1, stream._row_id, version2.id2] }
       ├─StreamExchange { dist: HashShard(stream.id2) }
       │ └─StreamTemporalJoin { type: Inner, append_only: false, predicate: stream.id1 = version1.id1, output: [stream.id1, stream.id2, stream.a1, stream.b1, version1.x1, stream._row_id, version1.id1] }
       │ ├─StreamExchange { dist: HashShard(stream.id1) }
diff --git a/src/frontend/src/optimizer/plan_node/stream_temporal_join.rs b/src/frontend/src/optimizer/plan_node/stream_temporal_join.rs
index aa090143b925b..f94dbba36cb79 100644
--- a/src/frontend/src/optimizer/plan_node/stream_temporal_join.rs
+++ b/src/frontend/src/optimizer/plan_node/stream_temporal_join.rs
@@ -72,7 +72,7 @@ impl StreamTemporalJoin {
         let base = PlanBase::new_stream_with_core(
             &core,
             dist,
-            true,
+            append_only,
             false, // TODO(rc): derive EOWC property from input
             watermark_columns,
         );
diff --git a/src/stream/src/executor/temporal_join.rs b/src/stream/src/executor/temporal_join.rs
index 077ea17ca74d9..ce0c4d29621d6 100644
--- a/src/stream/src/executor/temporal_join.rs
+++ b/src/stream/src/executor/temporal_join.rs
@@ -571,8 +571,8 @@ mod phase1 {
     }
 }
 
-impl<K: HashKey, S: StateStore, const T: JoinTypePrimitive, const A: bool>
-    TemporalJoinExecutor<K, S, T, A>
+impl<K: HashKey, S: StateStore, const T: JoinTypePrimitive, const APPEND_ONLY: bool>
+    TemporalJoinExecutor<K, S, T, APPEND_ONLY>
 {
     #[allow(clippy::too_many_arguments)]
     pub fn new(
@@ -698,7 +698,7 @@ impl<K: HashKey, S: StateStore, const T: JoinTypePrimitive, const A: bool>
             let full_schema = full_schema.clone();
 
             if T == JoinType::Inner {
-                let st1 = phase1::handle_chunk::<K, S, phase1::Inner, A>(
+                let st1 = phase1::handle_chunk::<K, S, phase1::Inner, APPEND_ONLY>(
                     self.chunk_size,
                     right_size,
                     full_schema,
@@ -731,19 +731,20 @@ impl<K: HashKey, S: StateStore, const T: JoinTypePrimitive, const A: bool>
                 }
             } else if let Some(ref cond) = self.condition {
                 // Joined result without evaluating non-lookup conditions.
-                let st1 = phase1::handle_chunk::<K, S, phase1::LeftOuterWithCond, A>(
-                    self.chunk_size,
-                    right_size,
-                    full_schema,
-                    epoch,
-                    &self.left_join_keys,
-                    &mut self.right_table,
-                    &memo_table_lookup_prefix,
-                    &mut self.memo_table,
-                    &null_matched,
-                    chunk,
-                    &self.metrics,
-                );
+                let st1 =
+                    phase1::handle_chunk::<K, S, phase1::LeftOuterWithCond, APPEND_ONLY>(
+                        self.chunk_size,
+                        right_size,
+                        full_schema,
+                        epoch,
+                        &self.left_join_keys,
+                        &mut self.right_table,
+                        &memo_table_lookup_prefix,
+                        &mut self.memo_table,
+                        &null_matched,
+                        chunk,
+                        &self.metrics,
+                    );
                 let mut matched_count = 0usize;
                 #[for_await]
                 for chunk in st1 {
@@ -783,7 +784,7 @@
                 // The last row should always be marker row,
                 assert_eq!(matched_count, 0);
             } else {
-                let st1 = phase1::handle_chunk::<K, S, phase1::LeftOuter, A>(
+                let st1 = phase1::handle_chunk::<K, S, phase1::LeftOuter, APPEND_ONLY>(
                     self.chunk_size,
                     right_size,
                     full_schema,
@@ -805,7 +806,7 @@
                 }
             }
             InternalMessage::Barrier(updates, barrier) => {
-                if !A {
+                if !APPEND_ONLY {
                     if wait_first_barrier {
                         wait_first_barrier = false;
                         self.memo_table.as_mut().unwrap().init_epoch(barrier.epoch);
@@ -837,8 +838,8 @@
     }
 }
 
-impl<K: HashKey, S: StateStore, const T: JoinTypePrimitive, const A: bool> Execute
-    for TemporalJoinExecutor<K, S, T, A>
+impl<K: HashKey, S: StateStore, const T: JoinTypePrimitive, const APPEND_ONLY: bool> Execute
+    for TemporalJoinExecutor<K, S, T, APPEND_ONLY>
 {
     fn execute(self: Box<Self>) -> super::BoxedMessageStream {
         self.into_stream().boxed()

From a3da000942cc1616d12c5463106baee7beacbf38 Mon Sep 17 00:00:00 2001
From: Richard Chien 
Date: Thu, 13 Jun 2024 16:35:59 +0800
Subject: [PATCH 13/19] feat(over window): WindowBuffer support for session
 window (#17067)

Signed-off-by: Richard Chien 
---
 proto/expr.proto                              |   9 +
 src/expr/core/src/window_function/call.rs     |  22 +-
 src/expr/core/src/window_function/mod.rs      |   2 +
 src/expr/core/src/window_function/session.rs  | 208 +++++++++++
 .../impl/src/window_function/aggregate.rs     |  13 +-
 src/expr/impl/src/window_function/buffer.rs   | 342 +++++++++++++++++-
 6 files changed, 582 insertions(+), 14 deletions(-)
 create mode 100644 src/expr/core/src/window_function/session.rs

diff --git a/proto/expr.proto b/proto/expr.proto
index 9abb1d74f4955..602c712975ecd 100644
--- a/proto/expr.proto
+++ b/proto/expr.proto
@@ -452,6 +452,7 @@ message WindowFrame {
 
     TYPE_ROWS = 5;
     TYPE_RANGE = 10;
+    TYPE_SESSION = 15;
   }
   enum BoundType {
     BOUND_TYPE_UNSPECIFIED = 0;
@@ -497,6 +498,13 @@ message WindowFrame {
     BoundType type = 1;
     optional data.Datum offset = 3;
   }
+  message SessionFrameBounds {
+    data.Datum gap = 1;
+
+    data.DataType order_data_type = 10;
+    common.OrderType order_type = 15;
+    data.DataType gap_data_type = 20;
+  }
 
   Type type = 1;
@@ -508,6 +516,7 @@ message WindowFrame {
   oneof bounds {
     RowsFrameBounds rows = 10;
     RangeFrameBounds range = 15;
+    SessionFrameBounds session = 20;
   }
 }
 
diff --git a/src/expr/core/src/window_function/call.rs b/src/expr/core/src/window_function/call.rs
index bf3178907ab7b..5c6f40ff25bc0 100644
--- a/src/expr/core/src/window_function/call.rs
+++ b/src/expr/core/src/window_function/call.rs
@@ -22,7 +22,9 @@ use risingwave_pb::expr::window_frame::{PbBounds, PbExclusion};
 use risingwave_pb::expr::{PbWindowFrame, PbWindowFunction};
 use FrameBound::{CurrentRow, Following, Preceding, UnboundedFollowing, UnboundedPreceding};
 
-use super::{RangeFrameBounds, RowsFrameBound, RowsFrameBounds, WindowFuncKind};
+use super::{
+    RangeFrameBounds, RowsFrameBound, RowsFrameBounds, SessionFrameBounds, WindowFuncKind,
+};
 use crate::aggregate::AggArgs;
 use crate::Result;
 
@@ -100,6 +102,10 @@ impl Frame {
                 let bounds = must_match!(frame.get_bounds()?, PbBounds::Range(bounds) => bounds);
                 FrameBounds::Range(RangeFrameBounds::from_protobuf(bounds)?)
} + PbType::Session => { + let bounds = must_match!(frame.get_bounds()?, PbBounds::Session(bounds) => bounds); + FrameBounds::Session(SessionFrameBounds::from_protobuf(bounds)?) + } }; let exclusion = FrameExclusion::from_protobuf(frame.get_exclusion()?)?; Ok(Self { bounds, exclusion }) @@ -108,8 +114,8 @@ impl Frame { pub fn to_protobuf(&self) -> PbWindowFrame { use risingwave_pb::expr::window_frame::PbType; let exclusion = self.exclusion.to_protobuf() as _; + #[expect(deprecated)] // because of `start` and `end` fields match &self.bounds { - #[expect(deprecated)] FrameBounds::Rows(bounds) => PbWindowFrame { r#type: PbType::Rows as _, start: None, // deprecated @@ -117,7 +123,6 @@ impl Frame { exclusion, bounds: Some(PbBounds::Rows(bounds.to_protobuf())), }, - #[expect(deprecated)] FrameBounds::Range(bounds) => PbWindowFrame { r#type: PbType::Range as _, start: None, // deprecated @@ -125,6 +130,13 @@ impl Frame { exclusion, bounds: Some(PbBounds::Range(bounds.to_protobuf())), }, + FrameBounds::Session(bounds) => PbWindowFrame { + r#type: PbType::Session as _, + start: None, // deprecated + end: None, // deprecated + exclusion, + bounds: Some(PbBounds::Session(bounds.to_protobuf())), + }, } } } @@ -135,6 +147,7 @@ pub enum FrameBounds { Rows(RowsFrameBounds), // Groups(GroupsFrameBounds), Range(RangeFrameBounds), + Session(SessionFrameBounds), } impl FrameBounds { @@ -142,6 +155,7 @@ impl FrameBounds { match self { Self::Rows(bounds) => bounds.validate(), Self::Range(bounds) => bounds.validate(), + Self::Session(bounds) => bounds.validate(), } } @@ -149,6 +163,7 @@ impl FrameBounds { match self { Self::Rows(RowsFrameBounds { start, .. }) => start.is_unbounded_preceding(), Self::Range(RangeFrameBounds { start, .. }) => start.is_unbounded_preceding(), + Self::Session(_) => false, } } @@ -156,6 +171,7 @@ impl FrameBounds { match self { Self::Rows(RowsFrameBounds { end, .. }) => end.is_unbounded_following(), Self::Range(RangeFrameBounds { end, .. }) => end.is_unbounded_following(), + Self::Session(_) => false, } } diff --git a/src/expr/core/src/window_function/mod.rs b/src/expr/core/src/window_function/mod.rs index 5f747d2296729..6bbde8c8755e6 100644 --- a/src/expr/core/src/window_function/mod.rs +++ b/src/expr/core/src/window_function/mod.rs @@ -21,6 +21,8 @@ mod rows; pub use rows::*; mod range; pub use range::*; +mod session; +pub use session::*; mod state; pub use state::*; diff --git a/src/expr/core/src/window_function/session.rs b/src/expr/core/src/window_function/session.rs new file mode 100644 index 0000000000000..81a77058759bf --- /dev/null +++ b/src/expr/core/src/window_function/session.rs @@ -0,0 +1,208 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
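Before the module body, a note on what a session frame computes. The sketch below is illustrative only and not part of the patch: it assumes plain `i64` order values and gap, whereas the real implementation that follows works on memcmp-encoded datums and builds an `order + gap` expression to compute the "minimal next start" of a session.

```rust
// Illustrative sketch of SESSION frame grouping (assumed simplification:
// i64 order values and gap; the real code is type-generic).
fn sessionize(order_values: &[i64], gap: i64) -> Vec<Vec<i64>> {
    let mut sessions: Vec<Vec<i64>> = Vec::new();
    // Any order value opens a session when none is open yet.
    let mut minimal_next_start = i64::MIN;
    for &v in order_values {
        if v >= minimal_next_start {
            // This row is too far from the previous one: open a new session.
            sessions.push(Vec::new());
        }
        sessions.last_mut().unwrap().push(v);
        // The open session now reaches up to (but not including) v + gap.
        minimal_next_start = v + gap;
    }
    sessions
}

// With gap = 5, [1, 3, 8, 15, 16] groups as [1, 3] | [8] | [15, 16],
// matching the `test_session_frame` unit test added later in this patch.
```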
+ +use std::fmt::Display; +use std::ops::Deref; +use std::sync::Arc; + +use anyhow::Context; +use educe::Educe; +use futures::FutureExt; +use risingwave_common::bail; +use risingwave_common::row::OwnedRow; +use risingwave_common::types::{ + DataType, Datum, IsNegative, ScalarImpl, ScalarRefImpl, ToOwnedDatum, ToText, +}; +use risingwave_common::util::sort_util::OrderType; +use risingwave_common::util::value_encoding::{DatumFromProtoExt, DatumToProtoExt}; +use risingwave_pb::expr::window_frame::PbSessionFrameBounds; + +use super::FrameBoundsImpl; +use crate::expr::{ + build_func, BoxedExpression, Expression, ExpressionBoxExt, InputRefExpression, + LiteralExpression, +}; +use crate::Result; + +/// To implement Session Window in a similar way to Range Frame, we define a similar frame bounds +/// structure here. It's very like [`RangeFrameBounds`](super::RangeFrameBounds), but with a gap +/// instead of start & end offset. +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct SessionFrameBounds { + pub order_data_type: DataType, + pub order_type: OrderType, + pub gap_data_type: DataType, + pub gap: SessionFrameGap, +} + +impl SessionFrameBounds { + pub(super) fn from_protobuf(bounds: &PbSessionFrameBounds) -> Result { + let order_data_type = DataType::from(bounds.get_order_data_type()?); + let order_type = OrderType::from_protobuf(bounds.get_order_type()?); + let gap_data_type = DataType::from(bounds.get_gap_data_type()?); + let gap_value = Datum::from_protobuf(bounds.get_gap()?, &gap_data_type) + .context("gap `Datum` is not decodable")? + .context("gap of session frame must be non-NULL")?; + let mut gap = SessionFrameGap::new(gap_value); + gap.prepare(&order_data_type, &gap_data_type)?; + Ok(Self { + order_data_type, + order_type, + gap_data_type, + gap, + }) + } + + pub(super) fn to_protobuf(&self) -> PbSessionFrameBounds { + PbSessionFrameBounds { + gap: Some(Some(self.gap.as_scalar_ref_impl()).to_protobuf()), + order_data_type: Some(self.order_data_type.to_protobuf()), + order_type: Some(self.order_type.to_protobuf()), + gap_data_type: Some(self.gap_data_type.to_protobuf()), + } + } +} + +impl Display for SessionFrameBounds { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "SESSION WITH GAP {}", + self.gap.as_scalar_ref_impl().to_text() + ) + } +} + +impl FrameBoundsImpl for SessionFrameBounds { + fn validate(&self) -> Result<()> { + // TODO(rc): maybe can merge with `RangeFrameBounds::validate` + + fn validate_non_negative(val: impl IsNegative + Display) -> Result<()> { + if val.is_negative() { + bail!("session gap should be non-negative, but {} is given", val); + } + Ok(()) + } + + match self.gap.as_scalar_ref_impl() { + ScalarRefImpl::Int16(val) => validate_non_negative(val)?, + ScalarRefImpl::Int32(val) => validate_non_negative(val)?, + ScalarRefImpl::Int64(val) => validate_non_negative(val)?, + ScalarRefImpl::Float32(val) => validate_non_negative(val)?, + ScalarRefImpl::Float64(val) => validate_non_negative(val)?, + ScalarRefImpl::Decimal(val) => validate_non_negative(val)?, + ScalarRefImpl::Interval(val) => { + if !val.is_never_negative() { + bail!( + "for session gap of type `interval`, each field should be non-negative, but {} is given", + val + ); + } + if matches!(self.order_data_type, DataType::Timestamptz) { + // for `timestamptz`, we only support gap without `month` and `day` fields + if val.months() != 0 || val.days() != 0 { + bail!( + "for session order column of type `timestamptz`, gap should not have non-zero `month` and `day`", 
+ ); + } + } + } + _ => unreachable!( + "other order column data types are not supported and should be banned in frontend" + ), + } + Ok(()) + } +} + +impl SessionFrameBounds { + pub fn minimal_next_start_of(&self, end_order_value: impl ToOwnedDatum) -> Datum { + self.gap.for_calc().minimal_next_start_of(end_order_value) + } +} + +/// The wrapper type for [`ScalarImpl`] session gap, containing an expression to help adding the gap +/// to a given value. +#[derive(Debug, Clone, Educe)] +#[educe(PartialEq, Eq, Hash)] +pub struct SessionFrameGap { + /// The original gap value. + gap: ScalarImpl, + /// Built expression for `$0 + gap`. + #[educe(PartialEq(ignore), Hash(ignore))] + add_expr: Option>, +} + +impl Deref for SessionFrameGap { + type Target = ScalarImpl; + + fn deref(&self) -> &Self::Target { + &self.gap + } +} + +impl SessionFrameGap { + pub fn new(gap: ScalarImpl) -> Self { + Self { + gap, + add_expr: None, + } + } + + fn prepare(&mut self, order_data_type: &DataType, gap_data_type: &DataType) -> Result<()> { + use risingwave_pb::expr::expr_node::PbType as PbExprType; + + let input_expr = InputRefExpression::new(order_data_type.clone(), 0); + let gap_expr = LiteralExpression::new(gap_data_type.clone(), Some(self.gap.clone())); + self.add_expr = Some(Arc::new(build_func( + PbExprType::Add, + order_data_type.clone(), + vec![input_expr.clone().boxed(), gap_expr.clone().boxed()], + )?)); + Ok(()) + } + + pub fn new_for_test( + gap: ScalarImpl, + order_data_type: &DataType, + gap_data_type: &DataType, + ) -> Self { + let mut gap = Self::new(gap); + gap.prepare(order_data_type, gap_data_type).unwrap(); + gap + } + + fn for_calc(&self) -> SessionFrameGapRef<'_> { + SessionFrameGapRef { + add_expr: self.add_expr.as_ref().unwrap().as_ref(), + } + } +} + +#[derive(Debug, Educe)] +#[educe(Clone, Copy)] +struct SessionFrameGapRef<'a> { + add_expr: &'a dyn Expression, +} + +impl<'a> SessionFrameGapRef<'a> { + fn minimal_next_start_of(&self, end_order_value: impl ToOwnedDatum) -> Datum { + let row = OwnedRow::new(vec![end_order_value.to_owned_datum()]); + self.add_expr + .eval_row(&row) + .now_or_never() + .expect("frame bound calculation should finish immediately") + .expect("just simple calculation, should succeed") // TODO(rc): handle overflow + } +} diff --git a/src/expr/impl/src/window_function/aggregate.rs b/src/expr/impl/src/window_function/aggregate.rs index 7710a1b7adb2d..9942581357f77 100644 --- a/src/expr/impl/src/window_function/aggregate.rs +++ b/src/expr/impl/src/window_function/aggregate.rs @@ -31,7 +31,7 @@ use risingwave_expr::window_function::{ use risingwave_expr::Result; use smallvec::SmallVec; -use super::buffer::{RangeWindow, RowsWindow, WindowBuffer, WindowImpl}; +use super::buffer::{RangeWindow, RowsWindow, SessionWindow, WindowBuffer, WindowImpl}; type StateValue = SmallVec<[Datum; 2]>; @@ -99,6 +99,17 @@ pub(super) fn new(call: &WindowFuncCall) -> Result { ), buffer_heap_size: KvSize::new(), }) as BoxedWindowState, + FrameBounds::Session(frame_bounds) => Box::new(AggregateState { + agg_func, + agg_impl, + arg_data_types, + buffer: WindowBuffer::>::new( + SessionWindow::new(frame_bounds.clone()), + call.frame.exclusion, + enable_delta, + ), + buffer_heap_size: KvSize::new(), + }) as BoxedWindowState, }; Ok(this) } diff --git a/src/expr/impl/src/window_function/buffer.rs b/src/expr/impl/src/window_function/buffer.rs index 6c6277c20ee63..bd1c10d162b23 100644 --- a/src/expr/impl/src/window_function/buffer.rs +++ b/src/expr/impl/src/window_function/buffer.rs @@ -18,9 +18,9 @@ 
use std::ops::Range; use educe::Educe; use risingwave_common::array::Op; use risingwave_common::types::Sentinelled; -use risingwave_common::util::memcmp_encoding; +use risingwave_common::util::memcmp_encoding::{self, MemcmpEncoded}; use risingwave_expr::window_function::{ - FrameExclusion, RangeFrameBounds, RowsFrameBounds, StateKey, + FrameExclusion, RangeFrameBounds, RowsFrameBounds, SessionFrameBounds, StateKey, }; use super::range_utils::{range_diff, range_except}; @@ -148,7 +148,7 @@ impl WindowBuffer { let old_outer = self.curr_window_outer(); self.buffer.push_back(Entry { key, value }); - self.recalculate_left_right(); + self.recalculate_left_right(RecalculateHint::Append); if self.curr_delta.is_some() { self.maintain_delta(old_outer, self.curr_window_outer()); @@ -161,7 +161,7 @@ impl WindowBuffer { let old_outer = self.curr_window_outer(); self.curr_idx += 1; - self.recalculate_left_right(); + self.recalculate_left_right(RecalculateHint::Slide); if self.curr_delta.is_some() { self.maintain_delta(old_outer, self.curr_window_outer()); @@ -171,6 +171,9 @@ impl WindowBuffer { self.curr_idx -= min_needed_idx; self.left_idx -= min_needed_idx; self.right_excl_idx -= min_needed_idx; + + self.window_impl.shift_indices(min_needed_idx); + self.buffer .drain(0..min_needed_idx) .map(|Entry { key, value }| (key, value)) @@ -189,14 +192,14 @@ impl WindowBuffer { } } - fn recalculate_left_right(&mut self) { + fn recalculate_left_right(&mut self, hint: RecalculateHint) { let window = BufferRefMut { buffer: &self.buffer, curr_idx: &mut self.curr_idx, left_idx: &mut self.left_idx, right_excl_idx: &mut self.right_excl_idx, }; - self.window_impl.recalculate_left_right(window); + self.window_impl.recalculate_left_right(window, hint); } } @@ -218,6 +221,12 @@ pub(super) struct BufferRefMut<'a, K: Ord, V: Clone> { right_excl_idx: &'a mut usize, } +#[derive(Clone, Copy, PartialEq, Eq)] +pub(super) enum RecalculateHint { + Append, + Slide, +} + /// A trait for sliding window implementations. This trait is used by [`WindowBuffer`] to /// determine the status of current window and how to slide the window. pub(super) trait WindowImpl { @@ -233,8 +242,16 @@ pub(super) trait WindowImpl { fn following_saturated(&self, window: BufferRef<'_, Self::Key, Self::Value>) -> bool; /// Recalculate the left and right indices of the current window, according to the latest - /// `curr_idx`. The indices are indices in the buffer vector. - fn recalculate_left_right(&self, window: BufferRefMut<'_, Self::Key, Self::Value>); + /// `curr_idx` and the hint. + fn recalculate_left_right( + &mut self, + window: BufferRefMut<'_, Self::Key, Self::Value>, + hint: RecalculateHint, + ); + + /// Shift the indices stored by the [`WindowImpl`] by `n` positions. This should be called + /// after evicting rows from the buffer. + fn shift_indices(&mut self, n: usize); } /// The sliding window implementation for `ROWS` frames. @@ -301,7 +318,11 @@ impl WindowImpl for RowsWindow { } } - fn recalculate_left_right(&self, window: BufferRefMut<'_, Self::Key, Self::Value>) { + fn recalculate_left_right( + &mut self, + window: BufferRefMut<'_, Self::Key, Self::Value>, + _hint: RecalculateHint, + ) { if window.buffer.is_empty() { *window.left_idx = 0; *window.right_excl_idx = 0; @@ -333,6 +354,8 @@ impl WindowImpl for RowsWindow { *window.right_excl_idx = window.buffer.len(); } } + + fn shift_indices(&mut self, _n: usize) {} } /// The sliding window implementation for `RANGE` frames. 
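Two trait-level changes in the hunks around here are worth pausing on: `recalculate_left_right` now takes `&mut self` plus a `RecalculateHint`, because a session window recognizes sessions while appending and consumes them while sliding, and the new `shift_indices` callback keeps any cached buffer positions valid across eviction. A hedged sketch of that second contract, with invented names:

```rust
// Sketch of the shift_indices contract (names invented for illustration).
// An implementation may remember absolute positions into the buffer, e.g.
// where the latest session starts. When the buffer evicts `n` front
// entries, every remembered position must shift left by `n`.
use std::collections::VecDeque;

struct SessionTracker {
    latest_session_start: Option<usize>,
}

impl SessionTracker {
    fn shift_indices(&mut self, n: usize) {
        if let Some(start) = self.latest_session_start.as_mut() {
            // The caller never evicts past a remembered position,
            // so this subtraction cannot underflow.
            *start -= n;
        }
    }
}

fn evict_front(buffer: &mut VecDeque<&str>, tracker: &mut SessionTracker, n: usize) {
    buffer.drain(0..n);
    tracker.shift_indices(n); // keep cached indices in sync with the buffer
}
```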
@@ -377,7 +400,11 @@ impl<V: Clone> WindowImpl for RangeWindow<V> {
     }
 }
 
-    fn recalculate_left_right(&self, window: BufferRefMut<'_, Self::Key, Self::Value>) {
+    fn recalculate_left_right(
+        &mut self,
+        window: BufferRefMut<'_, Self::Key, Self::Value>,
+        _hint: RecalculateHint,
+    ) {
         if window.buffer.is_empty() {
             *window.left_idx = 0;
             *window.right_excl_idx = 0;
@@ -433,14 +460,199 @@ impl<V: Clone> WindowImpl for RangeWindow<V> {
             Sentinelled::Smallest => unreachable!("frame end never be UNBOUNDED PRECEDING"),
         }
     }
+
+    fn shift_indices(&mut self, _n: usize) {}
+}
+
+pub(super) struct SessionWindow<V> {
+    frame_bounds: SessionFrameBounds,
+    /// The latest session is the rightmost session in the buffer, which is updated during appending.
+    latest_session: Option<LatestSession>,
+    /// The sizes of recognized but not consumed sessions in the buffer. It's updated during appending.
+    /// The first element, if any, should be the size of the "current session window". When sliding,
+    /// the front should be popped.
+    recognized_session_sizes: VecDeque<usize>,
+    _phantom: std::marker::PhantomData<V>,
+}
+
+#[derive(Debug)]
+struct LatestSession {
+    /// The starting index of the latest session.
+    start_idx: usize,
+
+    /// Minimal next start means the minimal order value that can start a new session.
+    /// If a row has an order value less than this, it should be in the current session.
+    minimal_next_start: MemcmpEncoded,
+}
+
+impl<V> SessionWindow<V> {
+    pub fn new(frame_bounds: SessionFrameBounds) -> Self {
+        Self {
+            frame_bounds,
+            latest_session: None,
+            recognized_session_sizes: Default::default(),
+            _phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<V: Clone> WindowImpl for SessionWindow<V> {
+    type Key = StateKey;
+    type Value = V;
+
+    fn preceding_saturated(&self, window: BufferRef<'_, Self::Key, Self::Value>) -> bool {
+        window.curr_idx < window.buffer.len() && {
+            // XXX(rc): It seems that preceding saturation is not important, may remove later.
+            true
+        }
+    }
+
+    fn following_saturated(&self, window: BufferRef<'_, Self::Key, Self::Value>) -> bool {
+        window.curr_idx < window.buffer.len() && {
+            // For session window, `left_idx` is always smaller than `right_excl_idx`.
+            assert!(window.left_idx <= window.curr_idx);
+            assert!(window.curr_idx < window.right_excl_idx);
+
+            // The following expression checks whether the current window is the latest session.
+            // If it is, we don't think it's saturated because the next row may be still in the
+            // same session. Otherwise, we can safely say it's saturated.
+            self.latest_session
+                .as_ref()
+                .map_or(false, |LatestSession { start_idx, .. }| {
+                    window.left_idx < *start_idx
+                })
+        }
+    }
+
+    fn recalculate_left_right(
+        &mut self,
+        window: BufferRefMut<'_, Self::Key, Self::Value>,
+        hint: RecalculateHint,
+    ) {
+        // Terms:
+        // - Session: A continuous range of rows among any two of which the difference of order values
+        //   is less than the session gap. This is a concept on the whole stream. Sessions are recognized
+        //   during appending.
+        // - Current window: The range of rows that are represented by the indices in `window`. It is a
+        //   status of the `WindowBuffer`. If the current window happens to be the last session in the
+        //   buffer, it will be updated during appending. Otherwise it will only be updated during sliding.
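To make the terms above concrete, here is a hedged model of the bookkeeping, not the real struct: it simplifies order values and gap to `i64`, with a `(start_idx, minimal_next_start)` tuple standing in for `LatestSession` and a plain queue for `recognized_session_sizes`.

```rust
// Replay appends for order values [1, 3, 8, 15] with gap 5 and inspect the
// two pieces of state described in the comment above.
use std::collections::VecDeque;

fn replay(orders: &[i64], gap: i64) -> (VecDeque<usize>, Option<(usize, i64)>) {
    let mut recognized_session_sizes = VecDeque::new();
    let mut latest_session: Option<(usize, i64)> = None;
    for (idx, &v) in orders.iter().enumerate() {
        let starts_new_session = match &latest_session {
            Some((_, minimal_next_start)) => v >= *minimal_next_start,
            None => true,
        };
        if starts_new_session {
            if let Some((start_idx, _)) = latest_session {
                // Seal the previous session: its rows span [start_idx, idx).
                recognized_session_sizes.push_back(idx - start_idx);
            }
            latest_session = Some((idx, v + gap));
        } else if let Some((_, minimal_next_start)) = latest_session.as_mut() {
            // The row joins the latest session and extends its reach.
            *minimal_next_start = v + gap;
        }
    }
    (recognized_session_sizes, latest_session)
}

// replay(&[1, 3, 8, 15], 5) yields sizes [2, 1] (sessions [1, 3] and [8]
// are sealed) and latest_session (3, 20): row 15 still absorbs values < 20.
```

Sliding then only consumes the front of the sealed-size queue; the open session at the tail is never handed out until a later append seals it, which is exactly why `following_saturated` above reports false while the current window is still the latest session.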
+ + match hint { + RecalculateHint::Append => { + assert!(!window.buffer.is_empty()); // because we just appended a row + let appended_idx = window.buffer.len() - 1; + let appended_key = &window.buffer[appended_idx].key; + + let minimal_next_start_of_appended = self.frame_bounds.minimal_next_start_of( + memcmp_encoding::decode_value( + &self.frame_bounds.order_data_type, + &appended_key.order_key, + self.frame_bounds.order_type, + ) + .expect("no reason to fail here because we just encoded it in memory"), + ); + let minimal_next_start_enc_of_appended = memcmp_encoding::encode_value( + minimal_next_start_of_appended, + self.frame_bounds.order_type, + ) + .expect("no reason to fail here"); + + if let Some(LatestSession { + ref start_idx, + minimal_next_start, + }) = self.latest_session.as_mut() + { + if &appended_key.order_key >= minimal_next_start { + // the appended row starts a new session + self.recognized_session_sizes + .push_back(appended_idx - start_idx); + self.latest_session = Some(LatestSession { + start_idx: appended_idx, + minimal_next_start: minimal_next_start_enc_of_appended, + }); + // no need to update the current window because it's now corresponding + // to some previous session + } else { + // the appended row belongs to the latest session + *minimal_next_start = minimal_next_start_enc_of_appended; + + if *start_idx == *window.left_idx { + // the current window is the latest session, we should extend it + *window.right_excl_idx = appended_idx + 1; + } + } + } else { + // no session yet, the current window should be empty + let left_idx = *window.left_idx; + let curr_idx = *window.curr_idx; + let old_right_excl_idx = *window.right_excl_idx; + assert_eq!(left_idx, curr_idx); + assert_eq!(left_idx, old_right_excl_idx); + assert_eq!(old_right_excl_idx, window.buffer.len() - 1); + + // now we put the first row into the current window + *window.right_excl_idx = window.buffer.len(); + + // and start to recognize the latest session + self.latest_session = Some(LatestSession { + start_idx: left_idx, + minimal_next_start: minimal_next_start_enc_of_appended, + }); + } + } + RecalculateHint::Slide => { + let old_left_idx = *window.left_idx; + let new_curr_idx = *window.curr_idx; + let old_right_excl_idx = *window.right_excl_idx; + + if new_curr_idx < old_right_excl_idx { + // the current row is still in the current session window, no need to slide + } else { + let old_session_size = self.recognized_session_sizes.pop_front(); + let next_session_size = self.recognized_session_sizes.front().copied(); + + if let Some(old_session_size) = old_session_size { + assert_eq!(old_session_size, old_right_excl_idx - old_left_idx); + + // slide the window to the next session + if let Some(next_session_size) = next_session_size { + // the next session is fully recognized, so we know the ending index + *window.left_idx = old_right_excl_idx; + *window.right_excl_idx = old_right_excl_idx + next_session_size; + } else { + // the next session is still in recognition, so we end the window at the end of buffer + *window.left_idx = old_right_excl_idx; + *window.right_excl_idx = window.buffer.len(); + } + } else { + // no recognized session yet, meaning the current window is the last session in the buffer + assert_eq!(old_right_excl_idx, window.buffer.len()); + *window.left_idx = old_right_excl_idx; + *window.right_excl_idx = old_right_excl_idx; + self.latest_session = None; + } + } + } + } + } + + fn shift_indices(&mut self, n: usize) { + if let Some(LatestSession { start_idx, .. 
}) = self.latest_session.as_mut() { + *start_idx -= n; + } + } } #[cfg(test)] mod tests { use itertools::Itertools; + use risingwave_common::row::OwnedRow; + use risingwave_common::types::{DataType, ScalarImpl}; + use risingwave_common::util::sort_util::OrderType; use risingwave_expr::window_function::FrameBound::{ CurrentRow, Following, Preceding, UnboundedFollowing, UnboundedPreceding, }; + use risingwave_expr::window_function::SessionFrameGap; use super::*; @@ -734,4 +946,114 @@ mod tests { vec!["hello"] ); } + + #[test] + fn test_session_frame() { + let order_data_type = DataType::Int64; + let order_type = OrderType::ascending(); + let gap_data_type = DataType::Int64; + + let mut buffer = WindowBuffer::>::new( + SessionWindow::new(SessionFrameBounds { + order_data_type: order_data_type.clone(), + order_type, + gap_data_type: gap_data_type.clone(), + gap: SessionFrameGap::new_for_test( + ScalarImpl::Int64(5), + &order_data_type, + &gap_data_type, + ), + }), + FrameExclusion::NoOthers, + true, + ); + + let key = |key: i64| -> StateKey { + StateKey { + order_key: memcmp_encoding::encode_value(&Some(ScalarImpl::from(key)), order_type) + .unwrap(), + pk: OwnedRow::empty().into(), + } + }; + + assert!(buffer.curr_key().is_none()); + + buffer.append(key(1), "hello"); + buffer.append(key(3), "session"); + let window = buffer.curr_window(); + assert_eq!(window.key, Some(&key(1))); + assert!(window.preceding_saturated); + assert!(!window.following_saturated); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["hello", "session"] + ); + + buffer.append(key(8), "window"); // start a new session + let window = buffer.curr_window(); + assert!(window.following_saturated); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["hello", "session"] + ); + + buffer.append(key(15), "and"); + buffer.append(key(16), "world"); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["hello", "session"] + ); + + let removed_keys = buffer.slide().map(|(k, _)| k).collect_vec(); + assert!(removed_keys.is_empty()); + let window = buffer.curr_window(); + assert_eq!(window.key, Some(&key(3))); + assert!(window.preceding_saturated); + assert!(window.following_saturated); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["hello", "session"] + ); + + let removed_keys = buffer.slide().map(|(k, _)| k).collect_vec(); + assert_eq!(removed_keys, vec![key(1), key(3)]); + assert_eq!(buffer.smallest_key(), Some(&key(8))); + let window = buffer.curr_window(); + assert_eq!(window.key, Some(&key(8))); + assert!(window.preceding_saturated); + assert!(window.following_saturated); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["window"] + ); + + let removed_keys = buffer.slide().map(|(k, _)| k).collect_vec(); + assert_eq!(removed_keys, vec![key(8)]); + assert_eq!(buffer.smallest_key(), Some(&key(15))); + let window = buffer.curr_window(); + assert_eq!(window.key, Some(&key(15))); + assert!(window.preceding_saturated); + assert!(!window.following_saturated); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["and", "world"] + ); + + let removed_keys = buffer.slide().map(|(k, _)| k).collect_vec(); + assert!(removed_keys.is_empty()); + assert_eq!(buffer.curr_key(), Some(&key(16))); + assert_eq!( + buffer.curr_window_values().cloned().collect_vec(), + vec!["and", "world"] + ); + + let removed_keys = buffer.slide().map(|(k, _)| k).collect_vec(); + assert_eq!(removed_keys, vec![key(15), key(16)]); + 
assert!(buffer.curr_key().is_none());
+        assert!(buffer
+            .curr_window_values()
+            .cloned()
+            .collect_vec()
+            .is_empty());
+    }
 }

From b2e1d15e2eb8bfe6e648f5d7ae5752260940ecb4 Mon Sep 17 00:00:00 2001
From: Bugen Zhao 
Date: Thu, 13 Jun 2024 17:43:00 +0800
Subject: [PATCH 14/19] refactor(streaming): refine sink await-tree (#17242)

Signed-off-by: Bugen Zhao 
---
 src/connector/src/lib.rs                      |  1 +
 .../src/sink/decouple_checkpoint_log_sink.rs  |  2 +-
 src/connector/src/sink/log_store.rs           | 31 +++++++++++--------
 src/connector/src/sink/mod.rs                 |  2 +-
 src/connector/src/sink/remote.rs              | 25 +++++----------
 src/connector/src/sink/trivial.rs             |  2 +-
 src/connector/src/sink/writer.rs              |  4 +--
 .../src/common/log_store_impl/in_mem.rs       |  5 +++
 src/stream/src/executor/sink.rs               | 10 +++---
 9 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/connector/src/lib.rs b/src/connector/src/lib.rs
index b701a0b8be431..8c0ade401cb47 100644
--- a/src/connector/src/lib.rs
+++ b/src/connector/src/lib.rs
@@ -34,6 +34,7 @@
 #![feature(negative_impls)]
 #![feature(register_tool)]
 #![feature(assert_matches)]
+#![feature(never_type)]
 #![register_tool(rw)]
 #![recursion_limit = "256"]
 
diff --git a/src/connector/src/sink/decouple_checkpoint_log_sink.rs b/src/connector/src/sink/decouple_checkpoint_log_sink.rs
index 9eaba2a10f121..26576cf3e3666 100644
--- a/src/connector/src/sink/decouple_checkpoint_log_sink.rs
+++ b/src/connector/src/sink/decouple_checkpoint_log_sink.rs
@@ -48,7 +48,7 @@ impl DecoupleCheckpointLogSinkerOf {
 
 #[async_trait]
 impl<W: SinkWriter<CommitMetadata = ()>> LogSinker for DecoupleCheckpointLogSinkerOf<W> {
-    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<()> {
+    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<!> {
         let mut sink_writer = self.writer;
         let sink_metrics = self.sink_metrics;
         #[derive(Debug)]
diff --git a/src/connector/src/sink/log_store.rs b/src/connector/src/sink/log_store.rs
index d696feb1d8617..d609aa7170d0d 100644
--- a/src/connector/src/sink/log_store.rs
+++ b/src/connector/src/sink/log_store.rs
@@ -20,6 +20,7 @@ use std::sync::Arc;
 use std::task::Poll;
 use std::time::Instant;
 
+use await_tree::InstrumentAwait;
 use futures::{TryFuture, TryFutureExt};
 use risingwave_common::array::StreamChunk;
 use risingwave_common::bail;
@@ -282,21 +283,25 @@ impl<R: LogReader> MonitoredLogReader<R> {
 
 impl<R: LogReader> LogReader for MonitoredLogReader<R> {
     async fn init(&mut self) -> LogStoreResult<()> {
-        self.inner.init().await
+        self.inner.init().instrument_await("log_reader_init").await
     }
 
     async fn next_item(&mut self) -> LogStoreResult<(u64, LogStoreReadItem)> {
-        self.inner.next_item().await.inspect(|(epoch, item)| {
-            if self.read_epoch != *epoch {
-                self.read_epoch = *epoch;
-                self.metrics.log_store_latest_read_epoch.set(*epoch as _);
-            }
-            if let LogStoreReadItem::StreamChunk { chunk, .. } = item {
-                self.metrics
-                    .log_store_read_rows
-                    .inc_by(chunk.cardinality() as _);
-            }
-        })
+        self.inner
+            .next_item()
+            .instrument_await("log_reader_next_item")
+            .await
+            .inspect(|(epoch, item)| {
+                if self.read_epoch != *epoch {
+                    self.read_epoch = *epoch;
+                    self.metrics.log_store_latest_read_epoch.set(*epoch as _);
+                }
+                if let LogStoreReadItem::StreamChunk { chunk, ..
} = item {
+                    self.metrics
+                        .log_store_read_rows
+                        .inc_by(chunk.cardinality() as _);
+                }
+            })
     }
 
     fn truncate(&mut self, offset: TruncateOffset) -> LogStoreResult<()> {
@@ -306,7 +311,7 @@ impl<R: LogReader> LogReader for MonitoredLogReader<R> {
     fn rewind(
         &mut self,
     ) -> impl Future<Output = LogStoreResult<(bool, Option<Bitmap>)>> + Send + '_ {
-        self.inner.rewind()
+        self.inner.rewind().instrument_await("log_reader_rewind")
     }
 }
 
diff --git a/src/connector/src/sink/mod.rs b/src/connector/src/sink/mod.rs
index e5a5f6143e419..bdd923f786ec4 100644
--- a/src/connector/src/sink/mod.rs
+++ b/src/connector/src/sink/mod.rs
@@ -386,7 +386,7 @@ impl<R: LogReader> SinkLogReader for R {
 
 #[async_trait]
 pub trait LogSinker: 'static {
-    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<()>;
+    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<!>;
 }
 
 #[async_trait]
diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs
index ea247991f9eb8..f8b84fc64eb86 100644
--- a/src/connector/src/sink/remote.rs
+++ b/src/connector/src/sink/remote.rs
@@ -301,7 +301,7 @@ impl RemoteLogSinker {
 
 #[async_trait]
 impl LogSinker for RemoteLogSinker {
-    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<()> {
+    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<!> {
         let mut request_tx = self.request_sender;
         let mut response_err_stream_rx = self.response_stream;
         let sink_metrics = self.sink_metrics;
@@ -313,7 +313,7 @@ impl LogSinker for RemoteLogSinker {
                 let result = response_err_stream_rx
                     .stream
                     .try_next()
-                    .instrument_await("Wait Response Stream")
+                    .instrument_await("log_sinker_wait_next_response")
                     .await;
                 match result {
                     Ok(Some(response)) => {
@@ -368,20 +368,12 @@ impl LogSinker for RemoteLogSinker {
         let mut sent_offset_queue: VecDeque<(TruncateOffset, Option<Instant>)> = VecDeque::new();
 
-        let mut curr_epoch = 0;
-
         loop {
             let either_result: futures::future::Either<
                 Option,
                 LogStoreResult<(u64, LogStoreReadItem)>,
             > = drop_either_future(
-                select(
-                    pin!(response_rx.recv()),
-                    pin!(log_reader
-                        .next_item()
-                        .instrument_await(format!("Wait Next Item: {}", curr_epoch))),
-                )
-                .await,
+                select(pin!(response_rx.recv()), pin!(log_reader.next_item())).await,
             );
             match either_result {
                 futures::future::Either::Left(opt) => {
@@ -435,7 +427,6 @@
                 }
                 futures::future::Either::Right(result) => {
                     let (epoch, item): (u64, LogStoreReadItem) = result?;
-                    curr_epoch = epoch;
                     match item {
                         LogStoreReadItem::StreamChunk { chunk, chunk_id } => {
@@ -456,8 +447,7 @@
                                     chunk,
                                 })
                                 .instrument_await(format!(
-                                    "Send Chunk Request: {} {}",
-                                    curr_epoch, chunk_id
+                                    "log_sinker_send_chunk (chunk {chunk_id})"
                                 ))
                                 .await?;
                             prev_offset = Some(offset);
@@ -473,15 +463,16 @@
                             let start_time = Instant::now();
                             request_tx
                                 .barrier(epoch, true)
-                                .instrument_await(format!("Commit: {}", curr_epoch))
+                                .instrument_await(format!(
+                                    "log_sinker_commit_checkpoint (epoch {epoch})"
+                                ))
                                 .await?;
                             Some(start_time)
                         } else {
                             request_tx
                                 .barrier(epoch, false)
                                 .instrument_await(format!(
-                                    "Send Barrier Request: {}",
-                                    curr_epoch
+                                    "log_sinker_send_barrier (epoch {epoch})"
                                 ))
                                 .await?;
                             None
diff --git a/src/connector/src/sink/trivial.rs b/src/connector/src/sink/trivial.rs
index 60e47949a8fdf..0cfa82c5c4d19 100644
--- a/src/connector/src/sink/trivial.rs
+++ b/src/connector/src/sink/trivial.rs
@@ -75,7 +75,7 @@ impl Sink for TrivialSink {
 
 #[async_trait]
 impl LogSinker for TrivialSink {
-    async fn
consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<()> {
+    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<!> {
         loop {
             let (epoch, item) = log_reader.next_item().await?;
             match item {
diff --git a/src/connector/src/sink/writer.rs b/src/connector/src/sink/writer.rs
index ba7df74495272..00917f26c4163 100644
--- a/src/connector/src/sink/writer.rs
+++ b/src/connector/src/sink/writer.rs
@@ -126,7 +126,7 @@ impl<W: SinkWriter<CommitMetadata = ()>> LogSinkerOf<W> {
 
 #[async_trait]
 impl<W: SinkWriter<CommitMetadata = ()>> LogSinker for LogSinkerOf<W> {
-    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<()> {
+    async fn consume_log_and_sink(self, log_reader: &mut impl SinkLogReader) -> Result<!> {
         let mut sink_writer = self.writer;
         let sink_metrics = self.sink_metrics;
         #[derive(Debug)]
@@ -242,7 +242,7 @@ impl<W: AsyncTruncateSinkWriter> AsyncTruncateLogSinkerOf<W> {
 
 #[async_trait]
 impl<W: AsyncTruncateSinkWriter> LogSinker for AsyncTruncateLogSinkerOf<W> {
-    async fn consume_log_and_sink(mut self, log_reader: &mut impl SinkLogReader) -> Result<()> {
+    async fn consume_log_and_sink(mut self, log_reader: &mut impl SinkLogReader) -> Result<!> {
         loop {
             let select_result = drop_either_future(
                 select(
diff --git a/src/stream/src/common/log_store_impl/in_mem.rs b/src/stream/src/common/log_store_impl/in_mem.rs
index 3aafb865f09ac..d1ce2e6e5aa35 100644
--- a/src/stream/src/common/log_store_impl/in_mem.rs
+++ b/src/stream/src/common/log_store_impl/in_mem.rs
@@ -15,6 +15,7 @@
 use std::sync::Arc;
 
 use anyhow::{anyhow, Context};
+use await_tree::InstrumentAwait;
 use risingwave_common::array::StreamChunk;
 use risingwave_common::buffer::Bitmap;
 use risingwave_common::util::epoch::{EpochExt, EpochPair, INVALID_EPOCH};
@@ -260,6 +261,7 @@ impl LogWriter for BoundedInMemLogStoreWriter {
     async fn write_chunk(&mut self, chunk: StreamChunk) -> LogStoreResult<()> {
         self.item_tx
             .send(InMemLogStoreItem::StreamChunk(chunk))
+            .instrument_await("in_mem_send_item_chunk")
             .await
             .map_err(|_| anyhow!("unable to send stream chunk"))?;
         Ok(())
@@ -275,6 +277,7 @@
                 next_epoch,
                 is_checkpoint,
             })
+            .instrument_await("in_mem_send_item_barrier")
            .await
            .map_err(|_| anyhow!("unable to send barrier"))?;
 
@@ -287,6 +290,7 @@
             let truncated_epoch = self
                 .truncated_epoch_rx
                 .recv()
+                .instrument_await("in_mem_recv_truncated_epoch")
                 .await
                 .ok_or_else(|| anyhow!("cannot get truncated epoch"))?;
             assert_eq!(truncated_epoch, prev_epoch);
@@ -298,6 +302,7 @@
     async fn update_vnode_bitmap(&mut self, new_vnodes: Arc<Bitmap>) -> LogStoreResult<()> {
         self.item_tx
             .send(InMemLogStoreItem::UpdateVnodeBitmap(new_vnodes))
+            .instrument_await("in_mem_send_item_vnode_bitmap")
             .await
             .map_err(|_| anyhow!("unable to send vnode bitmap"))
     }
diff --git a/src/stream/src/executor/sink.rs b/src/stream/src/executor/sink.rs
index 52377c49c322a..e8dfc09e9130c 100644
--- a/src/stream/src/executor/sink.rs
+++ b/src/stream/src/executor/sink.rs
@@ -125,7 +125,6 @@ impl<F: LogStoreFactory> SinkExecutor<F> {
         let sink_id = self.sink_param.sink_id;
         let actor_id = self.actor_context.id;
         let fragment_id = self.actor_context.fragment_id;
-        let executor_id = self.sink_writer_param.executor_id;
 
         let stream_key = self.info.pk_indices.clone();
         let metrics = self.actor_context.streaming_metrics.new_sink_exec_metrics(
@@ -217,10 +216,9 @@ impl<F: LogStoreFactory> SinkExecutor<F> {
                 self.sink_writer_param,
                 self.actor_context,
             )
-            .instrument_await(format!(
-                "Consume Log: sink_id: {} actor_id: {}, executor_id: {}",
-                sink_id, actor_id, executor_id,
-            ));
+
.instrument_await(format!("consume_log (sink_id {sink_id})"))
+            .map_ok(|never| match never {}); // unify return type to `Message`
 
+            // TODO: may try to remove the boxed
             select(consume_log_stream.into_stream(), write_log_stream).boxed()
         })
@@ -403,7 +401,7 @@ impl<F: LogStoreFactory> SinkExecutor<F> {
         sink_param: SinkParam,
         mut sink_writer_param: SinkWriterParam,
         actor_context: ActorContextRef,
-    ) -> StreamExecutorResult<Message> {
+    ) -> StreamExecutorResult<!> {
         let metrics = sink_writer_param.sink_metrics.clone();
 
         let visible_columns = columns

From cccf946239ec8d2ea656a83ba85d3210b675b6b1 Mon Sep 17 00:00:00 2001
From: William Wen <44139337+wenym1@users.noreply.github.com>
Date: Thu, 13 Jun 2024 17:44:07 +0800
Subject: [PATCH 15/19] feat(storage): store compaction group id in per table
 info (#17125)

---
 proto/hummock.proto                           |  17 +-
 src/meta/src/hummock/compaction/mod.rs        |   7 +-
 .../picker/base_level_compaction_picker.rs    |   6 -
 .../picker/intra_compaction_picker.rs         |   8 -
 .../picker/manual_compaction_picker.rs        |   6 +-
 .../picker/space_reclaim_compaction_picker.rs |  30 +++-
 .../picker/tier_compaction_picker.rs          |   1 -
 .../tombstone_reclaim_compaction_picker.rs    |   1 -
 .../picker/ttl_reclaim_compaction_picker.rs   |   8 +
 .../compaction/selector/emergency_selector.rs |   1 +
 .../compaction/selector/level_selector.rs     |   8 +-
 .../compaction/selector/manual_selector.rs    |   1 +
 .../src/hummock/compaction/selector/mod.rs    |   5 +-
 .../selector/space_reclaim_selector.rs        |  10 +-
 .../selector/tombstone_compaction_selector.rs |   1 +
 .../compaction/selector/ttl_selector.rs       |   1 +
 src/meta/src/hummock/manager/commit_epoch.rs  | 132 +++++++--------
 src/meta/src/hummock/manager/compaction.rs    |  15 +-
 .../manager/compaction_group_manager.rs       | 150 ++++++++++--------
 src/meta/src/hummock/manager/tests.rs         |  79 ++++++---
 src/meta/src/hummock/manager/versioning.rs    |   7 +-
 src/meta/src/hummock/metrics_utils.rs         |   9 +-
 .../src/hummock/mock_hummock_meta_client.rs   |   7 +-
 src/storage/benches/bench_table_watermarks.rs |   2 +
 .../compaction_group/hummock_version_ext.rs   | 117 ++++++--------
 src/storage/hummock_sdk/src/lib.rs            |   1 +
 src/storage/hummock_sdk/src/table_stats.rs    |  14 +-
 src/storage/hummock_sdk/src/version.rs        | 107 ++++++++++++-
 .../hummock/local_version/pinned_version.rs   |  11 +-
 29 files changed, 469 insertions(+), 293 deletions(-)

diff --git a/proto/hummock.proto b/proto/hummock.proto
index 89a0438fc43a7..149944831a4f9 100644
--- a/proto/hummock.proto
+++ b/proto/hummock.proto
@@ -69,25 +69,28 @@ message IntraLevelDelta {
 enum CompatibilityVersion {
   VERSION_UNSPECIFIED = 0;
   NO_TRIVIAL_SPLIT = 1;
+  NO_MEMBER_TABLE_IDS = 2;
 }
 
 message GroupConstruct {
   CompactionConfig group_config = 1;
   // If parent_group_id is not 0, it means parent_group_id splits into parent_group_id and this group, so this group is not empty initially.
uint64 parent_group_id = 2; - repeated uint32 table_ids = 3; + repeated uint32 table_ids = 3 [deprecated = true]; uint64 group_id = 4; uint64 new_sst_start_id = 5; CompatibilityVersion version = 6; } message GroupMetaChange { - repeated uint32 table_ids_add = 1; - repeated uint32 table_ids_remove = 2; + option deprecated = true; + repeated uint32 table_ids_add = 1 [deprecated = true]; + repeated uint32 table_ids_remove = 2 [deprecated = true]; } message GroupTableChange { - repeated uint32 table_ids = 1; + option deprecated = true; + repeated uint32 table_ids = 1 [deprecated = true]; uint64 target_group_id = 2; uint64 origin_group_id = 3; uint64 new_sst_start_id = 4; @@ -102,7 +105,7 @@ message GroupDelta { GroupConstruct group_construct = 2; GroupDestroy group_destroy = 3; GroupMetaChange group_meta_change = 4; - GroupTableChange group_table_change = 5; + GroupTableChange group_table_change = 5 [deprecated = true]; } } @@ -152,11 +155,13 @@ message TableChangeLog { message StateTableInfo { uint64 committed_epoch = 1; uint64 safe_epoch = 2; + uint64 compaction_group_id = 3; } message StateTableInfoDelta { uint64 committed_epoch = 1; uint64 safe_epoch = 2; + uint64 compaction_group_id = 3; } message HummockVersion { @@ -165,7 +170,7 @@ message HummockVersion { OverlappingLevel l0 = 2; uint64 group_id = 3; uint64 parent_group_id = 4; - repeated uint32 member_table_ids = 5; + repeated uint32 member_table_ids = 5 [deprecated = true]; } uint64 id = 1; // Levels of each compaction group diff --git a/src/meta/src/hummock/compaction/mod.rs b/src/meta/src/hummock/compaction/mod.rs index 1d1c2fe4b99fb..bf4b608fe59ad 100644 --- a/src/meta/src/hummock/compaction/mod.rs +++ b/src/meta/src/hummock/compaction/mod.rs @@ -16,12 +16,12 @@ pub mod compaction_config; mod overlap_strategy; -use risingwave_common::catalog::TableOption; +use risingwave_common::catalog::{TableId, TableOption}; use risingwave_pb::hummock::compact_task::{self, TaskType}; mod picker; pub mod selector; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::sync::Arc; @@ -92,6 +92,7 @@ impl CompactStatus { pub fn get_compact_task( &mut self, levels: &Levels, + member_table_ids: &BTreeSet, task_id: HummockCompactionTaskId, group: &CompactionGroup, stats: &mut LocalSelectorStatistic, @@ -106,6 +107,7 @@ impl CompactStatus { task_id, group, levels, + member_table_ids, &mut self.level_handlers, stats, table_id_to_options.clone(), @@ -121,6 +123,7 @@ impl CompactStatus { task_id, group, levels, + member_table_ids, &mut self.level_handlers, stats, table_id_to_options, diff --git a/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs index a2c3a7d52802e..bf05afc6c6e88 100644 --- a/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs @@ -326,7 +326,6 @@ pub mod tests { generate_table(1, 1, 201, 210, 1), ], )], - member_table_ids: vec![1], ..Default::default() }; let mut local_stats = LocalPickerStatistic::default(); @@ -419,7 +418,6 @@ pub mod tests { total_file_size: 0, uncompressed_file_size: 0, }), - member_table_ids: vec![1], ..Default::default() }; push_tables_level0_nonoverlapping(&mut levels, vec![generate_table(1, 1, 50, 140, 2)]); @@ -482,7 +480,6 @@ pub mod tests { total_file_size: 0, uncompressed_file_size: 0, }), - member_table_ids: vec![1], ..Default::default() }; 
push_tables_level0_nonoverlapping( @@ -586,7 +583,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(3, 1, 0, 100000, 1)])], - member_table_ids: vec![1], ..Default::default() }; let levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; @@ -664,7 +660,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(3, 1, 0, 100000, 1)])], - member_table_ids: vec![1], ..Default::default() }; let mut levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; @@ -731,7 +726,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(3, 1, 1, 100, 1)])], - member_table_ids: vec![1], ..Default::default() }; let mut levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; diff --git a/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs index 6b5dcae7d0c31..5cc65bd38a1c8 100644 --- a/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs @@ -428,7 +428,6 @@ pub mod tests { total_file_size: 0, uncompressed_file_size: 0, }), - member_table_ids: vec![1], ..Default::default() }; push_tables_level0_nonoverlapping( @@ -473,7 +472,6 @@ pub mod tests { generate_table(1, 1, 100, 210, 2), generate_table(2, 1, 200, 250, 2), ])), - member_table_ids: vec![1], ..Default::default() }; let mut levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; @@ -511,7 +509,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(100, 1, 0, 1000, 1)])], - member_table_ids: vec![1], ..Default::default() }; let mut levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; @@ -560,7 +557,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(100, 1, 0, 1000, 1)])], - member_table_ids: vec![1], ..Default::default() }; let mut levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; @@ -632,7 +628,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(100, 1, 0, 1000, 1)])], - member_table_ids: vec![1], ..Default::default() }; let mut levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; @@ -708,7 +703,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(100, 1, 0, 1000, 1)])], - member_table_ids: vec![1], ..Default::default() }; levels_handler[1].add_pending_task(100, 1, levels.levels[0].get_table_infos()); @@ -729,7 +723,6 @@ pub mod tests { let mut levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(100, 1, 0, 1000, 1)])], - member_table_ids: vec![1], ..Default::default() }; assert!(picker @@ -827,7 +820,6 @@ pub mod tests { let levels = Levels { l0: Some(l0), levels: vec![generate_level(1, vec![generate_table(100, 1, 0, 1000, 1)])], - member_table_ids: vec![1], ..Default::default() }; diff --git a/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs index 82e58b87c5517..2d1b2c95b20aa 100644 --- a/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs @@ -326,7 +326,7 @@ impl CompactionPicker for ManualCompactionPicker { #[cfg(test)] pub mod 
tests { - use std::collections::HashMap; + use std::collections::{BTreeSet, HashMap}; use risingwave_pb::hummock::compact_task; pub use risingwave_pb::hummock::KeyRange; @@ -1198,6 +1198,7 @@ pub mod tests { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, HashMap::default(), @@ -1235,6 +1236,7 @@ pub mod tests { 2, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, HashMap::default(), @@ -1308,6 +1310,7 @@ pub mod tests { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, HashMap::default(), @@ -1347,6 +1350,7 @@ pub mod tests { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, HashMap::default(), diff --git a/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs index ab9f74c063a0b..b94f7587dd04e 100644 --- a/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs @@ -168,10 +168,11 @@ impl SpaceReclaimCompactionPicker { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; use itertools::Itertools; + use risingwave_common::catalog::TableId; use risingwave_pb::hummock::compact_task; pub use risingwave_pb::hummock::{Level, LevelType}; @@ -234,11 +235,12 @@ mod test { } assert_eq!(levels.len(), 4); - let mut levels = Levels { + let levels = Levels { levels, l0: Some(l0), ..Default::default() }; + let mut member_table_ids = BTreeSet::new(); let mut levels_handler = (0..5).map(LevelHandler::new).collect_vec(); let mut local_stats = LocalSelectorStatistic::default(); @@ -252,6 +254,7 @@ mod test { 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -268,6 +271,7 @@ mod test { 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -308,6 +312,7 @@ mod test { 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -334,6 +339,7 @@ mod test { 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -351,12 +357,17 @@ mod test { } } - levels.member_table_ids = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + member_table_ids = BTreeSet::from_iter( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + .into_iter() + .map(TableId::new), + ); // pick space reclaim let task = selector.pick_compaction( 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -372,13 +383,15 @@ mod test { } } - levels.member_table_ids = vec![2, 3, 4, 5, 6, 7, 8, 9]; + member_table_ids = + BTreeSet::from_iter([2, 3, 4, 5, 6, 7, 8, 9].into_iter().map(TableId::new)); // pick space reclaim let task = selector .pick_compaction( 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -407,7 +420,7 @@ mod test { // rebuild selector selector = SpaceReclaimCompactionSelector::default(); // cut range [3,4] [6] [8,9,10] - levels.member_table_ids = vec![0, 1, 2, 5, 7]; + member_table_ids = BTreeSet::from_iter([0, 1, 2, 5, 7].into_iter().map(TableId::new)); let expect_task_file_count = [2, 1, 4]; let expect_task_sst_id_range = [vec![3, 4], vec![6], vec![8, 9, 10, 11]]; for (index, x) in expect_task_file_count.iter().enumerate() { @@ 
-417,6 +430,7 @@ mod test { 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), @@ -458,12 +472,13 @@ mod test { // rebuild selector selector = SpaceReclaimCompactionSelector::default(); // cut range [3,4] [6] [8,9,10] - levels.member_table_ids = vec![0, 1, 2, 5, 7]; + + member_table_ids = BTreeSet::from_iter([0, 1, 2, 5, 7].into_iter().map(TableId::new)); let expect_task_file_count = [2, 1, 5]; let expect_task_sst_id_range = [vec![3, 4], vec![6], vec![7, 8, 9, 10, 11]]; for (index, x) in expect_task_file_count.iter().enumerate() { if index == expect_task_file_count.len() - 1 { - levels.member_table_ids = vec![2, 5]; + member_table_ids = BTreeSet::from_iter([2, 5].into_iter().map(TableId::new)); } // // pick space reclaim @@ -472,6 +487,7 @@ mod test { 1, &group_config, &levels, + &member_table_ids, &mut levels_handler, &mut local_stats, HashMap::default(), diff --git a/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs index ce86b523f6e86..ea25211b44749 100644 --- a/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs @@ -291,7 +291,6 @@ pub mod tests { sub_levels: vec![l1, l2], }), levels: vec![], - member_table_ids: vec![1], ..Default::default() }; let config = Arc::new( diff --git a/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs index f07a8872c9b03..c7be0347fce61 100644 --- a/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs @@ -154,7 +154,6 @@ pub mod tests { ], ), ], - member_table_ids: vec![1], ..Default::default() }; let levels_handler = vec![ diff --git a/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs index f690b0f80112c..bc1fc2ce304be 100644 --- a/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs @@ -201,6 +201,7 @@ impl TtlReclaimCompactionPicker { #[cfg(test)] mod test { + use std::collections::BTreeSet; use std::sync::Arc; use itertools::Itertools; @@ -377,6 +378,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, table_id_to_options, @@ -427,6 +429,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, table_id_to_options.clone(), @@ -460,6 +463,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, table_id_to_options.clone(), @@ -516,6 +520,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, table_id_to_options, @@ -557,6 +562,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, HashMap::default(), @@ -619,6 +625,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, table_id_to_options.clone(), @@ -711,6 +718,7 @@ mod test { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handler, &mut local_stats, table_id_to_options.clone(), diff --git a/src/meta/src/hummock/compaction/selector/emergency_selector.rs 
b/src/meta/src/hummock/compaction/selector/emergency_selector.rs index c386aee5c8644..685ab1487d51c 100644 --- a/src/meta/src/hummock/compaction/selector/emergency_selector.rs +++ b/src/meta/src/hummock/compaction/selector/emergency_selector.rs @@ -37,6 +37,7 @@ impl CompactionSelector for EmergencySelector { task_id: HummockCompactionTaskId, group: &CompactionGroup, levels: &Levels, + _member_table_ids: &std::collections::BTreeSet<TableId>, level_handlers: &mut [LevelHandler], selector_stats: &mut LocalSelectorStatistic, _table_id_to_options: HashMap<u32, TableOption>, diff --git a/src/meta/src/hummock/compaction/selector/level_selector.rs b/src/meta/src/hummock/compaction/selector/level_selector.rs index a49f71877c6a3..38d1e35e22502 100644 --- a/src/meta/src/hummock/compaction/selector/level_selector.rs +++ b/src/meta/src/hummock/compaction/selector/level_selector.rs @@ -422,6 +422,7 @@ impl CompactionSelector for DynamicLevelSelector { task_id: HummockCompactionTaskId, compaction_group: &CompactionGroup, levels: &Levels, + _member_table_ids: &std::collections::BTreeSet<TableId>, level_handlers: &mut [LevelHandler], selector_stats: &mut LocalSelectorStatistic, _table_id_to_options: HashMap<u32, TableOption>, @@ -478,7 +479,7 @@ impl CompactionSelector for DynamicLevelSelector { #[cfg(test)] pub mod tests { - use std::collections::HashMap; + use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; use itertools::Itertools; @@ -600,7 +601,6 @@ pub mod tests { 3, 10, ))), - member_table_ids: vec![1], ..Default::default() }; @@ -612,6 +612,7 @@ pub mod tests { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handlers, &mut local_stats, HashMap::default(), @@ -639,6 +640,7 @@ pub mod tests { 1, &group_config, &levels, + &BTreeSet::new(), &mut levels_handlers, &mut local_stats, HashMap::default(), @@ -658,6 +660,7 @@ pub mod tests { 2, &group_config, &levels, + &BTreeSet::new(), &mut levels_handlers, &mut local_stats, HashMap::default(), @@ -694,6 +697,7 @@ pub mod tests { 2, &group_config, &levels, + &BTreeSet::new(), &mut levels_handlers, &mut local_stats, HashMap::default(), diff --git a/src/meta/src/hummock/compaction/selector/manual_selector.rs b/src/meta/src/hummock/compaction/selector/manual_selector.rs index 427efadf3914d..62c94c8f888df 100644 --- a/src/meta/src/hummock/compaction/selector/manual_selector.rs +++ b/src/meta/src/hummock/compaction/selector/manual_selector.rs @@ -79,6 +79,7 @@ impl CompactionSelector for ManualCompactionSelector { task_id: HummockCompactionTaskId, group: &CompactionGroup, levels: &Levels, + _member_table_ids: &std::collections::BTreeSet<TableId>, level_handlers: &mut [LevelHandler], _selector_stats: &mut LocalSelectorStatistic, _table_id_to_options: HashMap<u32, TableOption>, diff --git a/src/meta/src/hummock/compaction/selector/mod.rs b/src/meta/src/hummock/compaction/selector/mod.rs index a342a661ecb7b..aca2457da62dc 100644 --- a/src/meta/src/hummock/compaction/selector/mod.rs +++ b/src/meta/src/hummock/compaction/selector/mod.rs @@ -24,13 +24,13 @@ mod space_reclaim_selector; mod tombstone_compaction_selector; mod ttl_selector; -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; pub use emergency_selector::EmergencySelector; pub use level_selector::{DynamicLevelSelector, DynamicLevelSelectorCore}; pub use manual_selector::{ManualCompactionOption, ManualCompactionSelector}; -use risingwave_common::catalog::TableOption; +use risingwave_common::catalog::{TableId, TableOption}; use risingwave_hummock_sdk::HummockCompactionTaskId; use risingwave_pb::hummock::compact_task; use
risingwave_pb::hummock::hummock_version::Levels; @@ -53,6 +53,7 @@ pub trait CompactionSelector: Sync + Send { task_id: HummockCompactionTaskId, group: &CompactionGroup, levels: &Levels, + member_table_ids: &BTreeSet<TableId>, level_handlers: &mut [LevelHandler], selector_stats: &mut LocalSelectorStatistic, table_id_to_options: HashMap<u32, TableOption>, diff --git a/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs b/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs index c48cb0fe605c5..b284a6a538b3e 100644 --- a/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs +++ b/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs @@ -17,10 +17,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; -use risingwave_common::catalog::TableOption; +use risingwave_common::catalog::{TableId, TableOption}; use risingwave_hummock_sdk::HummockCompactionTaskId; use risingwave_pb::hummock::compact_task; use risingwave_pb::hummock::hummock_version::Levels; @@ -44,6 +44,7 @@ impl CompactionSelector for SpaceReclaimCompactionSelector { task_id: HummockCompactionTaskId, group: &CompactionGroup, levels: &Levels, + member_table_ids: &BTreeSet<TableId>, level_handlers: &mut [LevelHandler], _selector_stats: &mut LocalSelectorStatistic, _table_id_to_options: HashMap<u32, TableOption>, @@ -53,7 +54,10 @@ impl CompactionSelector for SpaceReclaimCompactionSelector { DynamicLevelSelectorCore::new(group.compaction_config.clone(), developer_config); let mut picker = SpaceReclaimCompactionPicker::new( group.compaction_config.max_space_reclaim_bytes, - levels.member_table_ids.iter().cloned().collect(), + member_table_ids + .iter() + .map(|table_id| table_id.table_id) + .collect(), ); let ctx = dynamic_level_core.calculate_level_base_size(levels); let state = self.state.entry(group.group_id).or_default(); diff --git a/src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs b/src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs index 5cbae609caa86..b05802513733d 100644 --- a/src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs +++ b/src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs @@ -42,6 +42,7 @@ impl CompactionSelector for TombstoneCompactionSelector { task_id: HummockCompactionTaskId, group: &CompactionGroup, levels: &Levels, + _member_table_ids: &std::collections::BTreeSet<TableId>, level_handlers: &mut [LevelHandler], _selector_stats: &mut LocalSelectorStatistic, _table_id_to_options: HashMap<u32, TableOption>, diff --git a/src/meta/src/hummock/compaction/selector/ttl_selector.rs b/src/meta/src/hummock/compaction/selector/ttl_selector.rs index ed099ede4158f..0e9497b06b17d 100644 --- a/src/meta/src/hummock/compaction/selector/ttl_selector.rs +++ b/src/meta/src/hummock/compaction/selector/ttl_selector.rs @@ -44,6 +44,7 @@ impl CompactionSelector for TtlCompactionSelector { task_id: HummockCompactionTaskId, group: &CompactionGroup, levels: &Levels, + _member_table_ids: &std::collections::BTreeSet<TableId>, level_handlers: &mut [LevelHandler], _selector_stats: &mut LocalSelectorStatistic, table_id_to_options: HashMap<u32, TableOption>, diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs index d42c032d5a054..44117139dc192 100644 --- a/src/meta/src/hummock/manager/commit_epoch.rs +++ b/src/meta/src/hummock/manager/commit_epoch.rs @@ -12,7 +12,7 @@ // See the License
for the specific language governing permissions and // limitations under the License. -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use itertools::Itertools; use risingwave_common::catalog::TableId; @@ -27,9 +27,7 @@ use risingwave_hummock_sdk::{ use risingwave_pb::hummock::compact_task::{self}; use risingwave_pb::hummock::group_delta::DeltaType; use risingwave_pb::hummock::hummock_version_delta::ChangeLogDelta; -use risingwave_pb::hummock::{ - GroupDelta, GroupMetaChange, HummockSnapshot, IntraLevelDelta, StateTableInfoDelta, -}; +use risingwave_pb::hummock::{GroupDelta, HummockSnapshot, IntraLevelDelta, StateTableInfoDelta}; use crate::hummock::error::{Error, Result}; use crate::hummock::manager::transaction::{ @@ -140,81 +138,51 @@ impl HummockManager { let mut table_compaction_group_mapping = new_version_delta .latest_version() - .build_compaction_group_info(); + .state_table_info + .build_table_compaction_group_id(); let mut new_table_ids = None; // Add new table if let Some(new_fragment_table_info) = new_table_fragment_info { - let new_table_ids = new_table_ids.insert(HashSet::new()); + let new_table_ids = new_table_ids.insert(HashMap::new()); if !new_fragment_table_info.internal_table_ids.is_empty() { - if let Some(levels) = new_version_delta - .latest_version() - .levels - .get(&(StaticCompactionGroupId::StateDefault as u64)) - { - for table_id in &new_fragment_table_info.internal_table_ids { - if levels.member_table_ids.contains(&table_id.table_id) { - return Err(Error::CompactionGroup(format!( - "table {} already in group {}", - table_id, - StaticCompactionGroupId::StateDefault as u64 - ))); - } + for table_id in &new_fragment_table_info.internal_table_ids { + if let Some(info) = new_version_delta + .latest_version() + .state_table_info + .info() + .get(table_id) + { + return Err(Error::CompactionGroup(format!( + "table {} already exist {:?}", + table_id.table_id, info, + ))); } } - let group_deltas = &mut new_version_delta - .group_deltas - .entry(StaticCompactionGroupId::StateDefault as u64) - .or_default() - .group_deltas; - group_deltas.push(GroupDelta { - delta_type: Some(DeltaType::GroupMetaChange(GroupMetaChange { - table_ids_add: new_fragment_table_info - .internal_table_ids - .iter() - .map(|table_id| table_id.table_id) - .collect(), - ..Default::default() - })), - }); - for table_id in &new_fragment_table_info.internal_table_ids { table_compaction_group_mapping .insert(*table_id, StaticCompactionGroupId::StateDefault as u64); - new_table_ids.insert(*table_id); + new_table_ids.insert(*table_id, StaticCompactionGroupId::StateDefault as u64); } } if let Some(table_id) = new_fragment_table_info.mv_table_id { - if let Some(levels) = new_version_delta + if let Some(info) = new_version_delta .latest_version() - .levels - .get(&(StaticCompactionGroupId::MaterializedView as u64)) + .state_table_info + .info() + .get(&table_id) { - if levels.member_table_ids.contains(&table_id.table_id) { - return Err(Error::CompactionGroup(format!( - "table {} already in group {}", - table_id, - StaticCompactionGroupId::MaterializedView as u64 - ))); - } + return Err(Error::CompactionGroup(format!( + "table {} already exist {:?}", + table_id.table_id, info, + ))); } - let group_deltas = &mut new_version_delta - .group_deltas - .entry(StaticCompactionGroupId::MaterializedView as u64) - .or_default() - .group_deltas; - group_deltas.push(GroupDelta { - delta_type: Some(DeltaType::GroupMetaChange(GroupMetaChange { - table_ids_add: 
vec![table_id.table_id], - ..Default::default() - })), - }); let _ = table_compaction_group_mapping .insert(table_id, StaticCompactionGroupId::MaterializedView as u64); - new_table_ids.insert(table_id); + new_table_ids.insert(table_id, StaticCompactionGroupId::MaterializedView as u64); } } @@ -231,10 +199,12 @@ impl HummockManager { .levels .get(compaction_group_id) { - Some(compaction_group) => sst - .table_ids - .iter() - .all(|t| compaction_group.member_table_ids.contains(t)), + Some(_compaction_group) => sst.table_ids.iter().all(|t| { + table_compaction_group_mapping + .get(&TableId::new(*t)) + .map(|table_cg_id| table_cg_id == compaction_group_id) + .unwrap_or(false) + }), None => false, }; if !is_sst_belong_to_group_declared { @@ -339,23 +309,32 @@ impl HummockManager { // update state table info new_version_delta.with_latest_version(|version, delta| { if let Some(new_table_ids) = new_table_ids { - for table_id in new_table_ids { + for (table_id, cg_id) in new_table_ids { delta.state_table_info_delta.insert( table_id, StateTableInfoDelta { committed_epoch: epoch, safe_epoch: epoch, + compaction_group_id: cg_id, }, ); } } for (table_id, info) in version.state_table_info.info() { - delta.state_table_info_delta.insert( - *table_id, - StateTableInfoDelta { - committed_epoch: epoch, - safe_epoch: info.safe_epoch, - }, + assert!( + delta + .state_table_info_delta + .insert( + *table_id, + StateTableInfoDelta { + committed_epoch: epoch, + safe_epoch: info.safe_epoch, + compaction_group_id: info.compaction_group_id, + } + ) + .is_none(), + "newly added table exists previously: {:?}", + table_id ); } }); @@ -416,10 +395,17 @@ impl HummockManager { tracing::trace!("new committed epoch {}", epoch); let mut table_groups = HashMap::<u32, usize>::default(); - for group in versioning.current_version.levels.values() { - for table_id in &group.member_table_ids { - table_groups.insert(*table_id, group.member_table_ids.len()); - } + for (table_id, info) in versioning.current_version.state_table_info.info() { + table_groups.insert( + table_id.table_id, + versioning + .current_version + .state_table_info + .compaction_group_member_tables() + .get(&info.compaction_group_id) + .expect("should exist") + .len(), + ); } drop(versioning_guard); // Don't trigger compactions if we enable deterministic compaction diff --git a/src/meta/src/hummock/manager/compaction.rs b/src/meta/src/hummock/manager/compaction.rs index 1969fc8ffc348..7138586f3ed20 100644 --- a/src/meta/src/hummock/manager/compaction.rs +++ b/src/meta/src/hummock/manager/compaction.rs @@ -206,6 +206,7 @@ impl<'a> HummockVersionTransaction<'a> { StateTableInfoDelta { committed_epoch: info.committed_epoch, safe_epoch: new_safe_epoch, + compaction_group_id: info.compaction_group_id, }, ); } @@ -732,11 +733,13 @@ impl HummockManager { || matches!(selector.task_type(), TaskType::Emergency); let mut stats = LocalSelectorStatistic::default(); - let member_table_ids = version + let member_table_ids: Vec<_> = version .latest_version() - .get_compaction_group_levels(compaction_group_id) - .member_table_ids - .clone(); + .state_table_info + .compaction_group_member_table_ids(compaction_group_id) + .iter() + .map(|table_id| table_id.table_id) + .collect(); let mut table_id_to_option: HashMap<u32, TableOption> = HashMap::default(); @@ -750,6 +753,10 @@ impl HummockManager { version .latest_version() .get_compaction_group_levels(compaction_group_id), + version + .latest_version() + .state_table_info + .compaction_group_member_table_ids(compaction_group_id), task_id as
HummockCompactionTaskId, &group_config, &mut stats, diff --git a/src/meta/src/hummock/manager/compaction_group_manager.rs b/src/meta/src/hummock/manager/compaction_group_manager.rs index 3a3d596844e95..79596c8a3e774 100644 --- a/src/meta/src/hummock/manager/compaction_group_manager.rs +++ b/src/meta/src/hummock/manager/compaction_group_manager.rs @@ -19,8 +19,7 @@ use std::sync::Arc; use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_hummock_sdk::compaction_group::hummock_version_ext::{ - get_compaction_group_ids, get_member_table_ids, try_get_compaction_group_id_by_table_id, - TableGroupInfo, + get_compaction_group_ids, get_member_table_ids, TableGroupInfo, }; use risingwave_hummock_sdk::compaction_group::{StateTableId, StaticCompactionGroupId}; use risingwave_hummock_sdk::CompactionGroupId; @@ -33,7 +32,7 @@ use risingwave_pb::hummock::subscribe_compaction_event_request::ReportTask; use risingwave_pb::hummock::write_limits::WriteLimit; use risingwave_pb::hummock::{ compact_task, CompactionConfig, CompactionGroupInfo, CompatibilityVersion, GroupConstruct, - GroupDelta, GroupDestroy, GroupMetaChange, StateTableInfoDelta, + GroupDelta, GroupDestroy, StateTableInfoDelta, }; use tokio::sync::OnceCell; @@ -196,12 +195,14 @@ impl HummockManager { let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); for (table_id, _) in pairs { - if let Some(old_group) = - try_get_compaction_group_id_by_table_id(current_version, *table_id) + if let Some(info) = current_version + .state_table_info + .info() + .get(&TableId::new(*table_id)) { return Err(Error::CompactionGroup(format!( - "table {} already in group {}", - *table_id, old_group + "table {} already {:?}", + *table_id, info ))); } } @@ -256,18 +257,6 @@ impl HummockManager { }); } } - let group_deltas = &mut new_version_delta - .group_deltas - .entry(group_id) - .or_default() - .group_deltas; - - group_deltas.push(GroupDelta { - delta_type: Some(DeltaType::GroupMetaChange(GroupMetaChange { - table_ids_add: vec![*table_id], - ..Default::default() - })), - }); assert!(new_version_delta .state_table_info_delta .insert( @@ -275,6 +264,7 @@ impl HummockManager { StateTableInfoDelta { committed_epoch: epoch, safe_epoch: epoch, + compaction_group_id: *raw_group_id, } ) .is_none()); @@ -302,32 +292,24 @@ impl HummockManager { HashMap::new(); // Remove member tables for table_id in table_ids.iter().unique() { - let group_id = match try_get_compaction_group_id_by_table_id( - new_version_delta.latest_version(), - *table_id, - ) { - Some(group_id) => group_id, - None => continue, + let version = new_version_delta.latest_version(); + let Some(info) = version + .state_table_info + .info() + .get(&TableId::new(*table_id)) + else { + continue; }; - let group_deltas = &mut new_version_delta - .group_deltas - .entry(group_id) - .or_default() - .group_deltas; - group_deltas.push(GroupDelta { - delta_type: Some(DeltaType::GroupMetaChange(GroupMetaChange { - table_ids_remove: vec![*table_id], - ..Default::default() - })), - }); + modified_groups - .entry(group_id) + .entry(info.compaction_group_id) .and_modify(|count| *count -= 1) .or_insert( - new_version_delta - .latest_version() - .get_compaction_group_levels(group_id) - .member_table_ids + version + .state_table_info + .compaction_group_member_tables() + .get(&info.compaction_group_id) + .expect("should exist") .len() as u64 - 1, ); @@ -427,7 +409,12 @@ impl HummockManager { let group = CompactionGroupInfo { id: levels.group_id, parent_id: 
levels.parent_group_id, - member_table_ids: levels.member_table_ids.clone(), + member_table_ids: current_version + .state_table_info + .compaction_group_member_table_ids(levels.group_id) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), compaction_config: Some(compaction_config), }; results.push(group); @@ -469,13 +456,24 @@ impl HummockManager { let mut versioning_guard = self.versioning.write().await; let versioning = versioning_guard.deref_mut(); // Validate parameters. - let parent_group = versioning + if !versioning .current_version .levels - .get(&parent_group_id) - .ok_or_else(|| Error::CompactionGroup(format!("invalid group {}", parent_group_id)))?; + .contains_key(&parent_group_id) + { + return Err(Error::CompactionGroup(format!( + "invalid group {}", + parent_group_id + ))); + } + for table_id in &table_ids { - if !parent_group.member_table_ids.contains(table_id) { + if !versioning + .current_version + .state_table_info + .compaction_group_member_table_ids(parent_group_id) + .contains(&TableId::new(*table_id)) + { return Err(Error::CompactionGroup(format!( "table {} doesn't in group {}", table_id, parent_group_id @@ -483,7 +481,13 @@ impl HummockManager { } } - if table_ids.len() == parent_group.member_table_ids.len() { + if table_ids.len() + == versioning + .current_version + .state_table_info + .compaction_group_member_table_ids(parent_group_id) + .len() + { return Err(Error::CompactionGroup(format!( "invalid split attempt for group {}: all member tables are moved", parent_group_id @@ -521,6 +525,8 @@ impl HummockManager { .clone(); config.split_weight_by_vnode = partition_vnode_count; + #[expect(deprecated)] + // fill the deprecated field with default value new_version_delta.group_deltas.insert( new_compaction_group_id, GroupDeltas { @@ -530,20 +536,8 @@ impl HummockManager { group_id: new_compaction_group_id, parent_group_id, new_sst_start_id, - table_ids: table_ids.to_vec(), - version: CompatibilityVersion::NoTrivialSplit as i32, - })), - }], - }, - ); - - new_version_delta.group_deltas.insert( - parent_group_id, - GroupDeltas { - group_deltas: vec![GroupDelta { - delta_type: Some(DeltaType::GroupMetaChange(GroupMetaChange { - table_ids_remove: table_ids.to_vec(), - ..Default::default() + table_ids: vec![], + version: CompatibilityVersion::NoMemberTableIds as i32, })), }], }, @@ -553,6 +547,27 @@ impl HummockManager { }; let (new_compaction_group_id, config) = new_group; + new_version_delta.with_latest_version(|version, new_version_delta| { + for table_id in &table_ids { + let table_id = TableId::new(*table_id); + let info = version + .state_table_info + .info() + .get(&table_id) + .expect("have check exist previously"); + assert!(new_version_delta + .state_table_info_delta + .insert( + table_id, + StateTableInfoDelta { + committed_epoch: info.committed_epoch, + safe_epoch: info.safe_epoch, + compaction_group_id: new_compaction_group_id, + } + ) + .is_none()); + } + }); { let mut compaction_group_manager = self.compaction_group_manager.write().await; let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); @@ -606,21 +621,26 @@ impl HummockManager { { let versioning_guard = self.versioning.read().await; let version = &versioning_guard.current_version; - for (group_id, group) in &version.levels { + for group_id in version.levels.keys() { let mut group_info = TableGroupInfo { group_id: *group_id, ..Default::default() }; - for table_id in &group.member_table_ids { + for table_id in version + .state_table_info + 
.compaction_group_member_table_ids(*group_id) + { let stats_size = versioning_guard .version_stats .table_stats - .get(table_id) + .get(&table_id.table_id) .map(|stats| stats.total_key_size + stats.total_value_size) .unwrap_or(0); let table_size = std::cmp::max(stats_size, 0) as u64; group_info.group_size += table_size; - group_info.table_statistic.insert(*table_id, table_size); + group_info + .table_statistic + .insert(table_id.table_id, table_size); } infos.push(group_info); } diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index 5b62e5f0694b7..8d4fcf89b33be 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -1378,14 +1378,20 @@ async fn test_split_compaction_group_on_commit() { ); assert_eq!( current_version - .get_compaction_group_levels(2) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(2) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![100] ); assert_eq!( current_version - .get_compaction_group_levels(3) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(3) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![101] ); } @@ -1511,14 +1517,21 @@ async fn test_split_compaction_group_on_demand_basic() { ); assert_eq!( current_version - .get_compaction_group_levels(2) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(2) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![102] ); assert_eq!( current_version - .get_compaction_group_levels(new_group_id) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(new_group_id) + .iter() + .map(|table_id| table_id.table_id) + .sorted() + .collect_vec(), vec![100, 101] ); } @@ -1575,14 +1588,20 @@ async fn test_split_compaction_group_on_demand_non_trivial() { ); assert_eq!( current_version - .get_compaction_group_levels(2) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(2) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![101] ); assert_eq!( current_version - .get_compaction_group_levels(new_group_id) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(new_group_id) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![100] ); } @@ -1698,14 +1717,21 @@ async fn test_split_compaction_group_trivial_expired() { assert!(new_group_id > StaticCompactionGroupId::End as u64); assert_eq!( current_version - .get_compaction_group_levels(2) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(2) + .iter() + .map(|table_id| table_id.table_id) + .sorted() + .collect_vec(), vec![101, 102] ); assert_eq!( current_version - .get_compaction_group_levels(new_group_id) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(new_group_id) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![100] ); @@ -2370,14 +2396,20 @@ async fn test_unregister_moved_table() { ); assert_eq!( current_version - .get_compaction_group_levels(2) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(2) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![101] ); assert_eq!( current_version - .get_compaction_group_levels(new_group_id) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(new_group_id) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![100] ); @@ -2391,8 +2423,11 @@ async fn 
test_unregister_moved_table() { ); assert_eq!( current_version - .get_compaction_group_levels(2) - .member_table_ids, + .state_table_info + .compaction_group_member_table_ids(2) + .iter() + .map(|table_id| table_id.table_id) + .collect_vec(), vec![101] ); } diff --git a/src/meta/src/hummock/manager/versioning.rs b/src/meta/src/hummock/manager/versioning.rs index 790ac6b54fef1..2e6b2512a8be0 100644 --- a/src/meta/src/hummock/manager/versioning.rs +++ b/src/meta/src/hummock/manager/versioning.rs @@ -332,7 +332,12 @@ pub(super) fn calc_new_write_limits( new_write_limits.insert( *id, WriteLimit { - table_ids: levels.member_table_ids.clone(), + table_ids: version + .state_table_info + .compaction_group_member_table_ids(*id) + .iter() + .map(|table_id| table_id.table_id) + .collect(), reason: write_limit_type.as_str(), }, ); diff --git a/src/meta/src/hummock/metrics_utils.rs b/src/meta/src/hummock/metrics_utils.rs index 19eeb24f67aa9..3779ff5b2be97 100644 --- a/src/meta/src/hummock/metrics_utils.rs +++ b/src/meta/src/hummock/metrics_utils.rs @@ -529,12 +529,17 @@ pub fn trigger_write_stop_stats( pub fn trigger_split_stat(metrics: &MetaMetrics, version: &HummockVersion) { let branched_ssts = version.build_branched_sst_info(); - for (compaction_group_id, group) in &version.levels { + for compaction_group_id in version.levels.keys() { let group_label = compaction_group_id.to_string(); metrics .state_table_count .with_label_values(&[&group_label]) - .set(group.member_table_ids.len() as _); + .set( + version + .state_table_info + .compaction_group_member_table_ids(*compaction_group_id) + .len() as _, + ); let branched_sst_count: usize = branched_ssts .values() diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index 3227c4ad3d34e..90f70a4f74f3a 100644 --- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -172,9 +172,10 @@ impl HummockMetaClient for MockHummockMetaClient { sync_result.uncommitted_ssts.iter().map(|sst| &sst.sst_info), &vec![epoch], version - .levels - .values() - .flat_map(|group| group.member_table_ids.iter().map(|table_id| (*table_id, 0))), + .state_table_info + .info() + .keys() + .map(|table_id| (table_id.table_id, 0)), ); self.hummock_manager .commit_epoch( diff --git a/src/storage/benches/bench_table_watermarks.rs b/src/storage/benches/bench_table_watermarks.rs index ce8980598f772..f1fdf66c2f895 100644 --- a/src/storage/benches/bench_table_watermarks.rs +++ b/src/storage/benches/bench_table_watermarks.rs @@ -25,6 +25,7 @@ use risingwave_common::buffer::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::test_epoch; +use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_hummock_sdk::table_watermark::{ TableWatermarks, TableWatermarksIndex, VnodeWatermark, WatermarkDirection, }; @@ -132,6 +133,7 @@ fn gen_version( StateTableInfoDelta { committed_epoch, safe_epoch: test_epoch(old_epoch_idx as _), + compaction_group_id: StaticCompactionGroupId::StateDefault as _, }, ) }) diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index 2231878dc9ef6..102e364b8d00b 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -496,18 +496,40 @@ impl 
HummockVersion { ); let parent_group_id = group_construct.parent_group_id; new_levels.parent_group_id = parent_group_id; + #[expect(deprecated)] + // for backward-compatibility of previous hummock version delta new_levels .member_table_ids .clone_from(&group_construct.table_ids); self.levels.insert(*compaction_group_id, new_levels); + let member_table_ids = + if group_construct.version >= CompatibilityVersion::NoMemberTableIds as _ { + self.state_table_info + .compaction_group_member_table_ids(*compaction_group_id) + .iter() + .map(|table_id| table_id.table_id) + .collect() + } else { + #[expect(deprecated)] + // for backward-compatibility of previous hummock version delta + HashSet::from_iter(group_construct.table_ids.clone()) + }; + self.init_with_parent_group( parent_group_id, *compaction_group_id, - HashSet::from_iter(group_construct.table_ids.clone()), + member_table_ids, group_construct.get_new_sst_start_id(), - group_construct.version() == CompatibilityVersion::VersionUnspecified, + group_construct.version < CompatibilityVersion::NoTrivialSplit as _, ); } else if let Some(group_change) = &summary.group_table_change { + // TODO: may deprecate this branch? This enum variant is not created anywhere + assert!( + group_change.version <= CompatibilityVersion::NoTrivialSplit as _, + "DeltaType::GroupTableChange is not used anymore after CompatibilityVersion::NoMemberTableIds is added" + ); + #[expect(deprecated)] + // for backward-compatibility of previous hummock version delta self.init_with_parent_group( group_change.origin_group_id, group_change.target_group_id, @@ -520,10 +542,14 @@ impl HummockVersion { .levels .get_mut(&group_change.origin_group_id) .expect("compaction group should exist"); + #[expect(deprecated)] + // for backward-compatibility of previous hummock version delta let mut moving_tables = levels .member_table_ids .extract_if(|t| group_change.table_ids.contains(t)) .collect_vec(); + #[expect(deprecated)] + // for backward-compatibility of previous hummock version delta self.levels .get_mut(compaction_group_id) .expect("compaction group should exist") @@ -536,6 +562,7 @@ impl HummockVersion { .get_mut(compaction_group_id) .expect("compaction group should exist"); + #[expect(deprecated)] // for backward-compatibility of previous hummock version delta for group_meta_delta in &summary.group_meta_changes { levels .member_table_ids @@ -583,7 +610,11 @@ impl HummockVersion { } } else { // `max_committed_epoch` is not changed. The delta is caused by compaction. 
- levels.apply_compact_ssts(summary); + levels.apply_compact_ssts( + summary, + self.state_table_info + .compaction_group_member_table_ids(*compaction_group_id), + ); } if has_destroy { self.levels.remove(compaction_group_id); } @@ -693,16 +724,6 @@ impl HummockVersion { } } - pub fn build_compaction_group_info(&self) -> HashMap<TableId, CompactionGroupId> { - let mut ret = HashMap::new(); - for (compaction_group_id, levels) in &self.levels { - for table_id in &levels.member_table_ids { - ret.insert(TableId::new(*table_id), *compaction_group_id); - } - } - ret - } - pub fn build_branched_sst_info(&self) -> BTreeMap<HummockSstableObjectId, BranchedSstInfo> { let mut ret: BTreeMap<_, _> = BTreeMap::new(); for (compaction_group_id, group) in &self.levels { @@ -757,7 +778,11 @@ impl Levels { .sum() } - pub fn apply_compact_ssts(&mut self, summary: GroupDeltasSummary) { + pub fn apply_compact_ssts( + &mut self, + summary: GroupDeltasSummary, + member_table_ids: &BTreeSet<TableId>, + ) { let GroupDeltasSummary { delete_sst_levels, delete_sst_ids_set, @@ -810,9 +835,11 @@ impl Levels { insert_sub_level_id, delete_sst_ids_set, l0.sub_levels.iter().map(|level| level.sub_level_id).collect_vec() ); if l0.sub_levels[index].table_infos.is_empty() - && self.member_table_ids.len() == 1 + && member_table_ids.len() == 1 && insert_table_infos.iter().all(|sst| { - sst.table_ids.len() == 1 && sst.table_ids[0] == self.member_table_ids[0] + sst.table_ids.len() == 1 + && sst.table_ids[0] + == member_table_ids.iter().next().expect("non-empty").table_id }) { // Only change vnode_partition_count for group which has only one state-table. @@ -830,7 +857,7 @@ impl Levels { self.levels[idx].vnode_partition_count = new_vnode_partition_count; } else if self.levels[idx].vnode_partition_count != 0 && new_vnode_partition_count == 0 - && self.member_table_ids.len() > 1 + && member_table_ids.len() > 1 { self.levels[idx].vnode_partition_count = 0; } @@ -901,6 +928,7 @@ pub fn build_initial_compaction_group_levels( vnode_partition_count: 0, }); } + #[expect(deprecated)] // for backward-compatibility of previous hummock version delta Levels { levels, l0: Some(OverlappingLevel { @@ -952,18 +980,6 @@ fn split_sst_info_for_level( insert_table_infos } -pub fn try_get_compaction_group_id_by_table_id( - version: &HummockVersion, - table_id: StateTableId, -) -> Option<CompactionGroupId> { - for (group_id, levels) in &version.levels { - if levels.member_table_ids.contains(&table_id) { - return Some(*group_id); - } - } - None -} - /// Gets all compaction group ids. pub fn get_compaction_group_ids( version: &HummockVersion, @@ -974,9 +990,10 @@ /// Gets all member table ids.
pub fn get_member_table_ids(version: &HummockVersion) -> HashSet<StateTableId> { version - .levels - .iter() - .flat_map(|(_, levels)| levels.member_table_ids.clone()) + .state_table_info + .info() + .keys() + .map(|table_id| table_id.table_id) .collect() } @@ -984,14 +1001,10 @@ pub fn get_table_compaction_group_id_mapping( version: &HummockVersion, ) -> HashMap<StateTableId, CompactionGroupId> { version - .levels + .state_table_info + .info() .iter() - .flat_map(|(group_id, levels)| { - levels - .member_table_ids - .iter() - .map(|table_id| (*table_id, *group_id)) - }) + .map(|(table_id, info)| (table_id.table_id, info.compaction_group_id)) .collect() } @@ -1209,7 +1222,6 @@ pub fn validate_version(version: &HummockVersion) -> Vec<String> { )); } - let mut table_to_group = HashMap::new(); // Ensure each table maps to only one compaction group for (group_id, levels) in &version.levels { // Ensure compaction group id matches @@ -1220,31 +1232,6 @@ pub fn validate_version(version: &HummockVersion) -> Vec<String> { )); } - // Ensure table id is sorted - if !levels.member_table_ids.is_sorted() { - res.push(format!( - "GROUP {}: memtable_table_ids is not sorted: {:?}", - group_id, levels.member_table_ids - )); - } - - // Ensure table id is unique - for table_id in &levels.member_table_ids { - match table_to_group.entry(table_id) { - Entry::Occupied(e) => { - res.push(format!( - "GROUP {}: Duplicated table_id {}. First found in group {}", - group_id, - table_id, - e.get() - )); - } - Entry::Vacant(e) => { - e.insert(group_id); - } - } - } - let validate_level = |group: CompactionGroupId, expected_level_idx: u32, level: &Level, diff --git a/src/storage/hummock_sdk/src/lib.rs b/src/storage/hummock_sdk/src/lib.rs index 06c9a358883f4..97e1a334dcf98 100644 --- a/src/storage/hummock_sdk/src/lib.rs +++ b/src/storage/hummock_sdk/src/lib.rs @@ -22,6 +22,7 @@ #![feature(is_sorted)] #![feature(let_chains)] #![feature(btree_cursors)] +#![feature(lazy_cell)] mod key_cmp; use std::cmp::Ordering; diff --git a/src/storage/hummock_sdk/src/table_stats.rs b/src/storage/hummock_sdk/src/table_stats.rs index 6f6bb9b9e3d40..4fce0e0f048ae 100644 --- a/src/storage/hummock_sdk/src/table_stats.rs +++ b/src/storage/hummock_sdk/src/table_stats.rs @@ -13,8 +13,9 @@ // limitations under the License. use std::borrow::Borrow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; +use risingwave_common::catalog::TableId; use risingwave_pb::hummock::PbTableStats; use crate::version::HummockVersion; @@ -106,11 +107,12 @@ pub fn purge_prost_table_stats( table_stats: &mut PbTableStatsMap, hummock_version: &HummockVersion, ) -> bool { - let mut all_tables_in_version: HashSet<u32> = HashSet::default(); let prev_count = table_stats.len(); - for group in hummock_version.levels.values() { - all_tables_in_version.extend(group.member_table_ids.clone()); - } - table_stats.retain(|k, _| all_tables_in_version.contains(k)); + table_stats.retain(|table_id, _| { + hummock_version + .state_table_info + .info() + .contains_key(&TableId::new(*table_id)) + }); prev_count != table_stats.len() } diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index b30c1ba0e5191..0315ee9b8f298 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -13,9 +13,10 @@ // limitations under the License.
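// A minimal standalone sketch (not taken from this patch) of the bookkeeping the
// version.rs changes below introduce: `state_table_info` records each table's
// compaction group, and an in-memory reverse index maps each group to its member
// table set. Simplified u32/u64 ids are assumed here in place of the crate's
// `TableId`/`CompactionGroupId` types.
use std::collections::{BTreeSet, HashMap};

fn build_member_index(table_to_group: &HashMap<u32, u64>) -> HashMap<u64, BTreeSet<u32>> {
    let mut index: HashMap<u64, BTreeSet<u32>> = HashMap::new();
    for (table_id, group_id) in table_to_group {
        // Each table belongs to exactly one group, so every insert must be fresh.
        assert!(index.entry(*group_id).or_default().insert(*table_id));
    }
    index
}

fn main() {
    let table_to_group = HashMap::from([(100u32, 2u64), (101, 2), (102, 3)]);
    let index = build_member_index(&table_to_group);
    assert_eq!(index[&2], BTreeSet::from([100, 101]));
    assert_eq!(index[&3], BTreeSet::from([102]));
}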
use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::mem::{replace, size_of}; -use std::sync::Arc; +use std::ops::Deref; +use std::sync::{Arc, LazyLock}; use prost::Message; use risingwave_common::catalog::TableId; @@ -39,13 +40,37 @@ use crate::{CompactionGroupId, HummockSstableObjectId, HummockVersionId, FIRST_V #[derive(Debug, Clone, PartialEq)] pub struct HummockVersionStateTableInfo { state_table_info: HashMap<TableId, StateTableInfo>, + + // in memory index + compaction_group_member_tables: HashMap<CompactionGroupId, BTreeSet<TableId>>, } impl HummockVersionStateTableInfo { pub fn empty() -> Self { Self { state_table_info: HashMap::new(), + compaction_group_member_tables: HashMap::new(), + } + } + + fn build_compaction_group_member_tables( + state_table_info: &HashMap<TableId, StateTableInfo>, + ) -> HashMap<CompactionGroupId, BTreeSet<TableId>> { + let mut ret: HashMap<_, BTreeSet<_>> = HashMap::new(); + for (table_id, info) in state_table_info { + assert!(ret + .entry(info.compaction_group_id) + .or_default() + .insert(*table_id)); } + ret + } + + pub fn build_table_compaction_group_id(&self) -> HashMap<TableId, CompactionGroupId> { + self.state_table_info + .iter() + .map(|(table_id, info)| (*table_id, info.compaction_group_id)) + .collect() } pub fn from_protobuf(state_table_info: &HashMap<u32, PbStateTableInfo>) -> Self { .iter() .map(|(table_id, info)| (TableId::new(*table_id), info.clone())) .collect(); - Self { state_table_info } + let compaction_group_member_tables = + Self::build_compaction_group_member_tables(&state_table_info); + Self { + state_table_info, + compaction_group_member_tables, + } } pub fn to_protobuf(&self) -> HashMap<u32, PbStateTableInfo> { @@ -69,8 +99,28 @@ impl HummockVersionStateTableInfo { removed_table_id: &HashSet<TableId>, ) -> HashMap<TableId, Option<StateTableInfo>> { let mut changed_table = HashMap::new(); + fn remove_table_from_compaction_group( + compaction_group_member_tables: &mut HashMap<CompactionGroupId, BTreeSet<TableId>>, + compaction_group_id: CompactionGroupId, + table_id: TableId, + ) { + let member_tables = compaction_group_member_tables + .get_mut(&compaction_group_id) + .expect("should exist"); + assert!(member_tables.remove(&table_id)); + if member_tables.is_empty() { + assert!(compaction_group_member_tables + .remove(&compaction_group_id) + .is_some()); + } + } for table_id in removed_table_id { if let Some(prev_info) = self.state_table_info.remove(table_id) { + remove_table_from_compaction_group( + &mut self.compaction_group_member_tables, + prev_info.compaction_group_id, + *table_id, + ); assert!(changed_table.insert(*table_id, Some(prev_info)).is_none()); } else { warn!( @@ -86,6 +136,7 @@ impl HummockVersionStateTableInfo { let new_info = StateTableInfo { committed_epoch: delta.committed_epoch, safe_epoch: delta.safe_epoch, + compaction_group_id: delta.compaction_group_id, }; match self.state_table_info.entry(*table_id) { Entry::Occupied(mut entry) => { @@ -98,21 +149,57 @@ impl HummockVersionStateTableInfo { prev_info, new_info ); + if prev_info.compaction_group_id != new_info.compaction_group_id { + // table moved to another compaction group + remove_table_from_compaction_group( + &mut self.compaction_group_member_tables, + prev_info.compaction_group_id, + *table_id, + ); + assert!(self + .compaction_group_member_tables + .entry(new_info.compaction_group_id) + .or_default() + .insert(*table_id)); + } let prev_info = replace(prev_info, new_info); changed_table.insert(*table_id, Some(prev_info)); } Entry::Vacant(entry) => { + assert!(self + .compaction_group_member_tables + .entry(new_info.compaction_group_id) + .or_default() + .insert(*table_id));
entry.insert(new_info); changed_table.insert(*table_id, None); } } } + debug_assert_eq!( + self.compaction_group_member_tables, + Self::build_compaction_group_member_tables(&self.state_table_info) + ); changed_table } pub fn info(&self) -> &HashMap<TableId, StateTableInfo> { &self.state_table_info } + + pub fn compaction_group_member_table_ids( + &self, + compaction_group_id: CompactionGroupId, + ) -> &BTreeSet<TableId> { + static EMPTY_SET: LazyLock<BTreeSet<TableId>> = LazyLock::new(BTreeSet::new); + self.compaction_group_member_tables + .get(&compaction_group_id) + .unwrap_or_else(|| EMPTY_SET.deref()) + } + + pub fn compaction_group_member_tables(&self) -> &HashMap<CompactionGroupId, BTreeSet<TableId>> { + &self.compaction_group_member_tables + } } #[derive(Debug, Clone, PartialEq)] @@ -225,18 +312,21 @@ impl HummockVersion { } pub fn need_fill_backward_compatible_state_table_info_delta(&self) -> bool { - // state_table_info is not previously filled, but there previously exists some tables + // for backward-compatibility of previous hummock version delta self.state_table_info.state_table_info.is_empty() - && self - .levels - .values() - .any(|group| !group.member_table_ids.is_empty()) + && self.levels.values().any(|group| { + // state_table_info is not previously filled, but there previously exists some tables + #[expect(deprecated)] + !group.member_table_ids.is_empty() + }) } pub fn may_fill_backward_compatible_state_table_info_delta( &self, delta: &mut HummockVersionDelta, ) { + #[expect(deprecated)] + // for backward-compatibility of previous hummock version delta for (cg_id, group) in &self.levels { for table_id in &group.member_table_ids { assert!( delta .state_table_info_delta .insert( *table_id, StateTableInfoDelta { committed_epoch: self.max_committed_epoch, safe_epoch: self.safe_epoch, + compaction_group_id: *cg_id, } ) .is_none(), diff --git a/src/storage/src/hummock/local_version/pinned_version.rs b/src/storage/src/hummock/local_version/pinned_version.rs index 6302f91739c20..5e89a135d825c 100644 --- a/src/storage/src/hummock/local_version/pinned_version.rs +++ b/src/storage/src/hummock/local_version/pinned_version.rs @@ -85,7 +85,7 @@ impl PinnedVersion { pinned_version_manager_tx: UnboundedSender<PinVersionAction>, ) -> Self { let version_id = version.id; - let compaction_group_index = version.build_compaction_group_info(); + let compaction_group_index = version.state_table_info.build_table_compaction_group_id(); PinnedVersion { version: Arc::new(version), @@ -109,7 +109,7 @@ impl PinnedVersion { self.version.id ); let version_id = version.id; - let compaction_group_index = version.build_compaction_group_info(); + let compaction_group_index = version.state_table_info.build_table_compaction_group_id(); PinnedVersion { version: Arc::new(version), @@ -135,9 +135,10 @@ impl PinnedVersion { pub fn levels(&self, table_id: TableId) -> impl Iterator<Item = &Level> { #[auto_enum(Iterator)] - match self.compaction_group_index.get(&table_id) { - Some(compaction_group_id) => { - let levels = self.levels_by_compaction_groups_id(*compaction_group_id); + match self.version.state_table_info.info().get(&table_id) { + Some(info) => { + let compaction_group_id = info.compaction_group_id; + let levels = self.levels_by_compaction_groups_id(compaction_group_id); levels .l0 .as_ref() From 5f41f27f7992b9cb90bbb5c5a867980c59598974 Mon Sep 17 00:00:00 2001 From: xxchan Date: Thu, 13 Jun 2024 17:48:52 +0800 Subject: [PATCH 16/19] ci: make stale bot more kind (#17230) Signed-off-by: xxchan --- .github/workflows/stale.yml | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git
a/.github/workflows/stale.yml b/.github/workflows/stale.yml index cdd8b591d65c8..765befde5a752 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -24,10 +24,26 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-issue-message: > This issue has been open for 60 days with no activity. - Could you please update the status? Feel free to continue discussion or close as not planned. + + If you think it is still relevant today, and needs to be done *in the near future*, you can comment to update the status, or just manually remove the `no-issue-activity` label. + + You can also confidently close this issue as not planned to keep our backlog clean. + Don't worry if you think the issue is still valuable to continue in the future. + It's searchable and can be reopened when it's time. 😄 stale-pr-message: > This PR has been open for 60 days with no activity. - Could you please update the status? Feel free to ping a reviewer if you are waiting for review. + + If it's blocked by code review, feel free to ping a reviewer or ask someone else to review it. + + If you think it is still relevant today, and have time to work on it *in the near future*, you can comment to update the status, or just manually remove the `no-pr-activity` label. + + You can also confidently close this PR to keep our backlog clean. (If no further action is taken, the PR will be automatically closed after 7 days. Sorry! 🙏) + Don't worry if you think the PR is still valuable to continue in the future. + It's searchable and can be reopened when it's time. 😄 + close-pr-message: > + Close this PR, as no further action has been taken since it was marked as stale 7 days ago. Sorry! 🙏 + + You can reopen it when you have time to continue working on it. stale-issue-label: 'no-issue-activity' stale-pr-label: 'no-pr-activity' days-before-close: -1 From 679d90ec09dc3e731245c209908bf626b268343f Mon Sep 17 00:00:00 2001 From: Richard Chien Date: Thu, 13 Jun 2024 18:04:13 +0800 Subject: [PATCH 17/19] feat(over window): frontend support for session frame (#17098) Signed-off-by: Richard Chien --- e2e_test/batch/over_window/main.slt.part | 1 + .../batch/over_window/session/mod.slt.part | 61 ++++++++++ e2e_test/streaming/eowc/eowc_over_window.slt | 26 ++++ .../testdata/input/over_window_function.yaml | 50 ++++++++ .../testdata/output/over_window_function.yaml | 112 ++++++++++++++++++ src/frontend/src/binder/expr/function.rs | 78 +++++++----- .../plan_node/logical_over_window.rs | 11 ++ src/sqlparser/src/ast/mod.rs | 41 +++++-- src/sqlparser/src/keywords.rs | 1 + src/sqlparser/src/parser.rs | 24 ++-- 10 files changed, 352 insertions(+), 53 deletions(-) create mode 100644 e2e_test/batch/over_window/session/mod.slt.part diff --git a/e2e_test/batch/over_window/main.slt.part b/e2e_test/batch/over_window/main.slt.part index dca370c0306ee..9ef0a7572380d 100644 --- a/e2e_test/batch/over_window/main.slt.part +++ b/e2e_test/batch/over_window/main.slt.part @@ -1 +1,2 @@ include ./generated/main.slt.part +include ./session/mod.slt.part diff --git a/e2e_test/batch/over_window/session/mod.slt.part b/e2e_test/batch/over_window/session/mod.slt.part new file mode 100644 index 0000000000000..eb0e09675cff7 --- /dev/null +++ b/e2e_test/batch/over_window/session/mod.slt.part @@ -0,0 +1,61 @@ +# Because the general streaming version of session window is not supported yet, +# we only add e2e tests for batch mode.
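+#
+# As a rough sketch of the semantics exercised below (using the `session with gap`
+# frame this patch introduces): within each partition, consecutive rows stay in the
+# same session while their `tm` values are at most the gap apart, so
+#   first_value(tm) over (partition by bar order by tm session with gap '10 minutes')
+# evaluates to the starting timestamp of the session each row falls into.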
+ +statement ok +create table t ( + tm timestamp, + foo int, + bar int +); + +statement ok +create view v1 as +select + *, + first_value(tm) over (partition by bar order by tm session with gap '10 minutes') as window_start, + last_value(tm) over (partition by bar order by tm session with gap '10 minutes') as window_end +from t; + +statement ok +insert into t values + ('2023-05-06 16:51:00', 1, 100) +, ('2023-05-06 16:56:00', 8, 100) +, ('2023-05-06 17:30:00', 3, 200) +, ('2023-05-06 17:35:00', 5, 100) +, ('2023-05-06 17:59:00', 4, 100) +, ('2023-05-06 18:01:00', 6, 200) +; + +query TiiTT +select * from v1 order by tm; +---- +2023-05-06 16:51:00 1 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 16:56:00 8 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 17:30:00 3 200 2023-05-06 17:30:00 2023-05-06 17:30:00 +2023-05-06 17:35:00 5 100 2023-05-06 17:35:00 2023-05-06 17:35:00 +2023-05-06 17:59:00 4 100 2023-05-06 17:59:00 2023-05-06 17:59:00 +2023-05-06 18:01:00 6 200 2023-05-06 18:01:00 2023-05-06 18:01:00 + +statement ok +insert into t values + ('2023-05-06 18:08:00', 7, 100) +, ('2023-05-06 18:10:00', 9, 200) +; + +query TiiTT +select * from v1 order by tm; +---- +2023-05-06 16:51:00 1 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 16:56:00 8 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 17:30:00 3 200 2023-05-06 17:30:00 2023-05-06 17:30:00 +2023-05-06 17:35:00 5 100 2023-05-06 17:35:00 2023-05-06 17:35:00 +2023-05-06 17:59:00 4 100 2023-05-06 17:59:00 2023-05-06 18:08:00 +2023-05-06 18:01:00 6 200 2023-05-06 18:01:00 2023-05-06 18:10:00 +2023-05-06 18:08:00 7 100 2023-05-06 17:59:00 2023-05-06 18:08:00 +2023-05-06 18:10:00 9 200 2023-05-06 18:01:00 2023-05-06 18:10:00 + +statement ok +drop view v1; + +statement ok +drop table t; diff --git a/e2e_test/streaming/eowc/eowc_over_window.slt b/e2e_test/streaming/eowc/eowc_over_window.slt index fe2570e93b6df..e762d54743e7f 100644 --- a/e2e_test/streaming/eowc/eowc_over_window.slt +++ b/e2e_test/streaming/eowc/eowc_over_window.slt @@ -42,6 +42,15 @@ select from t emit on window close; +statement ok +create materialized view mv4 as +select + *, + first_value(tm) over (partition by bar order by tm session with gap '10 minutes') as window_start, + last_value(tm) over (partition by bar order by tm session with gap '10 minutes') as window_end +from t +emit on window close; + statement ok insert into t values ('2023-05-06 16:51:00', 1, 100) @@ -71,6 +80,12 @@ select * from mv3 order by tm; 2023-05-06 17:30:00 3 200 1 2023-05-06 17:35:00 5 100 3 +query TiiTT +select * from mv4 order by tm; +---- +2023-05-06 16:51:00 1 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 16:56:00 8 100 2023-05-06 16:51:00 2023-05-06 16:56:00 + statement ok insert into t values ('2023-05-06 18:10:00', 7, 100) @@ -100,6 +115,14 @@ select * from mv3 order by tm; 2023-05-06 17:59:00 4 100 4 2023-05-06 18:01:00 6 200 2 +query TiiTT +select * from mv4 order by tm; +---- +2023-05-06 16:51:00 1 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 16:56:00 8 100 2023-05-06 16:51:00 2023-05-06 16:56:00 +2023-05-06 17:30:00 3 200 2023-05-06 17:30:00 2023-05-06 17:30:00 +2023-05-06 17:35:00 5 100 2023-05-06 17:35:00 2023-05-06 17:35:00 + statement ok drop materialized view mv1; @@ -109,5 +132,8 @@ drop materialized view mv2; statement ok drop materialized view mv3; +statement ok +drop materialized view mv4; + statement ok drop table t; diff --git a/src/frontend/planner_test/tests/testdata/input/over_window_function.yaml 
b/src/frontend/planner_test/tests/testdata/input/over_window_function.yaml index 4406089b273b8..09f9c1aa6ecc4 100644 --- a/src/frontend/planner_test/tests/testdata/input/over_window_function.yaml +++ b/src/frontend/planner_test/tests/testdata/input/over_window_function.yaml @@ -591,3 +591,53 @@ from t; expected_outputs: - binder_error + +# Session frames +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + first_value(i) over (partition by bi order by i session with gap 10) as col1, + first_value(bi) over (partition by i order by bi session with gap 10) as col2, + first_value(i) over (partition by bi order by d session with gap 1.5) as col3, + first_value(i) over (partition by bi order by f session with gap 1.5) as col4, + -- first_value(i) over (partition by bi order by da session with gap '1 day') as col5, -- `date` not supported yet + -- first_value(i) over (partition by bi order by t session with gap '1 min') as col6, -- `time` not supported yet + first_value(i) over (partition by bi order by ts session with gap '1 day 1 hour') as col7, + first_value(i) over (partition by bi order by tstz session with gap '1 min') as col8 + from t; + expected_outputs: + - logical_plan + - optimized_logical_plan_for_stream + - stream_error # not supported yet + - batch_plan +- sql: | + create table t (i int, bi bigint, ts timestamp, watermark for ts as ts - interval '1 minute') append only; + select + first_value(i) over (partition by bi order by ts session with gap '10 minutes') as window_start, + last_value(i) over (partition by bi order by ts session with gap '10 minutes') as window_end + from t; + expected_outputs: + - logical_plan + - eowc_stream_plan + - batch_plan +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + count(*) over (partition by 1::int order by da session with gap '1 day') -- `date` not supported yet + from t; + expected_outputs: + - binder_error +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + count(*) over (partition by 1::int order by t session with gap '1 min') -- `time` not supported yet + from t; + expected_outputs: + - binder_error +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + count(*) over (partition by 1::int order by tstz session with gap '1 day 1 hour') -- `timestamptz` +/- 'x month x day' not supported yet + from t; + expected_outputs: + - binder_error diff --git a/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml b/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml index 14d6d59885b40..3449606ad97ba 100644 --- a/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml +++ b/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml @@ -1235,3 +1235,115 @@ Caused by these errors (recent errors listed first): 1: Expr error 2: for frame order column of type `timestamptz`, offset should not have non-zero `month` and `day` +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + first_value(i) over (partition by bi order by i session with gap 10) as col1, + first_value(bi) over (partition by i order by bi session with gap 10) as col2, + first_value(i) over 
(partition by bi order by d session with gap 1.5) as col3, + first_value(i) over (partition by bi order by f session with gap 1.5) as col4, + -- first_value(i) over (partition by bi order by da session with gap '1 day') as col5, -- `date` not supported yet + -- first_value(i) over (partition by bi order by t session with gap '1 min') as col6, -- `time` not supported yet + first_value(i) over (partition by bi order by ts session with gap '1 day 1 hour') as col7, + first_value(i) over (partition by bi order by tstz session with gap '1 min') as col8 + from t; + logical_plan: |- + LogicalProject { exprs: [first_value, first_value, first_value, first_value, first_value, first_value] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.i ASC SESSION WITH GAP 10), first_value(t.bi) OVER(PARTITION BY t.i ORDER BY t.bi ASC SESSION WITH GAP 10), first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.d ASC SESSION WITH GAP 1.5), first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.f ASC SESSION WITH GAP 1.5), first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 1 day 01:00:00), first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.tstz ASC SESSION WITH GAP 00:01:00)] } + └─LogicalProject { exprs: [t.i, t.bi, t.d, t.f, t.da, t.t, t.ts, t.tstz, t.itv, t._row_id] } + └─LogicalScan { table: t, columns: [t.i, t.bi, t.d, t.f, t.da, t.t, t.ts, t.tstz, t.itv, t._row_id] } + optimized_logical_plan_for_stream: |- + LogicalProject { exprs: [first_value, first_value, first_value, first_value, first_value, first_value] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.tstz ASC SESSION WITH GAP 00:01:00)] } + └─LogicalProject { exprs: [t.i, t.bi, t.tstz, first_value, first_value, first_value, first_value, first_value] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 1 day 01:00:00)] } + └─LogicalProject { exprs: [t.i, t.bi, t.ts, t.tstz, first_value, first_value, first_value, first_value] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.f ASC SESSION WITH GAP 1.5)] } + └─LogicalProject { exprs: [t.i, t.bi, t.f, t.ts, t.tstz, first_value, first_value, first_value] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.d ASC SESSION WITH GAP 1.5)] } + └─LogicalOverWindow { window_functions: [first_value(t.bi) OVER(PARTITION BY t.i ORDER BY t.bi ASC SESSION WITH GAP 10)] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.i ASC SESSION WITH GAP 10)] } + └─LogicalScan { table: t, columns: [t.i, t.bi, t.d, t.f, t.ts, t.tstz] } + batch_plan: |- + BatchExchange { order: [], dist: Single } + └─BatchProject { exprs: [first_value, first_value, first_value, first_value, first_value, first_value] } + └─BatchOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.tstz ASC SESSION WITH GAP 00:01:00)] } + └─BatchSort { order: [t.bi ASC, t.tstz ASC] } + └─BatchProject { exprs: [t.i, t.bi, t.tstz, first_value, first_value, first_value, first_value, first_value] } + └─BatchOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 1 day 01:00:00)] } + └─BatchSort { order: [t.bi ASC, t.ts ASC] } + └─BatchProject { exprs: [t.i, t.bi, t.ts, t.tstz, first_value, first_value, first_value, first_value] } + └─BatchOverWindow { window_functions: [first_value(t.i) 
OVER(PARTITION BY t.bi ORDER BY t.f ASC SESSION WITH GAP 1.5)] } + └─BatchSort { order: [t.bi ASC, t.f ASC] } + └─BatchProject { exprs: [t.i, t.bi, t.f, t.ts, t.tstz, first_value, first_value, first_value] } + └─BatchOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.d ASC SESSION WITH GAP 1.5)] } + └─BatchExchange { order: [t.bi ASC, t.d ASC], dist: HashShard(t.bi) } + └─BatchSort { order: [t.bi ASC, t.d ASC] } + └─BatchOverWindow { window_functions: [first_value(t.bi) OVER(PARTITION BY t.i ORDER BY t.bi ASC SESSION WITH GAP 10)] } + └─BatchExchange { order: [t.i ASC, t.bi ASC], dist: HashShard(t.i) } + └─BatchSort { order: [t.i ASC, t.bi ASC] } + └─BatchOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.i ASC SESSION WITH GAP 10)] } + └─BatchExchange { order: [t.bi ASC, t.i ASC], dist: HashShard(t.bi) } + └─BatchSort { order: [t.bi ASC, t.i ASC] } + └─BatchScan { table: t, columns: [t.i, t.bi, t.d, t.f, t.ts, t.tstz], distribution: SomeShard } + stream_error: |- + Feature is not yet implemented: Session frame is not yet supported in general streaming mode. Please consider using Emit-On-Window-Close mode. + No tracking issue yet. Feel free to submit a feature request at https://github.com/risingwavelabs/risingwave/issues/new?labels=type%2Ffeature&template=feature_request.yml +- sql: | + create table t (i int, bi bigint, ts timestamp, watermark for ts as ts - interval '1 minute') append only; + select + first_value(i) over (partition by bi order by ts session with gap '10 minutes') as window_start, + last_value(i) over (partition by bi order by ts session with gap '10 minutes') as window_end + from t; + logical_plan: |- + LogicalProject { exprs: [first_value, last_value] } + └─LogicalOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 00:10:00), last_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 00:10:00)] } + └─LogicalProject { exprs: [t.i, t.bi, t.ts, t._row_id] } + └─LogicalScan { table: t, columns: [t.i, t.bi, t.ts, t._row_id] } + batch_plan: |- + BatchExchange { order: [], dist: Single } + └─BatchProject { exprs: [first_value, last_value] } + └─BatchOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 00:10:00), last_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 00:10:00)] } + └─BatchExchange { order: [t.bi ASC, t.ts ASC], dist: HashShard(t.bi) } + └─BatchSort { order: [t.bi ASC, t.ts ASC] } + └─BatchScan { table: t, columns: [t.i, t.bi, t.ts], distribution: SomeShard } + eowc_stream_plan: |- + StreamMaterialize { columns: [window_start, window_end, t._row_id(hidden), t.bi(hidden)], stream_key: [t._row_id, t.bi], pk_columns: [t._row_id, t.bi], pk_conflict: NoCheck } + └─StreamProject { exprs: [first_value, last_value, t._row_id, t.bi] } + └─StreamEowcOverWindow { window_functions: [first_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 00:10:00), last_value(t.i) OVER(PARTITION BY t.bi ORDER BY t.ts ASC SESSION WITH GAP 00:10:00)] } + └─StreamEowcSort { sort_column: t.ts } + └─StreamExchange { dist: HashShard(t.bi) } + └─StreamTableScan { table: t, columns: [t.i, t.bi, t.ts, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + count(*) 
over (partition by 1::int order by da session with gap '1 day') -- `date` not supported yet + from t; + binder_error: | + Failed to bind expression: count(*) OVER (PARTITION BY CAST(1 AS INT) ORDER BY da SESSION WITH GAP '1 day') + + Caused by: + Feature is not yet implemented: `SESSION` frame with offset of type `date` is not implemented yet, please manually cast the `ORDER BY` column to `timestamp` + No tracking issue yet. Feel free to submit a feature request at https://github.com/risingwavelabs/risingwave/issues/new?labels=type%2Ffeature&template=feature_request.yml +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + count(*) over (partition by 1::int order by t session with gap '1 min') -- `time` not supported yet + from t; + binder_error: | + Failed to bind expression: count(*) OVER (PARTITION BY CAST(1 AS INT) ORDER BY t SESSION WITH GAP '1 min') + + Caused by: + Feature is not yet implemented: `SESSION` frame with offset of type `time without time zone` is not implemented yet, please manually cast the `ORDER BY` column to `timestamp` + No tracking issue yet. Feel free to submit a feature request at https://github.com/risingwavelabs/risingwave/issues/new?labels=type%2Ffeature&template=feature_request.yml +- sql: | + create table t (i int, bi bigint, d decimal, f float, da date, t time, ts timestamp, tstz timestamptz, itv interval); + select + count(*) over (partition by 1::int order by tstz session with gap '1 day 1 hour') -- `timestamptz` +/- 'x month x day' not supported yet + from t; + binder_error: | + Failed to bind expression: count(*) OVER (PARTITION BY CAST(1 AS INT) ORDER BY tstz SESSION WITH GAP '1 day 1 hour') + + Caused by these errors (recent errors listed first): + 1: Expr error + 2: for session order column of type `timestamptz`, gap should not have non-zero `month` and `day` diff --git a/src/frontend/src/binder/expr/function.rs b/src/frontend/src/binder/expr/function.rs index 74162c35e9820..2a4812c2e1d1f 100644 --- a/src/frontend/src/binder/expr/function.rs +++ b/src/frontend/src/binder/expr/function.rs @@ -23,15 +23,15 @@ use risingwave_common::array::ListValue; use risingwave_common::catalog::{INFORMATION_SCHEMA_SCHEMA_NAME, PG_CATALOG_SCHEMA_NAME}; use risingwave_common::session_config::USER_NAME_WILD_CARD; use risingwave_common::types::{data_types, DataType, ScalarImpl, Timestamptz}; -use risingwave_common::{bail_not_implemented, current_cluster_version, no_function}; +use risingwave_common::{bail_not_implemented, current_cluster_version, must_match, no_function}; use risingwave_expr::aggregate::{agg_kinds, AggKind}; use risingwave_expr::window_function::{ Frame, FrameBound, FrameBounds, FrameExclusion, RangeFrameBounds, RangeFrameOffset, - RowsFrameBounds, WindowFuncKind, + RowsFrameBounds, SessionFrameBounds, SessionFrameGap, WindowFuncKind, }; use risingwave_sqlparser::ast::{ - self, Function, FunctionArg, FunctionArgExpr, Ident, WindowFrameBound, WindowFrameExclusion, - WindowFrameUnits, WindowSpec, + self, Function, FunctionArg, FunctionArgExpr, Ident, WindowFrameBound, WindowFrameBounds, + WindowFrameExclusion, WindowFrameUnits, WindowSpec, }; use risingwave_sqlparser::parser::ParserError; use thiserror_ext::AsReport; @@ -630,34 +630,35 @@ impl Binder { }; let bounds = match frame.units { WindowFrameUnits::Rows => { - let (start, end) = - self.bind_window_frame_usize_bounds(frame.start_bound, frame.end_bound)?; + let (start, end) = must_match!(frame.bounds, 
WindowFrameBounds::Bounds { start, end } => (start, end)); + let (start, end) = self.bind_window_frame_usize_bounds(start, end)?; FrameBounds::Rows(RowsFrameBounds { start, end }) } - WindowFrameUnits::Range => { + unit @ (WindowFrameUnits::Range | WindowFrameUnits::Session) => { let order_by_expr = order_by .sort_exprs .iter() - // for `RANGE` frame, there should be exactly one `ORDER BY` column + // for `RANGE | SESSION` frame, there should be exactly one `ORDER BY` column .exactly_one() .map_err(|_| { - ErrorCode::InvalidInputSyntax( - "there should be exactly one ordering column for `RANGE` frame" - .to_string(), - ) + ErrorCode::InvalidInputSyntax(format!( + "there should be exactly one ordering column for `{}` frame", + unit + )) })?; let order_data_type = order_by_expr.expr.return_type(); let order_type = order_by_expr.order_type; let offset_data_type = match &order_data_type { - // for numeric ordering columns, `offset` should be the same type + // for numeric ordering columns, `offset`/`gap` should be the same type // NOTE: actually in PG it can be a larger type, but we don't support this here t @ data_types::range_frame_numeric!() => t.clone(), - // for datetime ordering columns, `offset` should be interval + // for datetime ordering columns, `offset`/`gap` should be interval t @ data_types::range_frame_datetime!() => { if matches!(t, DataType::Date | DataType::Time) { bail_not_implemented!( - "`RANGE` frame with offset of type `{}` is not implemented yet, please manually cast the `ORDER BY` column to `timestamp`", + "`{}` frame with offset of type `{}` is not implemented yet, please manually cast the `ORDER BY` column to `timestamp`", + unit, t ); } @@ -667,8 +668,8 @@ impl Binder { t => { return Err(ErrorCode::NotSupported( format!( - "`RANGE` frame with offset of type `{}` is not supported", - t + "`{}` frame with offset of type `{}` is not supported", + unit, t ), "Please re-consider the `ORDER BY` column".to_string(), ) @@ -676,18 +677,31 @@ impl Binder { } }; - let (start, end) = self.bind_window_frame_scalar_impl_bounds( - frame.start_bound, - frame.end_bound, - &offset_data_type, - )?; - FrameBounds::Range(RangeFrameBounds { - order_data_type, - order_type, - offset_data_type, - start: start.map(RangeFrameOffset::new), - end: end.map(RangeFrameOffset::new), - }) + if unit == WindowFrameUnits::Range { + let (start, end) = must_match!(frame.bounds, WindowFrameBounds::Bounds { start, end } => (start, end)); + let (start, end) = self.bind_window_frame_scalar_impl_bounds( + start, + end, + &offset_data_type, + )?; + FrameBounds::Range(RangeFrameBounds { + order_data_type, + order_type, + offset_data_type, + start: start.map(RangeFrameOffset::new), + end: end.map(RangeFrameOffset::new), + }) + } else { + let gap = must_match!(frame.bounds, WindowFrameBounds::Gap(gap) => gap); + let gap_value = + self.bind_window_frame_bound_offset(*gap, offset_data_type.clone())?; + FrameBounds::Session(SessionFrameBounds { + order_data_type, + order_type, + gap_data_type: offset_data_type, + gap: SessionFrameGap::new(gap_value), + }) + } } WindowFrameUnits::Groups => { bail_not_implemented!( @@ -782,13 +796,13 @@ impl Binder { let mut offset = self.bind_expr(offset)?; if !offset.is_const() { return Err(ErrorCode::InvalidInputSyntax( - "offset in window frame bounds must be constant".to_string(), + "offset/gap in window frame bounds must be constant".to_string(), ) .into()); } if offset.cast_implicit_mut(cast_to.clone()).is_err() { return Err(ErrorCode::InvalidInputSyntax(format!( - "offset in 
window frame bounds must be castable to {}",
+                "offset/gap in window frame bounds must be castable to {}",
                 cast_to
             ))
             .into());
@@ -796,7 +810,7 @@
         let offset = offset.fold_const()?;
         let Some(offset) = offset else {
             return Err(ErrorCode::InvalidInputSyntax(
-                "offset in window frame bounds must not be NULL".to_string(),
+                "offset/gap in window frame bounds must not be NULL".to_string(),
             )
             .into());
         };
diff --git a/src/frontend/src/optimizer/plan_node/logical_over_window.rs b/src/frontend/src/optimizer/plan_node/logical_over_window.rs
index cf130b57bac94..bd38569397992 100644
--- a/src/frontend/src/optimizer/plan_node/logical_over_window.rs
+++ b/src/frontend/src/optimizer/plan_node/logical_over_window.rs
@@ -694,6 +694,17 @@ impl ToStream for LogicalOverWindow {
         } else {
             // General (Emit-On-Update) case

+            if self
+                .window_functions()
+                .iter()
+                .any(|f| f.frame.bounds.is_session())
+            {
+                bail_not_implemented!(
+                    "Session frame is not yet supported in general streaming mode. \
+                    Please consider using Emit-On-Window-Close mode."
+                );
+            }
+
             // TODO(rc): Let's not introduce too many cases at once. Later we may decide to support
             // empty PARTITION BY by simply removing the following check.
             let partition_key_indices = self.window_functions()[0]
diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs
index 37ff551b99511..49fddbfaa4b82 100644
--- a/src/sqlparser/src/ast/mod.rs
+++ b/src/sqlparser/src/ast/mod.rs
@@ -869,11 +869,7 @@ impl fmt::Display for WindowSpec {
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct WindowFrame {
     pub units: WindowFrameUnits,
-    pub start_bound: WindowFrameBound,
-    /// The right bound of the `BETWEEN .. AND` clause. The end bound of `None`
-    /// indicates the shorthand form (e.g. `ROWS 1 PRECEDING`), which must
-    /// behave the same as `end_bound = WindowFrameBound::CurrentRow`.
-    pub end_bound: Option<WindowFrameBound>,
+    pub bounds: WindowFrameBounds,
     pub exclusion: Option<WindowFrameExclusion>,
 }

@@ -883,18 +879,36 @@ pub enum WindowFrameUnits {
     Rows,
     Range,
     Groups,
+    Session,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+pub enum WindowFrameBounds {
+    Bounds {
+        start: WindowFrameBound,
+        /// The right bound of the `BETWEEN .. AND` clause. The end bound of `None`
+        /// indicates the shorthand form (e.g. `ROWS 1 PRECEDING`), which must
+        /// behave the same as `end_bound = WindowFrameBound::CurrentRow`.
+        end: Option<WindowFrameBound>,
+    },
+    Gap(Box<Expr>),
 }

 impl fmt::Display for WindowFrame {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        if let Some(end_bound) = &self.end_bound {
-            write!(
-                f,
-                "{} BETWEEN {} AND {}",
-                self.units, self.start_bound, end_bound
-            )
-        } else {
-            write!(f, "{} {}", self.units, self.start_bound)
+        write!(f, "{} ", self.units)?;
+        match &self.bounds {
+            WindowFrameBounds::Bounds { start, end } => {
+                if let Some(end) = end {
+                    write!(f, "BETWEEN {} AND {}", start, end)
+                } else {
+                    write!(f, "{}", start)
+                }
+            }
+            WindowFrameBounds::Gap(gap) => {
+                write!(f, "WITH GAP {}", gap)
+            }
         }
     }
 }
@@ -905,6 +919,7 @@ impl fmt::Display for WindowFrameUnits {
             WindowFrameUnits::Rows => "ROWS",
             WindowFrameUnits::Range => "RANGE",
             WindowFrameUnits::Groups => "GROUPS",
+            WindowFrameUnits::Session => "SESSION",
         })
     }
 }
diff --git a/src/sqlparser/src/keywords.rs b/src/sqlparser/src/keywords.rs
index c94ef26e993ba..fc5971bf4640b 100644
--- a/src/sqlparser/src/keywords.rs
+++ b/src/sqlparser/src/keywords.rs
@@ -248,6 +248,7 @@ define_keywords!(
     FUNCTION,
     FUNCTIONS,
     FUSION,
+    GAP,
     GENERATOR,
     GET,
     GLOBAL,
diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs
index 37cfa42dffd7c..dbc79542949d2 100644
--- a/src/sqlparser/src/parser.rs
+++ b/src/sqlparser/src/parser.rs
@@ -820,7 +820,7 @@ impl Parser<'_> {
         let distinct = self.parse_all_or_distinct()?;
         let (args, order_by, variadic) = self.parse_optional_args()?;
         let over = if self.parse_keyword(Keyword::OVER) {
-            // TBD: support window names (`OVER mywin`) in place of inline specification
+            // TODO: support window names (`OVER mywin`) in place of inline specification
             self.expect_token(&Token::LParen)?;
             let partition_by = if self.parse_keywords(&[Keyword::PARTITION, Keyword::BY]) {
                 // a list of possibly-qualified column names
@@ -887,6 +887,7 @@ impl Parser<'_> {
             Keyword::ROWS => keyword.value(WindowFrameUnits::Rows),
             Keyword::RANGE => keyword.value(WindowFrameUnits::Range),
             Keyword::GROUPS => keyword.value(WindowFrameUnits::Groups),
+            Keyword::SESSION => keyword.value(WindowFrameUnits::Session),
             _ => fail,
         }
         .expect("ROWS, RANGE, or GROUPS")
@@ -895,13 +896,21 @@
     pub fn parse_window_frame(&mut self) -> PResult<WindowFrame> {
         let units = self.parse_window_frame_units()?;
-        let (start_bound, end_bound) = if self.parse_keyword(Keyword::BETWEEN) {
-            let start_bound = self.parse_window_frame_bound()?;
+        let bounds = if self.parse_keyword(Keyword::BETWEEN) {
+            // `BETWEEN <start> AND <end>`
+            let start = self.parse_window_frame_bound()?;
             self.expect_keyword(Keyword::AND)?;
-            let end_bound = Some(self.parse_window_frame_bound()?);
-            (start_bound, end_bound)
+            let end = Some(self.parse_window_frame_bound()?);
+            WindowFrameBounds::Bounds { start, end }
+        } else if self.parse_keywords(&[Keyword::WITH, Keyword::GAP]) {
+            // `WITH GAP <gap>`, only for session frames
+            WindowFrameBounds::Gap(Box::new(self.parse_expr()?))
         } else {
-            (self.parse_window_frame_bound()?, None)
+            // `<start>`
+            WindowFrameBounds::Bounds {
+                start: self.parse_window_frame_bound()?,
+                end: None,
+            }
         };
         let exclusion = if self.parse_keyword(Keyword::EXCLUDE) {
             Some(self.parse_window_frame_exclusion()?)
@@ -910,8 +919,7 @@ impl Parser<'_> {
         };
         Ok(WindowFrame {
             units,
-            start_bound,
-            end_bound,
+            bounds,
             exclusion,
         })
     }

From 41fe5010ba764506b1913e63c4eadc900543e8b8 Mon Sep 17 00:00:00 2001
From: StrikeW
Date: Thu, 13 Jun 2024 18:28:14 +0800
Subject: [PATCH 18/19] fix(jdbc-sink): set network timeout (#17244)

---
 .../java/com/risingwave/connector/JdbcUtils.java | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JdbcUtils.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JdbcUtils.java
index 7d28e4553fe3e..2360ac96585dd 100644
--- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JdbcUtils.java
+++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JdbcUtils.java
@@ -25,6 +25,9 @@

 public abstract class JdbcUtils {

+    static final int CONNECTION_TIMEOUT = 30;
+    static final int SOCKET_TIMEOUT = 300;
+
     public static Optional<JdbcDialectFactory> getDialectFactory(String jdbcUrl) {
         if (jdbcUrl.startsWith("jdbc:mysql")) {
             return Optional.of(new MySqlDialectFactory());
@@ -43,6 +46,16 @@ public static Connection getConnection(String jdbcUrl) throws SQLException {
         // https://jdbc.postgresql.org/documentation/use/
         // https://dev.mysql.com/doc/connectors/en/connector-j-connp-props-networking.html#cj-conn-prop_tcpKeepAlive
         props.setProperty("tcpKeepAlive", "true");
+
+        // default timeout in seconds
+        boolean isPg = jdbcUrl.startsWith("jdbc:postgresql");
+
+        // postgres uses seconds and mysql uses milliseconds
+        int connectTimeout = isPg ? CONNECTION_TIMEOUT : CONNECTION_TIMEOUT * 1000;
+        int socketTimeout = isPg ? SOCKET_TIMEOUT : SOCKET_TIMEOUT * 1000;
+        props.setProperty("connectTimeout", String.valueOf(connectTimeout));
+        props.setProperty("socketTimeout", String.valueOf(socketTimeout));
+
         var conn = DriverManager.getConnection(jdbcUrl, props);
         // disabling auto commit can improve performance
         conn.setAutoCommit(false);

From 5c24685b79735432f184d62854b6498a3575cc4b Mon Sep 17 00:00:00 2001
From: Xinhao Xu <84456268+xxhZs@users.noreply.github.com>
Date: Fri, 14 Jun 2024 11:05:46 +0800
Subject: [PATCH 19/19] feat(sink): enable sink_decouple by default for kafka,
 pulsar, kinesis, google pubsub, nats, mqtt, clickhouse (#17221)

---
 e2e_test/sink/clickhouse_sink.slt       | 3 +++
 e2e_test/sink/kafka/avro.slt            | 3 +++
 e2e_test/sink/kafka/create_sink.slt     | 3 +++
 e2e_test/sink/kafka/protobuf.slt        | 3 +++
 e2e_test/sink/mqtt_sink.slt             | 3 +++
 e2e_test/sink/pulsar_sink.slt           | 3 +++
 src/connector/src/sink/clickhouse.rs    | 5 ++---
 src/connector/src/sink/google_pubsub.rs | 5 ++---
 src/connector/src/sink/kafka.rs         | 5 ++---
 src/connector/src/sink/kinesis.rs       | 5 ++---
 src/connector/src/sink/mqtt.rs          | 5 ++---
 src/connector/src/sink/nats.rs          | 5 ++---
 src/connector/src/sink/pulsar.rs        | 5 ++---
 13 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/e2e_test/sink/clickhouse_sink.slt b/e2e_test/sink/clickhouse_sink.slt
index 2adc70dcf409e..e5bac0d8d521d 100644
--- a/e2e_test/sink/clickhouse_sink.slt
+++ b/e2e_test/sink/clickhouse_sink.slt
@@ -1,3 +1,6 @@
+statement ok
+set sink_decouple = false;
+
 statement ok
 CREATE TABLE t6 (v1 int primary key, v2 bigint, v3 varchar, v4 smallint, v5 decimal);

diff --git a/e2e_test/sink/kafka/avro.slt b/e2e_test/sink/kafka/avro.slt
index 1cf27b811d9be..bd684bebdf785 100644
--- a/e2e_test/sink/kafka/avro.slt
+++ b/e2e_test/sink/kafka/avro.slt
@@ -1,3 +1,6 @@
+statement ok
+set sink_decouple = false;
+
 statement ok
 create table from_kafka ( *, gen_i32_field int as int32_field + 2, primary key (some_key) )
 include key as some_key
diff --git a/e2e_test/sink/kafka/create_sink.slt b/e2e_test/sink/kafka/create_sink.slt
index c3f738fbce2ae..f5e2e0333bc35 100644
--- a/e2e_test/sink/kafka/create_sink.slt
+++ b/e2e_test/sink/kafka/create_sink.slt
@@ -1,6 +1,9 @@
 statement ok
 set rw_implicit_flush=true;

+statement ok
+set sink_decouple = false;
+
 statement ok
 create table t_kafka (
   id integer primary key,

diff --git a/e2e_test/sink/kafka/protobuf.slt b/e2e_test/sink/kafka/protobuf.slt
index c3f6f0d3ad8e2..61a91435567da 100644
--- a/e2e_test/sink/kafka/protobuf.slt
+++ b/e2e_test/sink/kafka/protobuf.slt
@@ -1,3 +1,6 @@
+statement ok
+set sink_decouple = false;
+
 statement ok
 create table from_kafka with (
   connector = 'kafka',

diff --git a/e2e_test/sink/mqtt_sink.slt b/e2e_test/sink/mqtt_sink.slt
index d19addb024c97..2602d2ddc6198 100644
--- a/e2e_test/sink/mqtt_sink.slt
+++ b/e2e_test/sink/mqtt_sink.slt
@@ -1,3 +1,6 @@
+statement ok
+set sink_decouple = false;
+
 statement ok
 CREATE TABLE mqtt (
   device_id varchar,

diff --git a/e2e_test/sink/pulsar_sink.slt b/e2e_test/sink/pulsar_sink.slt
index 2284f9c1877e3..f8d6aa1aaff4c 100644
--- a/e2e_test/sink/pulsar_sink.slt
+++ b/e2e_test/sink/pulsar_sink.slt
@@ -1,3 +1,6 @@
+statement ok
+set sink_decouple = false;
+
 statement ok
 CREATE TABLE pulsar (
   id BIGINT,

diff --git a/src/connector/src/sink/clickhouse.rs b/src/connector/src/sink/clickhouse.rs
index 26a779dc94ae1..c506f00e6d2ca 100644
--- a/src/connector/src/sink/clickhouse.rs
+++ b/src/connector/src/sink/clickhouse.rs
@@ -379,11 +379,10 @@ impl Sink for ClickHouseSink {

     const SINK_NAME: &'static str = CLICKHOUSE_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }

diff --git a/src/connector/src/sink/google_pubsub.rs b/src/connector/src/sink/google_pubsub.rs
index 9f250b6692cd2..ad039ad020704 100644
--- a/src/connector/src/sink/google_pubsub.rs
+++ b/src/connector/src/sink/google_pubsub.rs
@@ -96,11 +96,10 @@ impl Sink for GooglePubSubSink {

     const SINK_NAME: &'static str = PUBSUB_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }

diff --git a/src/connector/src/sink/kafka.rs b/src/connector/src/sink/kafka.rs
index 1abfed3a39a45..617f427ae71f1 100644
--- a/src/connector/src/sink/kafka.rs
+++ b/src/connector/src/sink/kafka.rs
@@ -320,11 +320,10 @@ impl Sink for KafkaSink {

     const SINK_NAME: &'static str = KAFKA_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }

diff --git a/src/connector/src/sink/kinesis.rs b/src/connector/src/sink/kinesis.rs
index 04cd3390f1c9d..771d3c8a6f91d 100644
--- a/src/connector/src/sink/kinesis.rs
+++ b/src/connector/src/sink/kinesis.rs
@@ -80,11 +80,10 @@ impl Sink for KinesisSink {

     const SINK_NAME: &'static str = KINESIS_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }

diff --git a/src/connector/src/sink/mqtt.rs b/src/connector/src/sink/mqtt.rs
index 7bb0872b27f35..d9dfdbe03b21b 100644
--- a/src/connector/src/sink/mqtt.rs
+++ b/src/connector/src/sink/mqtt.rs
@@ -165,11 +165,10 @@ impl Sink for MqttSink {

     const SINK_NAME: &'static str = MQTT_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }

diff --git a/src/connector/src/sink/nats.rs b/src/connector/src/sink/nats.rs
index 109e52473b92b..162aca3c4d2e8 100644
--- a/src/connector/src/sink/nats.rs
+++ b/src/connector/src/sink/nats.rs
@@ -99,11 +99,10 @@ impl Sink for NatsSink {

     const SINK_NAME: &'static str = NATS_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }

diff --git a/src/connector/src/sink/pulsar.rs b/src/connector/src/sink/pulsar.rs
index 8b6f963c6a3a7..3f016ad94946d 100644
--- a/src/connector/src/sink/pulsar.rs
+++ b/src/connector/src/sink/pulsar.rs
@@ -170,11 +170,10 @@ impl Sink for PulsarSink {

     const SINK_NAME: &'static str = PULSAR_SINK;

-    fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
+    fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result<bool> {
         match user_specified {
-            SinkDecouple::Default => Ok(desc.sink_type.is_append_only()),
+            SinkDecouple::Default | SinkDecouple::Enable => Ok(true),
             SinkDecouple::Disable => Ok(false),
-            SinkDecouple::Enable => Ok(true),
         }
     }
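
Note: the session-frame patches above add `SESSION` as a fourth window frame unit, parsed as `OVER (PARTITION BY ... ORDER BY <col> SESSION WITH GAP <gap>)` and bound the same way `RANGE` offsets are. A minimal usage sketch follows, assuming a hypothetical table `events` that appears in none of the patches; it works in batch queries and Emit-On-Window-Close materialized views, while a general streaming materialized view still hits the `bail_not_implemented` error added in `logical_over_window.rs`:

-- hypothetical append-only table with a watermark, required for EOWC
create table events (
    uid int,
    ts timestamp,
    watermark for ts as ts - interval '1 minute'
) append only;

-- rows of the same uid at most 10 minutes apart share one session window
create materialized view user_sessions as
select
    uid,
    first_value(ts) over (partition by uid order by ts session with gap '10 minutes') as session_start,
    last_value(ts) over (partition by uid order by ts session with gap '10 minutes') as session_end
from events
emit on window close;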
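
Note: patch 19 makes `SinkDecouple::Default` resolve to enabled for the listed connectors, which is why each touched e2e test now pins the old behavior up front. A sketch of the session-level opt-out; `t_kafka` is the table from `create_sink.slt` above, while the sink name, broker address, and topic are placeholders not taken from the patches:

-- opt out of sink decoupling for sinks created later in this session
set sink_decouple = false;

create sink s1 from t_kafka with (
    connector = 'kafka',
    properties.bootstrap.server = 'message_queue:29092',  -- placeholder broker
    topic = 'test-topic'                                  -- placeholder topic
) format plain encode json (force_append_only='true');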