diff --git a/.gitignore b/.gitignore index 72e3ed1487fe0..5f54a467b21b7 100644 --- a/.gitignore +++ b/.gitignore @@ -71,12 +71,12 @@ e2e_test/generated/* scale-test.tar.zst simulation-it-test.tar.zst - # hummock-trace .trace # spark binary e2e_test/iceberg/spark-*-bin* +e2e_test/iceberg/metastore_db **/poetry.lock diff --git a/.typos.toml b/.typos.toml index 4d4bbfca1c082..498d954a55d88 100644 --- a/.typos.toml +++ b/.typos.toml @@ -36,4 +36,5 @@ extend-exclude = [ # We don't want to fix "fals" here, but may want in other places. # Ideally, we should just ignore that line: https://github.com/crate-ci/typos/issues/316 "src/common/src/cast/mod.rs", + "src/tests/simulation/tests/integration_tests/scale/shared_source.rs", ] diff --git a/Cargo.lock b/Cargo.lock index c8bb3bb7afa86..366fb3b36a672 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -991,7 +991,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "136d4d23bcc79e27423727b36823d86233aad06dfea531837b038394d11e9928" dependencies = [ "concurrent-queue", - "event-listener 5.2.0", + "event-listener 5.3.1", "event-listener-strategy", "futures-core", "pin-project-lite", @@ -2345,9 +2345,9 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" @@ -2418,9 +2418,9 @@ checksum = "981520c98f422fcc584dc1a95c334e6953900b9106bc47a9839b81790009eb21" [[package]] name = "camino" -version = "1.1.6" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3" dependencies = [ "serde", ] @@ -2498,9 +2498,9 @@ checksum = "1582e1c9e755dd6ad6b224dcffb135d199399a4568d454bd89fe515ca8425695" [[package]] name = "cargo-platform" -version = "0.1.3" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cfa25e60aea747ec7e1124f238816749faa93759c6ff5b31f1ccdda137f4479" +checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc" dependencies = [ "serde", ] @@ -2852,9 +2852,9 @@ dependencies = [ [[package]] name = "concurrent-queue" -version = "2.2.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" dependencies = [ "crossbeam-utils", ] @@ -4568,9 +4568,9 @@ checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" [[package]] name = "event-listener" -version = "5.2.0" +version = "5.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b5fb89194fa3cad959b833185b3063ba881dbfc7030680b314250779fb4cc91" +checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" dependencies = [ "concurrent-queue", "parking", @@ -4583,7 +4583,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "332f51cb23d20b0de8458b86580878211da09bcd4503cb579c225b3d124cabb3" dependencies = [ - "event-listener 5.2.0", + "event-listener 5.3.1", "pin-project-lite", ] @@ -6004,8 +6004,7 @@ dependencies = [ [[package]] name = "iceberg" version = "0.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "651dfca7c429918e164607a549287cfdd1e7814d2e4cb577d0d6dc57fe19b785" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=84bf51c9d0d5886e4ee306ca4f383f029e1767a4#84bf51c9d0d5886e4ee306ca4f383f029e1767a4" dependencies = [ "anyhow", "apache-avro 0.17.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -6025,11 +6024,13 @@ dependencies = [ "fnv", "futures", "itertools 0.13.0", + "moka", "murmur3", "once_cell", "opendal 0.49.0", "ordered-float 4.1.1", "parquet 52.0.0", + "paste", "reqwest 0.12.4", "rust_decimal", "serde", @@ -6039,7 +6040,7 @@ dependencies = [ "serde_repr", "serde_with 3.8.0", "tokio", - "typed-builder 0.19.1", + "typed-builder 0.20.0", "url", "uuid", ] @@ -6047,8 +6048,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-glue" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ef7c992442a80c46975e08f3862140ca3e1c1c772aa68baaf65bb08f97ff07" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=84bf51c9d0d5886e4ee306ca4f383f029e1767a4#84bf51c9d0d5886e4ee306ca4f383f029e1767a4" dependencies = [ "anyhow", "async-trait", @@ -6058,15 +6058,14 @@ dependencies = [ "log", "serde_json", "tokio", - "typed-builder 0.19.1", + "typed-builder 0.20.0", "uuid", ] [[package]] name = "iceberg-catalog-rest" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f351c7b964fa6f3b4f976f8de3f16f1bf84eea8478606aaebdfd6a871d6b082c" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=84bf51c9d0d5886e4ee306ca4f383f029e1767a4#84bf51c9d0d5886e4ee306ca4f383f029e1767a4" dependencies = [ "async-trait", "chrono", @@ -6079,14 +6078,14 @@ dependencies = [ "serde_derive", "serde_json", "tokio", - "typed-builder 0.19.1", + "typed-builder 0.20.0", "uuid", ] [[package]] name = "icelake" version = "0.3.141592654" -source = "git+https://github.com/risingwavelabs/icelake.git?rev=1860eb315183a5f3f72b4097c1e40d49407f8373#1860eb315183a5f3f72b4097c1e40d49407f8373" +source = "git+https://github.com/risingwavelabs/icelake.git?rev=3f4724158acee37a4785f56670a1427993a58739#3f4724158acee37a4785f56670a1427993a58739" dependencies = [ "anyhow", "apache-avro 0.17.0 (git+https://github.com/apache/avro.git)", @@ -9127,7 +9126,7 @@ checksum = "8bdf592881d821b83d471f8af290226c8d51402259e9bb5be7f9f8bdebbb11ac" dependencies = [ "bytes", "heck 0.4.1", - "itertools 0.10.5", + "itertools 0.11.0", "log", "multimap 0.8.3", "once_cell", @@ -9182,7 +9181,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "265baba7fabd416cf5078179f7d2cbeca4ce7a9041111900675ea7c4cb8a4c32" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.11.0", "proc-macro2", "quote", "syn 2.0.66", @@ -9216,9 +9215,12 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55a6a9143ae25c25fa7b6a48d6cc08b10785372060009c25140a4e7c340e95af" dependencies = [ + "base64 0.22.0", "once_cell", "prost 0.13.1", "prost-types 0.13.1", + "serde", + "serde-value", ] [[package]] @@ -9308,11 +9310,11 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.9.3" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a1a2f1f0a7ecff9c31abbe177637be0e97a0aef46cf8738ece09327985d998" +checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "memchr", "unicase", ] 
@@ -9365,7 +9367,7 @@ dependencies = [ "indoc", "libc", "memoffset", - "parking_lot 0.11.2", + "parking_lot 0.12.1", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -11166,7 +11168,10 @@ name = "risingwave_license" version = "2.1.0-alpha" dependencies = [ "expect-test", + "jsonbb", "jsonwebtoken", + "risingwave_pb", + "risingwave_telemetry_event", "serde", "thiserror", "thiserror-ext", @@ -11207,6 +11212,7 @@ dependencies = [ "comfy-table", "crepe", "easy-ext", + "educe", "either", "enum-as-inner 0.6.0", "expect-test", @@ -11554,6 +11560,7 @@ dependencies = [ "madsim-etcd-client", "madsim-rdkafka", "madsim-tokio", + "maplit", "paste", "pin-project", "pretty_assertions", @@ -14825,6 +14832,15 @@ dependencies = [ "typed-builder-macro 0.19.1", ] +[[package]] +name = "typed-builder" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e14ed59dc8b7b26cacb2a92bad2e8b1f098806063898ab42a3bd121d7d45e75" +dependencies = [ + "typed-builder-macro 0.20.0", +] + [[package]] name = "typed-builder-macro" version = "0.16.2" @@ -14858,6 +14874,17 @@ dependencies = [ "syn 2.0.66", ] +[[package]] +name = "typed-builder-macro" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "typenum" version = "1.16.0" diff --git a/Cargo.toml b/Cargo.toml index a5da9b82b658c..63feb981d9a1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,16 +135,18 @@ tonic-build = { package = "madsim-tonic-build", version = "0.5" } otlp-embedded = { git = "https://github.com/risingwavelabs/otlp-embedded", rev = "e6cd165b9bc85783b42c106e99186b86b73e3507" } prost = { version = "0.13" } prost-build = { version = "0.13" } -icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "1860eb315183a5f3f72b4097c1e40d49407f8373", features = [ +# branch dylan/fix_parquet_nested_type_field_id +icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "3f4724158acee37a4785f56670a1427993a58739", features = [ "prometheus", ] } arrow-array-iceberg = { package = "arrow-array", version = "52" } arrow-schema-iceberg = { package = "arrow-schema", version = "52" } arrow-buffer-iceberg = { package = "arrow-buffer", version = "52" } arrow-cast-iceberg = { package = "arrow-cast", version = "52" } -iceberg = "0.3.0" -iceberg-catalog-rest = "0.3.0" -iceberg-catalog-glue = "0.3.0" +# branch dev +iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "84bf51c9d0d5886e4ee306ca4f383f029e1767a4" } +iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "84bf51c9d0d5886e4ee306ca4f383f029e1767a4" } +iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "84bf51c9d0d5886e4ee306ca4f383f029e1767a4" } opendal = "0.47" arrow-array = "50" arrow-arith = "50" diff --git a/README.md b/README.md index 7128dccede28b..4c0e043b71513 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ RisingWave is a Postgres-compatible SQL database engineered to provide the ingest millions of events per second, continuously join and analyze live data streams with historical tables, serve ad-hoc queries in real-time, and deliver fresh, consistent results wherever needed. 
-![RisingWave](./docs/dev/src/images/architecture_20240814.png) +![RisingWave](./docs/dev/src/images/architecture_20240908.png) ## Try it out in 60 seconds diff --git a/ci/build-ci-image.sh b/ci/build-ci-image.sh index 88542b4aa5f12..9d00b47bcd3aa 100755 --- a/ci/build-ci-image.sh +++ b/ci/build-ci-image.sh @@ -10,7 +10,7 @@ cat ../rust-toolchain # shellcheck disable=SC2155 # REMEMBER TO ALSO UPDATE ci/docker-compose.yml -export BUILD_ENV_VERSION=v20240812 +export BUILD_ENV_VERSION=v20240911 export BUILD_TAG="public.ecr.aws/w1p7b4n3/rw-build-env:${BUILD_ENV_VERSION}" diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index 4b1954ff5ae2c..11d29d7236367 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -71,7 +71,7 @@ services: retries: 5 source-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: - mysql - sqlserver-server @@ -85,7 +85,7 @@ services: - ..:/risingwave sink-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: - mysql - db @@ -108,12 +108,12 @@ services: rw-build-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 volumes: - ..:/risingwave ci-flamegraph-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 # NOTE(kwannoel): This is used in order to permit # syscalls for `nperf` (perf_event_open), # so it can do CPU profiling. @@ -124,7 +124,7 @@ services: - ..:/risingwave regress-test-env: - image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240812 + image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: db: condition: service_healthy diff --git a/ci/rust-toolchain b/ci/rust-toolchain index 6bc57a2a65d8f..158ecbbdb0dfd 100644 --- a/ci/rust-toolchain +++ b/ci/rust-toolchain @@ -4,4 +4,4 @@ # 3. (optional) **follow the instructions in lints/README.md** to update the toolchain and dependencies for lints [toolchain] -channel = "nightly-2024-06-06" +channel = "nightly-2024-07-19" diff --git a/ci/scripts/e2e-cassandra-sink-test.sh b/ci/scripts/e2e-cassandra-sink-test.sh index 0e1c9a98d49e8..b222e4a944967 100755 --- a/ci/scripts/e2e-cassandra-sink-test.sh +++ b/ci/scripts/e2e-cassandra-sink-test.sh @@ -42,8 +42,9 @@ tar xfvz cassandra_latest.tar.gz export LATEST_CASSANDRA_VERSION=$(get_latest_cassandra_version) export CASSANDRA_DIR="./apache-cassandra-${LATEST_CASSANDRA_VERSION}" # remove bundled packages, and use installed packages, because Python 3.12 has removed asyncore, but I failed to install libev support for bundled Python driver. 
-rm ${CASSANDRA_DIR}/lib/six-1.12.0-py2.py3-none-any.zip -rm ${CASSANDRA_DIR}/lib/cassandra-driver-internal-only-3.25.0.zip + +rm ${CASSANDRA_DIR}/lib/futures-2.1.6-py2.py3-none-any.zip +rm ${CASSANDRA_DIR}/lib/cassandra-driver-internal-only-3.29.0.zip apt-get install -y libev4 libev-dev pip3 install --break-system-packages cassandra-driver export CQLSH_HOST=cassandra-server diff --git a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index dd2f78037a5f2..c039c625aa213 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -45,6 +45,8 @@ poetry run python main.py -t ./test_case/partition_upsert.toml poetry run python main.py -t ./test_case/range_partition_append_only.toml poetry run python main.py -t ./test_case/range_partition_upsert.toml poetry run python main.py -t ./test_case/append_only_with_checkpoint_interval.toml +poetry run python main.py -t ./test_case/iceberg_select_empty_table.toml +poetry run python main.py -t ./test_case/iceberg_source_eq_delete.toml echo "--- Kill cluster" diff --git a/ci/scripts/e2e-source-test.sh b/ci/scripts/e2e-source-test.sh index 56a06ac756931..29f2a0ac7b5ce 100755 --- a/ci/scripts/e2e-source-test.sh +++ b/ci/scripts/e2e-source-test.sh @@ -130,7 +130,7 @@ echo "> inserted new rows into postgres" # start cluster w/o clean-data unset RISINGWAVE_CI -export RUST_LOG="risingwave_stream=debug,risingwave_batch=info,risingwave_storage=info" \ +export RUST_LOG="risingwave_stream=debug,risingwave_batch=info,risingwave_storage=info" risedev dev ci-1cn-1fe-with-recovery echo "> wait for cluster recovery finish" diff --git a/dashboard/package-lock.json b/dashboard/package-lock.json index c06e209600477..496093d8c2fe4 100644 --- a/dashboard/package-lock.json +++ b/dashboard/package-lock.json @@ -54,7 +54,7 @@ "eslint-plugin-n": "^15.2.5", "eslint-plugin-promise": "^6.0.1", "eslint-plugin-react": "^7.31.6", - "express": "^4.19.2", + "express": "^4.20.0", "prettier": "^2.7.1", "prettier-plugin-organize-imports": "^3.1.1", "typescript": "5.4.2" @@ -3792,9 +3792,9 @@ } }, "node_modules/body-parser": { - "version": "1.20.2", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", - "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", + "version": "1.20.3", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", + "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", "dev": true, "dependencies": { "bytes": "3.1.2", @@ -3805,7 +3805,7 @@ "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.11.0", + "qs": "6.13.0", "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" @@ -3842,6 +3842,21 @@ "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true }, + "node_modules/body-parser/node_modules/qs": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", + "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "dev": true, + "dependencies": { + "side-channel": "^1.0.6" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/bootstrap-icons": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/bootstrap-icons/-/bootstrap-icons-1.9.1.tgz", @@ -3975,14 +3990,19 @@ } }, 
"node_modules/call-bind": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.5.tgz", - "integrity": "sha512-C3nQxfFZxFRVoJoGKKI8y3MOEo129NQ+FgQ08iye+Mk4zNZZGdjfs06bVTr+DBSlA66Q2VEcMki/cUCP4SercQ==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", "dev": true, "dependencies": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.1", - "set-function-length": "^1.1.1" + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -4874,17 +4894,20 @@ } }, "node_modules/define-data-property": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.1.tgz", - "integrity": "sha512-E7uGkTzkk1d0ByLeSc6ZsFS79Axg+m1P/VsgYsxHgiuc3tFSj+MjMIwe90FC4lOAZzNBdY7kkO2P2wKdsQ1vgQ==", + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", "dev": true, "dependencies": { - "get-intrinsic": "^1.2.1", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.0" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" }, "engines": { "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, "node_modules/define-lazy-prop": { @@ -5166,6 +5189,27 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "dev": true, + "dependencies": { + "get-intrinsic": "^1.2.4" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es-iterator-helpers": { "version": "1.0.15", "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.0.15.tgz", @@ -6120,37 +6164,37 @@ } }, "node_modules/express": { - "version": "4.19.2", - "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", - "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", + "version": "4.20.0", + "resolved": "https://registry.npmjs.org/express/-/express-4.20.0.tgz", + "integrity": "sha512-pLdae7I6QqShF5PnNTCVn4hI91Dx0Grkn2+IAsMTgMIKuQVte2dN9PeGSSAME2FR8anOhVA62QDIUaWVfEXVLw==", "dev": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.2", + "body-parser": "1.20.3", "content-disposition": "0.5.4", "content-type": "~1.0.4", "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", "finalhandler": "1.2.0", "fresh": "0.5.2", "http-errors": "2.0.0", - "merge-descriptors": "1.0.1", + "merge-descriptors": "1.0.3", "methods": 
"~1.1.2", "on-finished": "2.4.1", "parseurl": "~1.3.3", - "path-to-regexp": "0.1.7", + "path-to-regexp": "0.1.10", "proxy-addr": "~2.0.7", "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", - "send": "0.18.0", - "serve-static": "1.15.0", + "send": "0.19.0", + "serve-static": "1.16.0", "setprototypeof": "1.2.0", "statuses": "2.0.1", "type-is": "~1.6.18", @@ -6170,6 +6214,15 @@ "ms": "2.0.0" } }, + "node_modules/express/node_modules/encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/express/node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", @@ -6643,16 +6696,20 @@ } }, "node_modules/get-intrinsic": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.2.tgz", - "integrity": "sha512-0gSo4ml/0j98Y3lngkFEot/zhiCeWsbYIlZ+uZOVgzLyLaUw7wxUL+nCTP0XJvJg1AXulJRI3UJi8GsbDuxdGA==", + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", "dev": true, "dependencies": { + "es-errors": "^1.3.0", "function-bind": "^1.1.2", "has-proto": "^1.0.1", "has-symbols": "^1.0.3", "hasown": "^2.0.0" }, + "engines": { + "node": ">= 0.4" + }, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -6833,12 +6890,12 @@ } }, "node_modules/has-property-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.1.tgz", - "integrity": "sha512-VsX8eaIewvas0xnvinAe9bw4WfIeODpGYikiWYLH+dma0Jw6KHYqWiWfhQlgOVK8D6PvjubK5Uc4P0iIhIcNVg==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "dev": true, "dependencies": { - "get-intrinsic": "^1.2.2" + "es-define-property": "^1.0.0" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -8320,10 +8377,13 @@ } }, "node_modules/merge-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", - "integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==", - "dev": true + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", + "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } }, "node_modules/merge-stream": { "version": "2.0.0", @@ -8720,10 +8780,13 @@ } }, "node_modules/object-inspect": { - "version": "1.12.3", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.3.tgz", - "integrity": "sha512-geUvdk7c+eizMNUDkRpW1wJwgfOiOeHbxBR/hLXK1aT6zmVSO0jsQcs7fj6MGw89jC/cjGfLcNOrtMYtGqm81g==", + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", + "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==", "dev": true, + "engines": { + "node": ">= 0.4" + }, 
"funding": { "url": "https://github.com/sponsors/ljharb" } @@ -9056,9 +9119,9 @@ } }, "node_modules/path-to-regexp": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", - "integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==", + "version": "0.1.10", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", + "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==", "dev": true }, "node_modules/path-type": { @@ -10076,9 +10139,9 @@ } }, "node_modules/send": { - "version": "0.18.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", - "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "version": "0.19.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", + "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", "dev": true, "dependencies": { "debug": "2.6.9", @@ -10121,9 +10184,9 @@ "dev": true }, "node_modules/serve-static": { - "version": "1.15.0", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz", - "integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==", + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.0.tgz", + "integrity": "sha512-pDLK8zwl2eKaYrs8mrPZBJua4hMplRWJ1tIFksVC3FtBEBnl8dxgeHtsaMS8DhS9i4fLObaon6ABoc4/hQGdPA==", "dev": true, "dependencies": { "encodeurl": "~1.0.2", @@ -10135,6 +10198,51 @@ "node": ">= 0.8.0" } }, + "node_modules/serve-static/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/serve-static/node_modules/debug/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + }, + "node_modules/serve-static/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true + }, + "node_modules/serve-static/node_modules/send": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", + "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "dev": true, + "dependencies": { + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "2.0.0", + "mime": "1.6.0", + "ms": "2.1.3", + "on-finished": "2.4.1", + "range-parser": "~1.2.1", + "statuses": "2.0.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, "node_modules/set-blocking": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", @@ -10142,16 +10250,17 @@ "optional": true }, "node_modules/set-function-length": { - "version": "1.2.0", - "resolved": 
"https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.0.tgz", - "integrity": "sha512-4DBHDoyHlM1IRPGYcoxexgh67y4ueR53FKV1yyxwFMY7aCqcN/38M1+SwZ/qJQ8iLv7+ck385ot4CcisOAPT9w==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", "dev": true, "dependencies": { - "define-data-property": "^1.1.1", + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.2", + "get-intrinsic": "^1.2.4", "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.1" + "has-property-descriptors": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -10209,14 +10318,18 @@ } }, "node_modules/side-channel": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", - "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", "dev": true, "dependencies": { - "call-bind": "^1.0.0", - "get-intrinsic": "^1.0.2", - "object-inspect": "^1.9.0" + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -14446,9 +14559,9 @@ "dev": true }, "body-parser": { - "version": "1.20.2", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", - "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", + "version": "1.20.3", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", + "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", "dev": true, "requires": { "bytes": "3.1.2", @@ -14459,7 +14572,7 @@ "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.11.0", + "qs": "6.13.0", "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" @@ -14488,6 +14601,15 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true + }, + "qs": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", + "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "dev": true, + "requires": { + "side-channel": "^1.0.6" + } } } }, @@ -14586,14 +14708,16 @@ "dev": true }, "call-bind": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.5.tgz", - "integrity": "sha512-C3nQxfFZxFRVoJoGKKI8y3MOEo129NQ+FgQ08iye+Mk4zNZZGdjfs06bVTr+DBSlA66Q2VEcMki/cUCP4SercQ==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", "dev": true, "requires": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.1", - "set-function-length": "^1.1.1" + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" } }, "callsites": { @@ 
-15255,14 +15379,14 @@ } }, "define-data-property": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.1.tgz", - "integrity": "sha512-E7uGkTzkk1d0ByLeSc6ZsFS79Axg+m1P/VsgYsxHgiuc3tFSj+MjMIwe90FC4lOAZzNBdY7kkO2P2wKdsQ1vgQ==", + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", "dev": true, "requires": { - "get-intrinsic": "^1.2.1", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.0" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" } }, "define-lazy-prop": { @@ -15488,6 +15612,21 @@ "which-typed-array": "^1.1.10" } }, + "es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "dev": true, + "requires": { + "get-intrinsic": "^1.2.4" + } + }, + "es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true + }, "es-iterator-helpers": { "version": "1.0.15", "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.0.15.tgz", @@ -16174,37 +16313,37 @@ } }, "express": { - "version": "4.19.2", - "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", - "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", + "version": "4.20.0", + "resolved": "https://registry.npmjs.org/express/-/express-4.20.0.tgz", + "integrity": "sha512-pLdae7I6QqShF5PnNTCVn4hI91Dx0Grkn2+IAsMTgMIKuQVte2dN9PeGSSAME2FR8anOhVA62QDIUaWVfEXVLw==", "dev": true, "requires": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.2", + "body-parser": "1.20.3", "content-disposition": "0.5.4", "content-type": "~1.0.4", "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", "finalhandler": "1.2.0", "fresh": "0.5.2", "http-errors": "2.0.0", - "merge-descriptors": "1.0.1", + "merge-descriptors": "1.0.3", "methods": "~1.1.2", "on-finished": "2.4.1", "parseurl": "~1.3.3", - "path-to-regexp": "0.1.7", + "path-to-regexp": "0.1.10", "proxy-addr": "~2.0.7", "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", - "send": "0.18.0", - "serve-static": "1.15.0", + "send": "0.19.0", + "serve-static": "1.16.0", "setprototypeof": "1.2.0", "statuses": "2.0.1", "type-is": "~1.6.18", @@ -16221,6 +16360,12 @@ "ms": "2.0.0" } }, + "encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true + }, "ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", @@ -16602,11 +16747,12 @@ "peer": true }, "get-intrinsic": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.2.tgz", - "integrity": "sha512-0gSo4ml/0j98Y3lngkFEot/zhiCeWsbYIlZ+uZOVgzLyLaUw7wxUL+nCTP0XJvJg1AXulJRI3UJi8GsbDuxdGA==", + "version": "1.2.4", + "resolved": 
"https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", "dev": true, "requires": { + "es-errors": "^1.3.0", "function-bind": "^1.1.2", "has-proto": "^1.0.1", "has-symbols": "^1.0.3", @@ -16735,12 +16881,12 @@ "integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==" }, "has-property-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.1.tgz", - "integrity": "sha512-VsX8eaIewvas0xnvinAe9bw4WfIeODpGYikiWYLH+dma0Jw6KHYqWiWfhQlgOVK8D6PvjubK5Uc4P0iIhIcNVg==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "dev": true, "requires": { - "get-intrinsic": "^1.2.2" + "es-define-property": "^1.0.0" } }, "has-proto": { @@ -17803,9 +17949,9 @@ "dev": true }, "merge-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", - "integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", + "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", "dev": true }, "merge-stream": { @@ -18081,9 +18227,9 @@ "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==" }, "object-inspect": { - "version": "1.12.3", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.3.tgz", - "integrity": "sha512-geUvdk7c+eizMNUDkRpW1wJwgfOiOeHbxBR/hLXK1aT6zmVSO0jsQcs7fj6MGw89jC/cjGfLcNOrtMYtGqm81g==", + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", + "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==", "dev": true }, "object-keys": { @@ -18322,9 +18468,9 @@ } }, "path-to-regexp": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", - "integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==", + "version": "0.1.10", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", + "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==", "dev": true }, "path-type": { @@ -19040,9 +19186,9 @@ "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" }, "send": { - "version": "0.18.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", - "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "version": "0.19.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", + "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", "dev": true, "requires": { "debug": "2.6.9", @@ -19086,15 +19232,61 @@ } }, "serve-static": { - "version": "1.15.0", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz", - 
"integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==", + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.0.tgz", + "integrity": "sha512-pDLK8zwl2eKaYrs8mrPZBJua4hMplRWJ1tIFksVC3FtBEBnl8dxgeHtsaMS8DhS9i4fLObaon6ABoc4/hQGdPA==", "dev": true, "requires": { "encodeurl": "~1.0.2", "escape-html": "~1.0.3", "parseurl": "~1.3.3", "send": "0.18.0" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + }, + "dependencies": { + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + } + } + }, + "ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true + }, + "send": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", + "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "dev": true, + "requires": { + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "2.0.0", + "mime": "1.6.0", + "ms": "2.1.3", + "on-finished": "2.4.1", + "range-parser": "~1.2.1", + "statuses": "2.0.1" + } + } } }, "set-blocking": { @@ -19104,16 +19296,17 @@ "optional": true }, "set-function-length": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.0.tgz", - "integrity": "sha512-4DBHDoyHlM1IRPGYcoxexgh67y4ueR53FKV1yyxwFMY7aCqcN/38M1+SwZ/qJQ8iLv7+ck385ot4CcisOAPT9w==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", "dev": true, "requires": { - "define-data-property": "^1.1.1", + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.2", + "get-intrinsic": "^1.2.4", "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.1" + "has-property-descriptors": "^1.0.2" } }, "set-function-name": { @@ -19159,14 +19352,15 @@ "dev": true }, "side-channel": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", - "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", "dev": true, "requires": { - "call-bind": "^1.0.0", - "get-intrinsic": "^1.0.2", - "object-inspect": "^1.9.0" + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" } }, "signal-exit": { diff --git a/dashboard/package.json b/dashboard/package.json index a3716f7802ccf..71621e4159f5d 100644 --- a/dashboard/package.json +++ 
b/dashboard/package.json @@ -61,7 +61,7 @@ "eslint-plugin-n": "^15.2.5", "eslint-plugin-promise": "^6.0.1", "eslint-plugin-react": "^7.31.6", - "express": "^4.19.2", + "express": "^4.20.0", "prettier": "^2.7.1", "prettier-plugin-organize-imports": "^3.1.1", "typescript": "5.4.2" diff --git a/docker/docker-compose-distributed-etcd.yml b/docker/docker-compose-distributed-etcd.yml index 1e23484c22f72..5fbfcf11e461c 100644 --- a/docker/docker-compose-distributed-etcd.yml +++ b/docker/docker-compose-distributed-etcd.yml @@ -1,7 +1,7 @@ --- version: "3" x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: compactor-0: <<: *image diff --git a/docker/docker-compose-distributed.yml b/docker/docker-compose-distributed.yml index 8de40728fd963..6eea5a1a1fb37 100644 --- a/docker/docker-compose-distributed.yml +++ b/docker/docker-compose-distributed.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: compactor-0: <<: *image diff --git a/docker/docker-compose-etcd.yml b/docker/docker-compose-etcd.yml index ef444fa2f0d82..f44646f49768e 100644 --- a/docker/docker-compose-etcd.yml +++ b/docker/docker-compose-etcd.yml @@ -1,7 +1,7 @@ --- version: "3" x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-azblob.yml b/docker/docker-compose-with-azblob.yml index 7c6a30e1f336c..490ac3eecc07e 100644 --- a/docker/docker-compose-with-azblob.yml +++ b/docker/docker-compose-with-azblob.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-gcs.yml b/docker/docker-compose-with-gcs.yml index 9327e6b4ee8cb..9787e405df046 100644 --- a/docker/docker-compose-with-gcs.yml +++ b/docker/docker-compose-with-gcs.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-local-fs.yml b/docker/docker-compose-with-local-fs.yml index d52a2adc911fd..d55156dd0cfd2 100644 --- a/docker/docker-compose-with-local-fs.yml +++ b/docker/docker-compose-with-local-fs.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-obs.yml b/docker/docker-compose-with-obs.yml index d6beb4f86e89e..14184a828a8a6 100644 --- a/docker/docker-compose-with-obs.yml +++ b/docker/docker-compose-with-obs.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-oss.yml b/docker/docker-compose-with-oss.yml index 74e4ec15d8f3e..04a1d05852633 100644 --- a/docker/docker-compose-with-oss.yml +++ b/docker/docker-compose-with-oss.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: 
risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-s3.yml b/docker/docker-compose-with-s3.yml index c6ca1a885b448..ea9647092007a 100644 --- a/docker/docker-compose-with-s3.yml +++ b/docker/docker-compose-with-s3.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose-with-sqlite.yml b/docker/docker-compose-with-sqlite.yml index a4b008c1374cd..0dcdb6c11a814 100644 --- a/docker/docker-compose-with-sqlite.yml +++ b/docker/docker-compose-with-sqlite.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index bce57e69147f4..781e3e9a476f0 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,6 +1,6 @@ --- x-image: &image - image: ${RW_IMAGE:-risingwavelabs/risingwave:v1.10.0} + image: ${RW_IMAGE:-risingwavelabs/risingwave:v2.0.0-rc.1} services: risingwave-standalone: <<: *image @@ -60,7 +60,7 @@ services: ENABLE_TELEMETRY: ${ENABLE_TELEMETRY:-true} RW_TELEMETRY_TYPE: ${RW_TELEMETRY_TYPE:-"docker-compose"} RW_SECRET_STORE_PRIVATE_KEY_HEX: ${RW_SECRET_STORE_PRIVATE_KEY_HEX:-0123456789abcdef} - RW_LICENSE_KEY: ${RW_LICENSE_KEY:-""} + RW_LICENSE_KEY: ${RW_LICENSE_KEY:-} container_name: risingwave-standalone healthcheck: test: @@ -113,7 +113,7 @@ services: - "./grafana-risedev-datasource.yml:/etc/grafana/provisioning/datasources/grafana-risedev-datasource.yml" - "./grafana-risedev-dashboard.yml:/etc/grafana/provisioning/dashboards/grafana-risedev-dashboard.yml" - "./dashboards:/dashboards" - environment: { } + environment: {} container_name: grafana-0 healthcheck: test: @@ -187,7 +187,7 @@ services: volumes: - "prometheus-0:/prometheus" - "./prometheus.yaml:/etc/prometheus/prometheus.yml" - environment: { } + environment: {} container_name: prometheus-0 healthcheck: test: @@ -229,7 +229,7 @@ services: depends_on: [ ] volumes: - "message_queue:/var/lib/redpanda/data" - environment: { } + environment: {} container_name: message_queue healthcheck: test: curl -f localhost:9644/v1/status/ready diff --git a/docs/dev/src/images/architecture_20240908.png b/docs/dev/src/images/architecture_20240908.png new file mode 100644 index 0000000000000..40ba8b8174c68 Binary files /dev/null and b/docs/dev/src/images/architecture_20240908.png differ diff --git a/e2e_test/backup_restore/tpch_snapshot_create.slt b/e2e_test/backup_restore/tpch_snapshot_create.slt index c1fad2a2e0759..bb14dd369b837 100644 --- a/e2e_test/backup_restore/tpch_snapshot_create.slt +++ b/e2e_test/backup_restore/tpch_snapshot_create.slt @@ -1,5 +1,8 @@ include ../tpch/create_tables.slt.part +statement ok +CREATE SECRET secret1 WITH (backend = 'meta') AS 'demo-secret' + # First, insert the data into the tables include ../tpch/insert_customer.slt.part include ../tpch/insert_lineitem.slt.part diff --git a/e2e_test/backup_restore/tpch_snapshot_drop.slt b/e2e_test/backup_restore/tpch_snapshot_drop.slt index 0e593371347b7..27d271c35c617 100644 --- a/e2e_test/backup_restore/tpch_snapshot_drop.slt +++ b/e2e_test/backup_restore/tpch_snapshot_drop.slt @@ -1,3 +1,6 @@ +statement ok +DROP SECRET secret1; + statement ok drop materialized view tpch_q7; diff --git a/e2e_test/batch/catalog/pg_settings.slt.part 
b/e2e_test/batch/catalog/pg_settings.slt.part index 3482ce4850246..e05d466c3a4d6 100644 --- a/e2e_test/batch/catalog/pg_settings.slt.part +++ b/e2e_test/batch/catalog/pg_settings.slt.part @@ -22,6 +22,7 @@ user backfill_rate_limit user background_ddl user batch_enable_distributed_dml user batch_parallelism +user bypass_cluster_limits user bytea_output user cdc_source_wait_streaming_start_timeout user client_encoding diff --git a/e2e_test/batch/types/map.slt.part b/e2e_test/batch/types/map.slt.part index b4b4be7e5cba7..fe98fa3633000 100644 --- a/e2e_test/batch/types/map.slt.part +++ b/e2e_test/batch/types/map.slt.part @@ -122,6 +122,63 @@ select to_jsonb(m1), to_jsonb(m2), to_jsonb(m3), to_jsonb(l), to_jsonb(s) from t {"a": 1.0, "b": 2.0, "c": 3.0} null null null null {"a": 1.0, "b": 2.0, "c": 3.0} {"1": true, "2": false, "3": true} {"a": {"a1": "a2"}, "b": {"b1": "b2"}} [{"a": 1, "b": 2, "c": 3}, {"d": 4, "e": 5, "f": 6}] {"m": {"a": {"x": 1}, "b": {"x": 2}, "c": {"x": 3}}} +query ? +select jsonb_populate_map( + null::map(varchar, int), + '{"a": 1, "b": 2}'::jsonb +); +---- +{a:1,b:2} + + +query ? +select jsonb_populate_map( + MAP {'a': 1, 'b': 2}, + '{"b": 3, "c": 4}'::jsonb +); +---- +{a:1,b:3,c:4} + + +# implicit cast (int -> varchar) +query ? +select jsonb_populate_map( + MAP {'a': 'a', 'b': 'b'}, + '{"b": 3, "c": 4}'::jsonb +); +---- +{a:a,b:3,c:4} + + +query error +select jsonb_populate_map( + MAP {'a': 1, 'b': 2}, + '{"b": "3", "c": 4}'::jsonb +); +---- +db error: ERROR: Failed to run the query + +Caused by these errors (recent errors listed first): + 1: Expr error + 2: error while evaluating expression `jsonb_populate_map('{a:1,b:2}', '{"b": "3", "c": 4}')` + 3: Parse error: cannot cast jsonb string to type number + + +query error +select jsonb_populate_map( + null::map(int, int), + '{"a": 1, "b": 2}'::jsonb +); +---- +db error: ERROR: Failed to run the query + +Caused by these errors (recent errors listed first): + 1: Expr error + 2: error while evaluating expression `jsonb_populate_map(NULL, '{"a": 1, "b": 2}')` + 3: Parse error: cannot convert jsonb to a map with non-string keys + + + statement ok drop table t; diff --git a/e2e_test/commands/risectl b/e2e_test/commands/risectl new file mode 100755 index 0000000000000..2bb462d83fbab --- /dev/null +++ b/e2e_test/commands/risectl @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +RUST_LOG="error" .risingwave/bin/risingwave/risectl "$@" diff --git a/e2e_test/iceberg/main.py b/e2e_test/iceberg/main.py index 01017f3db783d..4279b899c5c1d 100644 --- a/e2e_test/iceberg/main.py +++ b/e2e_test/iceberg/main.py @@ -55,16 +55,23 @@ def execute_slt(args, slt): def verify_result(args, verify_sql, verify_schema, verify_data): tc = unittest.TestCase() - print(f"Executing sql: {verify_sql}") + + time.sleep(3) + print(f"verify_result:\nExecuting sql: {verify_sql}") spark = get_spark(args) df = spark.sql(verify_sql).collect() + print(f"Result:") + print(f"================") for row in df: print(row) + print(f"================") rows = verify_data.splitlines() - tc.assertEqual(len(df), len(rows)) + tc.assertEqual(len(df), len(rows), "row length mismatch") + tc.assertEqual(len(verify_schema), len(df[0]), "column length mismatch") for row1, row2 in zip(df, rows): print(f"Row1: {row1}, Row 2: {row2}") - row2 = row2.split(",") + # New parsing logic for row2 + row2 = parse_row(row2) for idx, ty in enumerate(verify_schema): if ty == "int" or ty == "long": tc.assertEqual(row1[idx], int(row2[idx])) @@ -89,7 +96,7 @@ def verify_result(args, verify_sql, verify_schema, 
verify_data): else: tc.assertEqual(row1[idx], decimal.Decimal(row2[idx])) else: - tc.fail(f"Unsupported type {ty}") + tc.assertEqual(str(row1[idx]), str(row2[idx])) def compare_sql(args, cmp_sqls): assert len(cmp_sqls) == 2 @@ -113,6 +120,32 @@ def drop_table(args, drop_sqls): spark.sql(sql) +def parse_row(row): + result = [] + current = "" + parenthesis_count = {"{": 0, "[": 0, "(": 0} + for char in row: + if char in parenthesis_count: + parenthesis_count[char] += 1 + elif char == "}": + parenthesis_count["{"] -= 1 + elif char == "]": + parenthesis_count["["] -= 1 + elif char == ")": + parenthesis_count["("] -= 1 + + if char == "," and all(value == 0 for value in parenthesis_count.values()): + result.append(current.strip()) + current = "" + else: + current += char + + if current: + result.append(current.strip()) + + return result + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Test script for iceberg") parser.add_argument("-t", dest="test_case", type=str, help="Test case file") @@ -151,4 +184,3 @@ def drop_table(args, drop_sqls): execute_slt(config, verify_slt) if drop_sqls is not None and drop_sqls != "": drop_table(config, drop_sqls) - diff --git a/e2e_test/iceberg/start_spark_connect_server.sh b/e2e_test/iceberg/start_spark_connect_server.sh index 345653778b14c..f0f3f19a1fab7 100755 --- a/e2e_test/iceberg/start_spark_connect_server.sh +++ b/e2e_test/iceberg/start_spark_connect_server.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + set -ex ICEBERG_VERSION=1.4.3 diff --git a/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt b/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt index 0dc937303a852..b0e433c819f83 100644 --- a/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt +++ b/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt @@ -1,6 +1,3 @@ -statement ok -set sink_decouple = false; - statement ok set streaming_parallelism=4; @@ -37,7 +34,6 @@ CREATE SINK sink1 AS select * from mv1 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok @@ -54,7 +50,6 @@ CREATE SINK sink2 AS select * from mv1 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); sleep 20s diff --git a/e2e_test/iceberg/test_case/cdc/load.slt b/e2e_test/iceberg/test_case/cdc/load.slt index df0c319990374..6e6850725f98a 100644 --- a/e2e_test/iceberg/test_case/cdc/load.slt +++ b/e2e_test/iceberg/test_case/cdc/load.slt @@ -1,4 +1,6 @@ # CDC source basic test +statement ok +set sink_decouple = false; statement ok create source mysql_mydb with ( diff --git a/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt b/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt new file mode 100644 index 0000000000000..832a7b781f7fb --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_select_empty_table.slt @@ -0,0 +1,60 @@ +statement ok +set sink_decouple = false; + +statement ok +set streaming_parallelism=4; + +statement ok +CREATE TABLE s1 (i1 int, i2 varchar, i3 varchar); + +statement ok +CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM s1; + +statement ok +CREATE SINK sink1 AS select * from mv1 WITH ( + connector = 'iceberg', + type = 'append-only', + force_append_only = 'true', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + 
s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + commit_checkpoint_interval = 1, + create_table_if_not_exists = 'true' +); + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name = 'demo_db', + table.name = 't1', +); + +statement ok +flush; + +query I +select count(*) from iceberg_t1_source; +---- +0 + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE s1 cascade; diff --git a/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml b/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml new file mode 100644 index 0000000000000..fa6eeff134c26 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_select_empty_table.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'test_case/iceberg_select_empty_table.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] diff --git a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt index a83173fc48ab6..49c4cf3fb1145 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt @@ -16,7 +16,10 @@ v_bool boolean, v_date date, v_timestamp timestamptz, v_ts_ntz timestamp, -v_decimal decimal +v_decimal decimal, +v_map map(int, int), +v_array int[], +v_struct struct ); statement ok @@ -36,15 +39,15 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 + create_table_if_not_exists = 'true' ); statement ok INSERT INTO t6 VALUES -(1, 1, 1000, 1.1, 1.11, '1-1', true, '2022-03-11', '2022-03-11 01:00:00Z'::timestamptz, '2022-03-11 01:00:00',1.11), -(2, 2, 2000, 2.2, 2.22, '2-2', false, '2022-03-12', '2022-03-12 02:00:00Z'::timestamptz, '2022-03-12 02:00:00',2.22), -(3, 3, 3000, 3.3, 3.33, '3-3', true, '2022-03-13', '2022-03-13 03:00:00Z'::timestamptz, '2022-03-13 03:00:00','inf'), -(4, 4, 4000, 4.4, 4.44, '4-4', false, '2022-03-14', '2022-03-14 04:00:00Z'::timestamptz, '2022-03-14 04:00:00','-inf'); +(1, 1, 1000, 1.1, 1.11, '1-1', true, '2022-03-11', '2022-03-11 01:00:00Z'::timestamptz, '2022-03-11 01:00:00',1.11, map {1:100,2:200}, array[1,2,3], row(1,2)), +(2, 2, 2000, 2.2, 2.22, '2-2', false, '2022-03-12', '2022-03-12 02:00:00Z'::timestamptz, '2022-03-12 02:00:00',2.22, map {3:300}, array[1,null,3], row(3,null)), +(3, 3, 3000, 3.3, 3.33, '3-3', true, '2022-03-13', '2022-03-13 03:00:00Z'::timestamptz, '2022-03-13 03:00:00','inf', null, null, null), +(4, 4, 4000, 4.4, 4.44, '4-4', false, '2022-03-14', '2022-03-14 04:00:00Z'::timestamptz, '2022-03-14 04:00:00','-inf', null, null, null); statement ok FLUSH; @@ -53,13 +56,37 @@ sleep 5s statement ok INSERT INTO t6 VALUES -(5, 5, 5000, 5.5, 5.55, '5-5', true, '2022-03-15', '2022-03-15 05:00:00Z'::timestamptz, '2022-03-15 05:00:00','nan'); +(5, 5, 5000, 5.5, 5.55, '5-5', true, '2022-03-15', '2022-03-15 05:00:00Z'::timestamptz, '2022-03-15 05:00:00','nan', null, null, null); statement ok FLUSH; sleep 5s +statement ok +CREATE Source 
iceberg_s WITH ( + connector = 'iceberg', + database.name = 'demo_db', + table.name = 'no_partition_append_only_table', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin' +); + +query ?????????????? rowsort +select * from iceberg_s +---- +1 1 1000 1.1 1.11 1-1 t 2022-03-11 2022-03-11 01:00:00+00:00 2022-03-11 01:00:00 1.11000 {1:100,2:200} {1,2,3} (1,2) +2 2 2000 2.2 2.22 2-2 f 2022-03-12 2022-03-12 02:00:00+00:00 2022-03-12 02:00:00 2.22000 {3:300} {1,NULL,3} (3,) +3 3 3000 3.3 3.33 3-3 t 2022-03-13 2022-03-13 03:00:00+00:00 2022-03-13 03:00:00 99999.99999 NULL NULL NULL +4 4 4000 4.4 4.44 4-4 f 2022-03-14 2022-03-14 04:00:00+00:00 2022-03-14 04:00:00 -99999.99999 NULL NULL NULL +5 5 5000 5.5 5.55 5-5 t 2022-03-15 2022-03-15 05:00:00+00:00 2022-03-15 05:00:00 NULL NULL NULL NULL + + statement ok DROP SINK s6; @@ -68,3 +95,6 @@ DROP MATERIALIZED VIEW mv6; statement ok DROP TABLE t6; + +statement ok +DROP SOURCE iceberg_s; diff --git a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt index de96205a2debf..73d953bc2937a 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt index 72f0bce46d183..3a27df42903ee 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt @@ -36,7 +36,6 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt index 2b213a77175bd..39f170a834382 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt index 46670ac362599..f0cf9f5fa3133 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt @@ -36,7 +36,6 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt 
b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt index 5637ce34c940f..f43e2788a020a 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_source_eq_delete.slt b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.slt new file mode 100644 index 0000000000000..820776fb7e773 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.slt @@ -0,0 +1,113 @@ +statement ok +set sink_decouple = false; + +statement ok +set streaming_parallelism=4; + +statement ok +CREATE TABLE s1 (i1 int, i2 varchar, i3 varchar); + +statement ok +CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM s1; + +statement ok +CREATE SINK sink1 AS select * from mv1 WITH ( + connector = 'iceberg', + type = 'upsert', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + create_table_if_not_exists = 'true', + primary_key = 'i1,i2', +); + +statement ok +insert into s1 values(1,'2','3'); + +statement ok +insert into s1 values(7,'8','9'); + +statement ok +insert into s1 values(4,'5','6'); + +statement ok +flush; + +statement ok +delete from s1 where i1 = 7; + +statement ok +flush; + +sleep 5s + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name = 'demo_db', + table.name = 't1', +); + +query I +select * from iceberg_t1_source order by i1; +---- +1 2 3 +4 5 6 + +query I +select i1,i2,i3 from iceberg_t1_source order by i1; +---- +1 2 3 +4 5 6 + +query I +select i3,i2 from iceberg_t1_source order by i2; +---- +3 2 +6 5 + +query I +select i2,i1 from iceberg_t1_source order by i1; +---- +2 1 +5 4 + +query I +select i1 from iceberg_t1_source order by i1; +---- +1 +4 + +query I +select i2 from iceberg_t1_source order by i2; +---- +2 +5 + +query I +select i3 from iceberg_t1_source order by i3; +---- +3 +6 + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE s1 cascade; diff --git a/e2e_test/iceberg/test_case/iceberg_source_eq_delete.toml b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.toml new file mode 100644 index 0000000000000..6e49ca949f501 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'test_case/iceberg_source_eq_delete.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] \ No newline at end of file diff --git a/e2e_test/iceberg/test_case/no_partition_append_only.toml b/e2e_test/iceberg/test_case/no_partition_append_only.toml index 7d2952c508756..9d49b7a29d17f 100644 --- a/e2e_test/iceberg/test_case/no_partition_append_only.toml +++ 
b/e2e_test/iceberg/test_case/no_partition_append_only.toml @@ -13,24 +13,27 @@ init_sqls = [ v_date date, v_timestamp timestamp, v_ts_ntz timestamp_ntz, - v_decimal decimal(10,5) + v_decimal decimal(10,5), + v_map map, + v_array array, + v_struct struct ) USING iceberg TBLPROPERTIES ('format-version'='2'); ''' ] slt = 'test_case/iceberg_sink_no_partition_append_only_table.slt' -verify_schema = ['long', 'int', 'long', 'float', 'double', 'string', 'boolean', 'date', 'timestamp', 'timestamp_ntz','decimal'] +verify_schema = ['long', 'int', 'long', 'float', 'double', 'string', 'boolean', 'date', 'timestamp', 'timestamp_ntz','decimal', 'map', 'array', 'struct'] verify_sql = 'SELECT * FROM demo_db.no_partition_append_only_table ORDER BY id ASC' verify_data = """ -1,1,1000,1.1,1.11,1-1,true,2022-03-11,2022-03-11 01:00:00+00:00,2022-03-11 01:00:00,1.11 -2,2,2000,2.2,2.22,2-2,false,2022-03-12,2022-03-12 02:00:00+00:00,2022-03-12 02:00:00,2.22 -3,3,3000,3.3,3.33,3-3,true,2022-03-13,2022-03-13 03:00:00+00:00,2022-03-13 03:00:00,99999.99999 -4,4,4000,4.4,4.44,4-4,false,2022-03-14,2022-03-14 04:00:00+00:00,2022-03-14 04:00:00,-99999.99999 -5,5,5000,5.5,5.55,5-5,true,2022-03-15,2022-03-15 05:00:00+00:00,2022-03-15 05:00:00,none +1,1,1000,1.1,1.11,1-1,true,2022-03-11,2022-03-11 01:00:00+00:00,2022-03-11 01:00:00,1.11,{1: 100, 2: 200},[1, 2, 3],Row(a=1, b=2) +2,2,2000,2.2,2.22,2-2,false,2022-03-12,2022-03-12 02:00:00+00:00,2022-03-12 02:00:00,2.22,{3: 300},[1, None, 3],Row(a=3, b=None) +3,3,3000,3.3,3.33,3-3,true,2022-03-13,2022-03-13 03:00:00+00:00,2022-03-13 03:00:00,99999.99999,None,None,None +4,4,4000,4.4,4.44,4-4,false,2022-03-14,2022-03-14 04:00:00+00:00,2022-03-14 04:00:00,-99999.99999,None,None,None +5,5,5000,5.5,5.55,5-5,true,2022-03-15,2022-03-15 05:00:00+00:00,2022-03-15 05:00:00,none,None,None,None """ verify_slt = 'test_case/iceberg_sink_no_partition_append_only_table_verify.slt' diff --git a/e2e_test/iceberg/test_case/no_partition_upsert.toml b/e2e_test/iceberg/test_case/no_partition_upsert.toml index 24444e025f6fe..0c5d63e88216e 100644 --- a/e2e_test/iceberg/test_case/no_partition_upsert.toml +++ b/e2e_test/iceberg/test_case/no_partition_upsert.toml @@ -15,7 +15,7 @@ init_sqls = [ slt = 'test_case/iceberg_sink_no_partition_upsert_table.slt' -verify_schema = ['int','int','long','string'] +verify_schema = ['int','int','long','string','date'] verify_sql = 'SELECT * FROM demo_db.no_partition_upsert_table ORDER BY id, v1 ASC' diff --git a/e2e_test/iceberg/test_case/partition_upsert.toml b/e2e_test/iceberg/test_case/partition_upsert.toml index 38e6455fa9b0a..52cb1c40ea344 100644 --- a/e2e_test/iceberg/test_case/partition_upsert.toml +++ b/e2e_test/iceberg/test_case/partition_upsert.toml @@ -16,7 +16,7 @@ init_sqls = [ slt = 'test_case/iceberg_sink_partition_upsert_table.slt' -verify_schema = ['int','int','long','string'] +verify_schema = ['int','int','long','string', 'date'] verify_sql = 'SELECT * FROM demo_db.partition_upsert_table ORDER BY id, v1 ASC' diff --git a/e2e_test/iceberg/test_case/range_partition_upsert.toml b/e2e_test/iceberg/test_case/range_partition_upsert.toml index 0e63c4218eadc..ceea071d9c8a2 100644 --- a/e2e_test/iceberg/test_case/range_partition_upsert.toml +++ b/e2e_test/iceberg/test_case/range_partition_upsert.toml @@ -16,7 +16,7 @@ init_sqls = [ slt = 'test_case/iceberg_sink_range_partition_upsert_table.slt' -verify_schema = ['int','int','long','string'] +verify_schema = ['int','int','long','string','date'] verify_sql = 'SELECT * FROM demo_db.range_partition_upsert_table 
ORDER BY id, v1 ASC' diff --git a/e2e_test/s3/fs_parquet_source_and_sink.py b/e2e_test/s3/fs_parquet_source_and_sink.py index 3ae00d3fcee15..6425ef1d3a9d6 100644 --- a/e2e_test/s3/fs_parquet_source_and_sink.py +++ b/e2e_test/s3/fs_parquet_source_and_sink.py @@ -116,6 +116,7 @@ def _table(): return 's3_test_parquet' # Execute a SELECT statement + cur.execute(f'''set sink_decouple = false;''') cur.execute(f'''CREATE sink test_file_sink as select id, name, @@ -137,7 +138,7 @@ def _table(): s3.bucket_name = '{config['S3_BUCKET']}', s3.credentials.access = '{config['S3_ACCESS_KEY']}', s3.credentials.secret = '{config['S3_SECRET_KEY']}', - s3.endpoint_url = 'https://{config['S3_ENDPOINT']}' + s3.endpoint_url = 'https://{config['S3_ENDPOINT']}', s3.path = '', s3.file_type = 'parquet', type = 'append-only', diff --git a/e2e_test/sink/clickhouse_sink.slt b/e2e_test/sink/clickhouse_sink.slt index e037618bb460e..e5bac0d8d521d 100644 --- a/e2e_test/sink/clickhouse_sink.slt +++ b/e2e_test/sink/clickhouse_sink.slt @@ -17,7 +17,6 @@ CREATE SINK s6 AS select mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, mv6.v4 as v4, clickhouse.password = '', clickhouse.database = 'default', clickhouse.table='demo_test', - commit_checkpoint_interval = 1, ); statement ok diff --git a/e2e_test/sink/create_sink_as.slt b/e2e_test/sink/create_sink_as.slt index 5c66c5623553e..dc6d0f61419c6 100644 --- a/e2e_test/sink/create_sink_as.slt +++ b/e2e_test/sink/create_sink_as.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t4 (v1 int primary key, v2 int); diff --git a/e2e_test/sink/deltalake_rust_sink.slt b/e2e_test/sink/deltalake_rust_sink.slt index 74dca623a9d0a..cb9f9e7817212 100644 --- a/e2e_test/sink/deltalake_rust_sink.slt +++ b/e2e_test/sink/deltalake_rust_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamptz, v9 boolean, v10 decimal, v11 decimal[]); diff --git a/e2e_test/sink/doris_sink.slt b/e2e_test/sink/doris_sink.slt index 3242206badaea..3e6a4aca9d9f6 100644 --- a/e2e_test/sink/doris_sink.slt +++ b/e2e_test/sink/doris_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamp, v9 boolean, v10 jsonb); diff --git a/e2e_test/sink/iceberg_sink.slt b/e2e_test/sink/iceberg_sink.slt index e3917908f651b..b08abd8a4918c 100644 --- a/e2e_test/sink/iceberg_sink.slt +++ b/e2e_test/sink/iceberg_sink.slt @@ -31,7 +31,6 @@ CREATE SINK s6 AS select mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3 from mv6 WITH catalog.type = 'storage', database.name='demo_db', table.name='e2e_demo_table', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/sink/license.slt b/e2e_test/sink/license.slt index e38470d1c70d7..6e65b3653a536 100644 --- a/e2e_test/sink/license.slt +++ b/e2e_test/sink/license.slt @@ -7,32 +7,6 @@ ALTER SYSTEM SET license_key TO ''; statement ok CREATE TABLE t (k INT); -statement error -CREATE SINK file_sink -FROM - t -WITH -( - connector = 's3', - s3.region_name = 'us-east-1', - s3.bucket_name = 'test', - s3.path = '', - s3.file_type = 'parquet', - type = 'append-only', - force_append_only='true' -) FORMAT PLAIN ENCODE PARQUET(force_append_only='true'); ----- -db error: ERROR: Failed to run the query - -Caused by these errors (recent errors listed first): - 1: gRPC request to meta service failed: Internal 
error - 2: failed to validate sink - 3: Internal error - 4: feature FileSink is only available for tier Paid and above, while the current tier is Free - -Hint: You may want to set a license key with `ALTER SYSTEM SET license_key = '...';` command. - - statement error CREATE SINK dynamodb_sink FROM diff --git a/e2e_test/sink/mongodb_sink.slt b/e2e_test/sink/mongodb_sink.slt index 2122993e3003a..ddc5a91a20c3f 100644 --- a/e2e_test/sink/mongodb_sink.slt +++ b/e2e_test/sink/mongodb_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t1( a smallint, diff --git a/e2e_test/sink/redis_cluster_sink.slt b/e2e_test/sink/redis_cluster_sink.slt index 03d197485777a..3effd7795d039 100644 --- a/e2e_test/sink/redis_cluster_sink.slt +++ b/e2e_test/sink/redis_cluster_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 int); diff --git a/e2e_test/sink/redis_sink.slt b/e2e_test/sink/redis_sink.slt index 7475a80ae696e..8828c22b80d27 100644 --- a/e2e_test/sink/redis_sink.slt +++ b/e2e_test/sink/redis_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamptz, v9 boolean); diff --git a/e2e_test/sink/remote/types.slt b/e2e_test/sink/remote/types.slt index f2421eabec906..e511d5e6a6ee7 100644 --- a/e2e_test/sink/remote/types.slt +++ b/e2e_test/sink/remote/types.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t5 (v1 smallint primary key, v2 int, v3 bigint, v4 float, v5 double, v6 decimal, v7 varchar, v8 timestamp, v9 boolean); diff --git a/e2e_test/sink/sqlserver_sink.slt b/e2e_test/sink/sqlserver_sink.slt index 156b8b865ffc8..08bbd3364ed9a 100644 --- a/e2e_test/sink/sqlserver_sink.slt +++ b/e2e_test/sink/sqlserver_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t_many_data_type_rw ( k1 int, k2 int, diff --git a/e2e_test/sink/starrocks_sink.slt b/e2e_test/sink/starrocks_sink.slt index dedb01755cbbe..0aceac592618a 100644 --- a/e2e_test/sink/starrocks_sink.slt +++ b/e2e_test/sink/starrocks_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamp, v9 boolean, v10 jsonb, v11 decimal); diff --git a/e2e_test/source/basic/kafka.slt b/e2e_test/source/basic/kafka.slt index 40e9b46036112..0e413c3389d58 100644 --- a/e2e_test/source/basic/kafka.slt +++ b/e2e_test/source/basic/kafka.slt @@ -498,6 +498,26 @@ FORMAT DEBEZIUM ENCODE JSON ( ignore_key = 'true' ) +statement error INCLUDE payload is only allowed when using ENCODE JSON, but got ENCODE Bytes +CREATE TABLE test_include_payload (a bytea) +INCLUDE payload +WITH ( + connector = 'kafka', + topic = 'kafka_1_partition_topic', + properties.bootstrap.server = 'message_queue:29092', + scan.startup.mode = 'earliest' +) FORMAT PLAIN ENCODE BYTES + +statement ok +CREATE TABLE test_include_payload (v1 int, v2 varchar) +INCLUDE payload +WITH ( + connector = 'kafka', + topic = 'kafka_1_partition_topic', + properties.bootstrap.server = 'message_queue:29092', + scan.startup.mode = 'earliest' +) FORMAT PLAIN ENCODE JSON + statement ok flush; @@ -512,6 +532,13 @@ select v1, v2 from t0; 3 333 4 4444 +query ITT rowsort +select v1, v2, _rw_kafka_payload from test_include_payload; +---- +1 1 {"v1": 1, "v2": "1"} +2 22 
{"v1": 2, "v2": "22"} +3 333 {"v1": 3, "v2": "333"} +4 4444 {"v1": 4, "v2": "4444"} query IT rowsort select v1, v2 from s0; @@ -916,3 +943,6 @@ drop table source_with_rdkafka_props; statement ok drop table debezium_ignore_key; + +statement ok +drop table test_include_payload; diff --git a/e2e_test/source/opendal/posix_fs.slt b/e2e_test/source/opendal/posix_fs.slt index 3fc572a1a1cc8..1bf026aed2744 100644 --- a/e2e_test/source/opendal/posix_fs.slt +++ b/e2e_test/source/opendal/posix_fs.slt @@ -2,21 +2,22 @@ statement ok SET RW_IMPLICIT_FLUSH TO true; statement ok -CREATE TABLE diamonds ( +CREATE TABLE diamonds_recursive_read ( carat FLOAT, cut TEXT, color TEXT, depth FLOAT, ) WITH ( - connector = 'posix_fs', - match_pattern = 'data*.csv', - posix_fs.root = 'e2e_test/source/opendal/data', + connector = 'posix_fs', + match_pattern = 'data*.csv', + posix_fs.root = 'e2e_test/source/opendal/data', + recursive_scan = 'true', ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); sleep 10s query TTTT rowsort -select * from diamonds; +select * from diamonds_recursive_read; ---- 0.22 Premium I 62 0.23 Very Good H 57.5 @@ -29,5 +30,26 @@ select * from diamonds; 1.28 Good J 63.1 1.3 Fair E 64.7 +statement ok +CREATE TABLE diamonds ( + carat FLOAT, + cut TEXT, + color TEXT, + depth FLOAT, +) WITH ( + connector = 'posix_fs', + match_pattern = 'data*.csv', + posix_fs.root = 'e2e_test/source/opendal', +) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); + +sleep 10s + +query TTTT rowsort +select * from diamonds; +---- + statement ok DROP TABLE diamonds; + +statement ok +DROP TABLE diamonds_recursive_read; diff --git a/e2e_test/source_inline/kafka/protobuf/recover.slt b/e2e_test/source_inline/kafka/protobuf/recover.slt new file mode 100644 index 0000000000000..3babf26793f2a --- /dev/null +++ b/e2e_test/source_inline/kafka/protobuf/recover.slt @@ -0,0 +1,97 @@ +control substitution on + +system ok +rpk topic create 'test-pb-struct' + + +system ok +jq -sR '{"schema":.,"schemaType":"PROTOBUF"}' << EOF | curl -X POST -H 'content-type: application/json' -d @- "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value/versions" +syntax = "proto3"; +package test; +message User { + int32 id = 1; + Name name = 2; +} +message Name { + string first_name = 1; + string last_name = 2; +} +EOF + + +# create a source with v1 schema +statement ok +create source s with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test-pb-struct') +format plain encode protobuf ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', + message = 'test.User'); + + +# register a v2 schema +system ok +jq -sR '{"schema":.,"schemaType":"PROTOBUF"}' << EOF | curl -X POST -H 'content-type: application/json' -d @- "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value/versions" +syntax = "proto3"; +package test; +message User { + int32 id = 1; + Name name = 2; +} +message Name { + string first_name = 1; + string last_name = 2; + string middle_name = 3; +} +EOF + + +# trigger recovery +statement ok +recover; + + +sleep 2s + + +# produce a v2 message +statement ok +create sink sk as select + 1 as id, + row('Alan', 'Turing', 'Mathison')::struct as name +with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test-pb-struct') +format plain encode protobuf ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', + message = 'test.User'); + + +sleep 1s + + +# reading as v1 shall not panic +query IT +select * from s; +---- +1 (Alan,Turing) + + +statement ok +drop sink sk; + + +statement ok +drop source 
s; + + +system ok +curl -X DELETE "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value" + + +system ok +curl -X DELETE "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value?permanent=true" + + +system ok +rpk topic delete 'test-pb-struct' diff --git a/e2e_test/source_inline/kafka/shared_source.slt b/e2e_test/source_inline/kafka/shared_source.slt index c481e609ffccd..5d1072df2cfaa 100644 --- a/e2e_test/source_inline/kafka/shared_source.slt +++ b/e2e_test/source_inline/kafka/shared_source.slt @@ -6,6 +6,29 @@ SET rw_enable_shared_source TO true; system ok rpk topic create shared_source -p 4 +# Test create source before produing data. +statement ok +create source s_before_produce (v1 int, v2 varchar) with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'shared_source', + scan.startup.mode = 'earliest' +) FORMAT PLAIN ENCODE JSON; + +statement ok +create materialized view mv_before_produce as select * from s_before_produce; + +sleep 2s + +# All partitions starts with backfill_info: NoDataToBackfill, so it finishes immediately. +system ok +internal_table.mjs --name mv_before_produce --type sourcebackfill +---- +0,"""Finished""" +1,"""Finished""" +2,"""Finished""" +3,"""Finished""" + + system ok cat << EOF | rpk topic produce shared_source -f "%p %v\n" -p 0 0 {"v1": 1, "v2": "a"} @@ -21,7 +44,7 @@ create source s0 (v1 int, v2 varchar) with ( scan.startup.mode = 'earliest' ) FORMAT PLAIN ENCODE JSON; -query I +query ? select count(*) from rw_internal_tables where name like '%s0%'; ---- 1 @@ -41,21 +64,24 @@ create materialized view mv_1 as select * from s0; # Wait enough time to ensure SourceExecutor consumes all Kafka data. sleep 2s -# SourceExecutor's ingestion started, but it only starts from latest. +# SourceExecutor's ingestion started, but it only starts from latest (offset 1). system ok internal_table.mjs --name s0 --type source ---- (empty) -# offset 0 must be backfilled, not from upstream. +# SourceBackfill starts from offset 0, with backfill_info: HasDataToBackfill { latest_offset: "0" } (decided by kafka high watermark). +# (meaning upstream already consumed offset 0, so we only need to backfill to offset 0) +# After backfilling offset 0, it enters SourceCachingUp state. Now the backfill is finished. +# We wait for SourceExecutor to produce offset > 0. system ok internal_table.mjs --name mv_1 --type sourcebackfill ---- -0,"{""Backfilling"": ""0""}" -1,"{""Backfilling"": ""0""}" -2,"{""Backfilling"": ""0""}" -3,"{""Backfilling"": ""0""}" +0,"{""SourceCachingUp"": ""0""}" +1,"{""SourceCachingUp"": ""0""}" +2,"{""SourceCachingUp"": ""0""}" +3,"{""SourceCachingUp"": ""0""}" # This does not affect the behavior for CREATE MATERIALIZED VIEW below. It also uses the shared source, and creates SourceBackfillExecutor. @@ -67,7 +93,7 @@ create materialized view mv_2 as select * from s0; sleep 2s -query IT rowsort +query ?? rowsort select v1, v2 from s0; ---- 1 a @@ -75,7 +101,7 @@ select v1, v2 from s0; 3 c 4 d -query IT rowsort +query ?? rowsort select v1, v2 from mv_1; ---- 1 a @@ -83,7 +109,7 @@ select v1, v2 from mv_1; 3 c 4 d -query IT rowsort +query ?? rowsort select v1, v2 from mv_2; ---- 1 a @@ -111,7 +137,7 @@ internal_table.mjs --name s0 --type source 3,"{""split_info"": {""partition"": 3, ""start_offset"": 1, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" -query IT rowsort +query ?? rowsort select v1, v2 from s0; ---- 1 a @@ -123,7 +149,7 @@ select v1, v2 from s0; 4 d 4 dd -query IT rowsort +query ?? 
rowsort select v1, v2 from mv_1; ---- 1 a @@ -146,18 +172,14 @@ internal_table.mjs --name s0 --type source 3,"{""split_info"": {""partition"": 3, ""start_offset"": 1, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" -# The result is non-deterministic: -# If the upstream row comes before the backfill row, it will be ignored, and the result state is "{""Backfilling"": ""1""}". -# If the upstream row comes after the backfill row, the result state is Finished. -# Uncomment below and run manually to see the result. - -# system ok -# internal_table.mjs --name mv_1 --type sourcebackfill -# ---- -# 0,"{""Finished""}" -# 1,"{""Finished""}" -# 2,"{""Finished""}" -# 3,"{""Finished""}" +# Transition from SourceCachingUp to Finished after consuming one upstream message. +system ok +internal_table.mjs --name mv_1 --type sourcebackfill +---- +0,"""Finished""" +1,"""Finished""" +2,"""Finished""" +3,"""Finished""" system ok @@ -173,7 +195,7 @@ done sleep 3s -query IT rowsort +query ?? rowsort select v1, count(*) from s0 group by v1; ---- 1 12 @@ -181,7 +203,7 @@ select v1, count(*) from s0 group by v1; 3 12 4 12 -query IT rowsort +query ?? rowsort select v1, count(*) from mv_1 group by v1; ---- 1 12 @@ -189,6 +211,14 @@ select v1, count(*) from mv_1 group by v1; 3 12 4 12 +query ?? rowsort +select v1, count(*) from mv_before_produce group by v1; +---- +1 12 +2 12 +3 12 +4 12 + # start_offset changed to 11 system ok @@ -200,15 +230,89 @@ internal_table.mjs --name s0 --type source 3,"{""split_info"": {""partition"": 3, ""start_offset"": 11, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" -# Now it is highly probable that all partitions have finished. -system ok -internal_table.mjs --name mv_1 --type sourcebackfill ----- -0,"""Finished""" -1,"""Finished""" -2,"""Finished""" -3,"""Finished""" +# # Note: the parallelism depends on the risedev profile. +# # So scale tests below are commented out. + +# query ??? +# select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name; +# ---- +# mv_1 {MVIEW,SOURCE_SCAN} 5 +# mv_2 {MVIEW,SOURCE_SCAN} 5 +# s0 {SOURCE} 5 + + +# system ok +# risectl meta source-split-info --ignore-id +# ---- +# Table +# Fragment (Source) +# Actor (1 splits): [0] +# Actor (1 splits): [2] +# Actor (1 splits): [3] +# Actor (1 splits): [1] +# Actor (0 splits): [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] + + +# # scale down +# statement ok +# ALTER MATERIALIZED VIEW mv_1 SET PARALLELISM TO 2; + +# # should have no effect, because of NoShuffle +# # TODO: support ALTER SOURCE SET PARALLELISM, then we can +# query ??? 
+# select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name; +# ---- +# mv_1 {MVIEW,SOURCE_SCAN} 5 +# mv_2 {MVIEW,SOURCE_SCAN} 5 +# s0 {SOURCE} 5 + +# system ok +# risectl meta source-split-info --ignore-id +# ---- +# Table +# Fragment (Source) +# Actor (1 splits): [0] +# Actor (1 splits): [2] +# Actor (1 splits): [3] +# Actor (1 splits): [1] +# Actor (0 splits): [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] +# Table +# Fragment (SourceScan) +# Actor (1 splits): [0] <- Upstream Actor #1055: [0] +# Actor (1 splits): [2] <- Upstream Actor #1056: [2] +# Actor (1 splits): [3] <- Upstream Actor #1057: [3] +# Actor (1 splits): [1] <- Upstream Actor #1058: [1] +# Actor (0 splits): [] <- Upstream Actor #1059: [] + + +# # Manual test: change the parallelism of the compute node, kill and restart, and check +# # risedev ctl meta source-split-info --ignore-id +# # risedev psql -c "select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name;" statement ok drop source s0 cascade; + +statement ok +drop source s_before_produce cascade; diff --git a/e2e_test/time_travel/syntax.slt b/e2e_test/time_travel/syntax.slt index 6c3408a276763..5895f6d9b9e8b 100644 --- a/e2e_test/time_travel/syntax.slt +++ b/e2e_test/time_travel/syntax.slt @@ -7,6 +7,10 @@ SET QUERY_MODE TO local; statement ok CREATE TABLE t (k INT); +query I +SELECT * FROM t; +---- + query error SELECT * FROM t FOR SYSTEM_TIME AS OF 963716300; ---- diff --git a/integration_tests/big-query-sink/create_sink.sql b/integration_tests/big-query-sink/create_sink.sql index a41fe0243120d..01fb5e340d545 100644 --- a/integration_tests/big-query-sink/create_sink.sql +++ b/integration_tests/big-query-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + -- create sink with local file CREATE SINK bhv_big_query_sink FROM diff --git a/integration_tests/cassandra-and-scylladb-sink/create_sink.sql b/integration_tests/cassandra-and-scylladb-sink/create_sink.sql index a0a305aebd0e0..fdda994d01427 100644 --- a/integration_tests/cassandra-and-scylladb-sink/create_sink.sql +++ b/integration_tests/cassandra-and-scylladb-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_cassandra_sink FROM bhv_mv WITH ( diff --git a/integration_tests/clickhouse-sink/create_sink.sql b/integration_tests/clickhouse-sink/create_sink.sql index 5f730ed6ff910..b913a246b286e 100644 --- a/integration_tests/clickhouse-sink/create_sink.sql +++ b/integration_tests/clickhouse-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_clickhouse_sink FROM bhv_mv WITH ( diff --git a/integration_tests/deltalake-sink/create_sink.sql b/integration_tests/deltalake-sink/create_sink.sql index f42b09d726e56..17c1c44aea255 100644 --- a/integration_tests/deltalake-sink/create_sink.sql +++ b/integration_tests/deltalake-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create sink delta_lake_sink from source with ( connector = 'deltalake', diff --git a/integration_tests/doris-sink/create_sink.sql b/integration_tests/doris-sink/create_sink.sql index d4702219fed09..d6b28148c083d 100644 --- a/integration_tests/doris-sink/create_sink.sql +++ 
b/integration_tests/doris-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create secret doris_secret with (backend = 'meta') as '123456'; CREATE SINK bhv_doris_sink diff --git a/integration_tests/dynamodb/create_sink.sql b/integration_tests/dynamodb/create_sink.sql index 6de71404a9da1..43cb2be6d1447 100644 --- a/integration_tests/dynamodb/create_sink.sql +++ b/integration_tests/dynamodb/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK dyn_sink FROM movies diff --git a/integration_tests/elasticsearch-sink/create_sink.sql b/integration_tests/elasticsearch-sink/create_sink.sql index 07046507d117d..f72f8f0e6ec3b 100644 --- a/integration_tests/elasticsearch-sink/create_sink.sql +++ b/integration_tests/elasticsearch-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_es7_sink FROM bhv_mv WITH ( diff --git a/integration_tests/kafka-cdc-sink/create_sink.sql b/integration_tests/kafka-cdc-sink/create_sink.sql index 349aac0ca9b0a..0c25553adebba 100644 --- a/integration_tests/kafka-cdc-sink/create_sink.sql +++ b/integration_tests/kafka-cdc-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK IF NOT EXISTS counts_sink FROM counts WITH ( diff --git a/integration_tests/mqtt/create_sink.sql b/integration_tests/mqtt/create_sink.sql index 69b6886943944..27b84aa354250 100644 --- a/integration_tests/mqtt/create_sink.sql +++ b/integration_tests/mqtt/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK mqtt_sink FROM personnel diff --git a/integration_tests/mysql-sink/create_sink.sql b/integration_tests/mysql-sink/create_sink.sql index 9776360df2914..f73b92e8ce259 100644 --- a/integration_tests/mysql-sink/create_sink.sql +++ b/integration_tests/mysql-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK target_count_mysql_sink FROM target_count WITH ( diff --git a/integration_tests/nats/create_sink.sql b/integration_tests/nats/create_sink.sql index beee01afcecfb..fda1ab1c77621 100644 --- a/integration_tests/nats/create_sink.sql +++ b/integration_tests/nats/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE TABLE personnel (id integer, name varchar); diff --git a/integration_tests/postgres-sink/create_sink.sql b/integration_tests/postgres-sink/create_sink.sql index 5041f1a36b741..ec76f16ac3037 100644 --- a/integration_tests/postgres-sink/create_sink.sql +++ b/integration_tests/postgres-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK target_count_postgres_sink FROM target_count WITH ( diff --git a/integration_tests/redis-sink/create_sink.sql b/integration_tests/redis-sink/create_sink.sql index 61ffb67326227..f88a68aca2110 100644 --- a/integration_tests/redis-sink/create_sink.sql +++ b/integration_tests/redis-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_redis_sink_1 FROM bhv_mv WITH ( diff --git a/integration_tests/starrocks-sink/create_sink.sql b/integration_tests/starrocks-sink/create_sink.sql index 8d7ebf98dfb20..7cfe69ef21973 100644 --- a/integration_tests/starrocks-sink/create_sink.sql +++ b/integration_tests/starrocks-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create secret starrocks_secret with (backend = 'meta') as '123456'; CREATE SINK bhv_starrocks_sink_primary diff --git a/integration_tests/twitter-pulsar/pb/create_source.sql b/integration_tests/twitter-pulsar/pb/create_source.sql index bf41939b40d91..22c4927ab3bb9 100644 --- 
a/integration_tests/twitter-pulsar/pb/create_source.sql +++ b/integration_tests/twitter-pulsar/pb/create_source.sql @@ -1,5 +1,6 @@ CREATE SOURCE twitter WITH ( connector = 'pulsar', pulsar.topic = 'twitter', - pulsar.service.url = 'pulsar://message_queue:6650' + pulsar.service.url = 'pulsar://message_queue:6650', + subscription.name.prefix = 'custom_prefix' ) ROW FORMAT PROTOBUF MESSAGE 'twitter.schema.Event' ROW SCHEMA LOCATION 'http://file_server:8080/schema'; \ No newline at end of file diff --git a/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java b/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java index fb8aa62916f60..8ba569c7aea72 100644 --- a/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java +++ b/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java @@ -54,6 +54,7 @@ public class DbzConnectorConfig { public static final String PG_PUB_NAME = "publication.name"; public static final String PG_PUB_CREATE = "publication.create.enable"; public static final String PG_SCHEMA_NAME = "schema.name"; + public static final String PG_SSL_ROOT_CERT = "ssl.root.cert"; /* Sql Server configs */ public static final String SQL_SERVER_SCHEMA_NAME = "schema.name"; @@ -211,6 +212,10 @@ public DbzConnectorConfig( LOG.info("Disable table filtering for the shared Postgres source"); dbzProps.remove("table.include.list"); } + + if (userProps.containsKey(PG_SSL_ROOT_CERT)) { + dbzProps.setProperty("database.sslrootcert", userProps.get(PG_SSL_ROOT_CERT)); + } } else if (source == SourceTypeE.CITUS) { var postgresProps = initiateDbConfig(POSTGRES_CONFIG_FILE, substitutor); diff --git a/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties b/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties index 06c4210fcf468..c36b62a7aa531 100644 --- a/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties +++ b/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties @@ -7,6 +7,7 @@ database.port=${port} database.user=${username} database.password=${password} database.dbname=${database.name} +database.sslmode=${ssl.mode:-prefer} table.include.list=${schema.name}.${table.name} # The name of the PostgreSQL replication slot slot.name=${slot.name} diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java index 10aa371c50aec..02297a4ea57dd 100644 --- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java +++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSink.java @@ -71,12 +71,13 @@ public JDBCSink(JDBCSinkConfig config, TableSchema tableSchema) { .collect(Collectors.toList()); LOG.info( - "schema = {}, table = {}, tableSchema = {}, columnSqlTypes = {}, pkIndices = {}", + "schema = {}, table = {}, tableSchema = {}, columnSqlTypes = {}, pkIndices = {}, queryTimeout = {}", config.getSchemaName(), config.getTableName(), tableSchema, columnSqlTypes, - pkIndices); + pkIndices, + config.getQueryTimeout()); if (factory.isPresent()) { this.jdbcDialect = factory.get().create(columnSqlTypes, 
pkIndices); @@ -92,7 +93,7 @@ public JDBCSink(JDBCSinkConfig config, TableSchema tableSchema) { // Commit the `getTransactionIsolation` conn.commit(); - jdbcStatements = new JdbcStatements(conn); + jdbcStatements = new JdbcStatements(conn, config.getQueryTimeout()); } catch (SQLException e) { throw Status.INTERNAL .withDescription( @@ -173,7 +174,7 @@ public boolean write(Iterable rows) { conn = JdbcUtils.getConnection(config.getJdbcUrl()); // reset the flag since we will retry to prepare the batch again updateFlag = false; - jdbcStatements = new JdbcStatements(conn); + jdbcStatements = new JdbcStatements(conn, config.getQueryTimeout()); } else { throw io.grpc.Status.INTERNAL .withDescription( @@ -206,13 +207,15 @@ public boolean write(Iterable rows) { * across multiple batches if only the JDBC connection is valid. */ class JdbcStatements implements AutoCloseable { + private final int queryTimeoutSecs; private PreparedStatement deleteStatement; private PreparedStatement upsertStatement; private PreparedStatement insertStatement; private final Connection conn; - public JdbcStatements(Connection conn) throws SQLException { + public JdbcStatements(Connection conn, int queryTimeoutSecs) throws SQLException { + this.queryTimeoutSecs = queryTimeoutSecs; this.conn = conn; var schemaTableName = jdbcDialect.createSchemaTableName( @@ -339,6 +342,9 @@ private void executeStatement(PreparedStatement stmt) throws SQLException { if (stmt == null) { return; } + // if timeout occurs, a SQLTimeoutException will be thrown + // and we will retry to write the stream chunk in `JDBCSink.write` + stmt.setQueryTimeout(queryTimeoutSecs); LOG.debug("Executing statement: {}", stmt); stmt.executeBatch(); stmt.clearParameters(); diff --git a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java index ca74ac6a8eb74..94eb5cdc7e0ff 100644 --- a/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java +++ b/java/connector-node/risingwave-sink-jdbc/src/main/java/com/risingwave/connector/JDBCSinkConfig.java @@ -32,6 +32,9 @@ public class JDBCSinkConfig extends CommonSinkConfig { @JsonProperty(value = "schema.name") private String schemaName; + @JsonProperty(value = "jdbc.query.timeout") + private int queryTimeoutSeconds = 600; + @JsonCreator public JDBCSinkConfig( @JsonProperty(value = "jdbc.url") String jdbcUrl, @@ -62,4 +65,8 @@ public String getSinkType() { public boolean isUpsertSink() { return this.isUpsertSink; } + + public int getQueryTimeout() { + return queryTimeoutSeconds; + } } diff --git a/lints/Cargo.lock b/lints/Cargo.lock index e3b748e6da670..aa1e1e4ef9b32 100644 --- a/lints/Cargo.lock +++ b/lints/Cargo.lock @@ -162,7 +162,8 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clippy_config" -version = "0.1.80" +version = "0.1.81" +source = "git+https://github.com/risingwavelabs/clippy?rev=5135d0218365e85f3371405b604a7fb1459eb256#5135d0218365e85f3371405b604a7fb1459eb256" dependencies = [ "rustc-semver", "serde", @@ -171,12 +172,14 @@ dependencies = [ [[package]] name = "clippy_utils" -version = "0.1.80" +version = "0.1.81" +source = "git+https://github.com/risingwavelabs/clippy?rev=5135d0218365e85f3371405b604a7fb1459eb256#5135d0218365e85f3371405b604a7fb1459eb256" dependencies = [ "arrayvec", "clippy_config", "itertools", "rustc-semver", + "rustc_apfloat", ] [[package]] @@ 
-869,6 +872,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5be1bdc7edf596692617627bbfeaba522131b18e06ca4df2b6b689e3c5d5ce84" +[[package]] +name = "rustc_apfloat" +version = "0.2.1+llvm-462a31f5a5ab" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "886d94c63c812a8037c4faca2607453a0fa4cf82f734665266876b022244543f" +dependencies = [ + "bitflags 1.3.2", + "smallvec", +] + [[package]] name = "rustfix" version = "0.6.1" @@ -975,6 +988,12 @@ dependencies = [ "digest", ] +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + [[package]] name = "syn" version = "2.0.39" diff --git a/lints/Cargo.toml b/lints/Cargo.toml index 43ece1f6fc5b7..e0b8fe5d96664 100644 --- a/lints/Cargo.toml +++ b/lints/Cargo.toml @@ -14,7 +14,7 @@ path = "ui/format_error.rs" # See `README.md` before bumping the version. # Remember to update the version in `ci/Dockerfile` as well. [dependencies] -clippy_utils = { git = "https://github.com/risingwavelabs/clippy", rev = "5e2a7c6adebdb0478ee6d5b67ab4ee94153b2997" } +clippy_utils = { git = "https://github.com/risingwavelabs/clippy", rev = "61e1d2fd7062e46ccf1237707ee6da5aac018f70" } dylint_linting = "3.1.0" itertools = "0.12" diff --git a/lints/rust-toolchain b/lints/rust-toolchain index a146af66cd637..31dbc57d04b2b 100644 --- a/lints/rust-toolchain +++ b/lints/rust-toolchain @@ -1,5 +1,5 @@ # See `README.md` before bumping the version. [toolchain] -channel = "nightly-2024-06-06" +channel = "nightly-2024-07-19" components = ["llvm-tools-preview", "rustc-dev"] diff --git a/proto/connector_service.proto b/proto/connector_service.proto index 964d227452548..99d9c58d4f1ed 100644 --- a/proto/connector_service.proto +++ b/proto/connector_service.proto @@ -229,9 +229,15 @@ message CoordinateRequest { SinkMetadata metadata = 2; } + message UpdateVnodeBitmapRequest { + common.Buffer vnode_bitmap = 1; + } + oneof msg { StartCoordinationRequest start_request = 1; CommitRequest commit_request = 2; + UpdateVnodeBitmapRequest update_vnode_request = 3; + bool stop = 4; } } diff --git a/proto/expr.proto b/proto/expr.proto index e5b5fb73ba8ff..53bba96cc587b 100644 --- a/proto/expr.proto +++ b/proto/expr.proto @@ -282,6 +282,7 @@ message ExprNode { JSONB_POPULATE_RECORD = 629; JSONB_TO_RECORD = 630; JSONB_SET = 631; + JSONB_POPULATE_MAP = 632; // Map functions MAP_FROM_ENTRIES = 700; diff --git a/proto/hummock.proto b/proto/hummock.proto index 19b7e036c9686..7956b4515dce8 100644 --- a/proto/hummock.proto +++ b/proto/hummock.proto @@ -104,6 +104,11 @@ message GroupTableChange { message GroupDestroy {} +message GroupMerge { + uint64 left_group_id = 1; + uint64 right_group_id = 2; +} + message GroupDelta { oneof delta_type { IntraLevelDelta intra_level = 1; @@ -111,6 +116,7 @@ message GroupDelta { GroupDestroy group_destroy = 3; GroupMetaChange group_meta_change = 4 [deprecated = true]; GroupTableChange group_table_change = 5 [deprecated = true]; + GroupMerge group_merge = 6; } } @@ -744,6 +750,7 @@ message PinVersionResponse { message SplitCompactionGroupRequest { uint64 group_id = 1; repeated uint32 table_ids = 2; + uint32 partition_vnode_count = 3; } message SplitCompactionGroupResponse { @@ -833,12 +840,20 @@ message CancelCompactTaskResponse { message GetVersionByEpochRequest { uint64 epoch = 1; + uint32 table_id = 2; } message GetVersionByEpochResponse { HummockVersion 
version = 1; } +message MergeCompactionGroupRequest { + uint64 left_group_id = 1; + uint64 right_group_id = 2; +} + +message MergeCompactionGroupResponse {} + service HummockManagerService { rpc UnpinVersionBefore(UnpinVersionBeforeRequest) returns (UnpinVersionBeforeResponse); rpc GetCurrentVersion(GetCurrentVersionRequest) returns (GetCurrentVersionResponse); @@ -880,6 +895,7 @@ service HummockManagerService { rpc CancelCompactTask(CancelCompactTaskRequest) returns (CancelCompactTaskResponse); rpc ListChangeLogEpochs(ListChangeLogEpochsRequest) returns (ListChangeLogEpochsResponse); rpc GetVersionByEpoch(GetVersionByEpochRequest) returns (GetVersionByEpochResponse); + rpc MergeCompactionGroup(MergeCompactionGroupRequest) returns (MergeCompactionGroupResponse); } message CompactionConfig { diff --git a/proto/meta.proto b/proto/meta.proto index d75494625edd4..98a7f267c0124 100644 --- a/proto/meta.proto +++ b/proto/meta.proto @@ -99,6 +99,7 @@ message TableFragments { State state = 2; map fragments = 3; map actor_status = 4; + // `Source` and `SourceBackfill` are handled together here. map actor_splits = 5; stream_plan.StreamContext ctx = 6; @@ -513,6 +514,7 @@ message GetClusterInfoRequest {} message GetClusterInfoResponse { repeated common.WorkerNode worker_nodes = 1; repeated TableFragments table_fragments = 2; + // `Source` and `SourceBackfill` are handled together here. map actor_splits = 3; map source_infos = 4; uint64 revision = 5; @@ -789,3 +791,30 @@ message RelationIdInfos { // relation_id -> FragmentIdToActorIdMap map map = 1; } + +message ActorCountPerParallelism { + message WorkerActorCount { + uint64 actor_count = 1; + uint64 parallelism = 2; + } + map worker_id_to_actor_count = 1; + uint64 hard_limit = 2; + uint64 soft_limit = 3; +} + +message ClusterLimit { + oneof limit { + ActorCountPerParallelism actor_count = 1; + // TODO: limit DDL using compaction pending bytes + } +} + +message GetClusterLimitsRequest {} + +message GetClusterLimitsResponse { + repeated ClusterLimit active_limits = 1; +} + +service ClusterLimitService { + rpc GetClusterLimits(GetClusterLimitsRequest) returns (GetClusterLimitsResponse); +} diff --git a/proto/plan_common.proto b/proto/plan_common.proto index bc2e60503f103..0f4e988e6c035 100644 --- a/proto/plan_common.proto +++ b/proto/plan_common.proto @@ -141,6 +141,29 @@ enum JoinType { JOIN_TYPE_RIGHT_ANTI = 8; } +enum AsOfJoinType { + AS_OF_JOIN_TYPE_UNSPECIFIED = 0; + AS_OF_JOIN_TYPE_INNER = 1; + AS_OF_JOIN_TYPE_LEFT_OUTER = 2; +} + +enum AsOfJoinInequalityType { + AS_OF_INEQUALITY_TYPE_UNSPECIFIED = 0; + AS_OF_INEQUALITY_TYPE_GT = 1; + AS_OF_INEQUALITY_TYPE_GE = 2; + AS_OF_INEQUALITY_TYPE_LT = 3; + AS_OF_INEQUALITY_TYPE_LE = 4; +} + +message AsOfJoinDesc { + // The index of the right side's as of column. + uint32 right_idx = 1; + // The index of the left side's as of column. + uint32 left_idx = 2; + // The type of the inequality. 
+ AsOfJoinInequalityType inequality_type = 3; +} + // https://github.com/tokio-rs/prost/issues/80 enum FormatType { FORMAT_TYPE_UNSPECIFIED = 0; @@ -230,6 +253,8 @@ message AdditionalTableName {} message AdditionalCollectionName {} +message AdditionalColumnPayload {} + // this type means we read all headers as a whole message AdditionalColumnHeaders {} @@ -246,6 +271,7 @@ message AdditionalColumn { AdditionalSchemaName schema_name = 9; AdditionalTableName table_name = 10; AdditionalCollectionName collection_name = 11; + AdditionalColumnPayload payload = 12; } } @@ -258,4 +284,5 @@ enum AdditionalColumnType { ADDITIONAL_COLUMN_TYPE_HEADER = 5; ADDITIONAL_COLUMN_TYPE_FILENAME = 6; ADDITIONAL_COLUMN_TYPE_NORMAL = 7; + ADDITIONAL_COLUMN_TYPE_PAYLOAD = 8; } diff --git a/proto/stream_plan.proto b/proto/stream_plan.proto index 4148e78690745..ca67737aeafe0 100644 --- a/proto/stream_plan.proto +++ b/proto/stream_plan.proto @@ -23,6 +23,7 @@ message AddMutation { // All actors to be added (to the main connected component of the graph) in this update. repeated uint32 added_actors = 3; // We may embed a source change split mutation here. + // `Source` and `SourceBackfill` are handled together here. // TODO: we may allow multiple mutations in a single barrier. map actor_splits = 2; // We may embed a pause mutation here. @@ -70,6 +71,7 @@ message UpdateMutation { // All actors to be dropped in this update. repeated uint32 dropped_actors = 4; // Source updates. + // `Source` and `SourceBackfill` are handled together here. map actor_splits = 5; // When modifying the Materialized View, we need to recreate the Dispatcher from the old upstream to the new TableFragment. // Consistent with the semantics in AddMutation. @@ -77,6 +79,7 @@ message UpdateMutation { } message SourceChangeSplitMutation { + // `Source` and `SourceBackfill` are handled together here. map actor_splits = 2; } @@ -365,6 +368,9 @@ message SimpleAggNode { map distinct_dedup_tables = 6; uint32 row_count_index = 7; AggNodeVersion version = 8; + // Required by the downstream `RowMergeNode`, + // currently only used by the `approx_percentile`'s two phase plan + bool must_output_per_barrier = 9; } message HashAggNode { @@ -449,6 +455,32 @@ message HashJoinNode { bool is_append_only = 14; } +message AsOfJoinNode { + plan_common.AsOfJoinType join_type = 1; + repeated int32 left_key = 2; + repeated int32 right_key = 3; + // Used for internal table states. + catalog.Table left_table = 4; + // Used for internal table states. + catalog.Table right_table = 5; + // Used for internal table states. + catalog.Table left_degree_table = 6; + // Used for internal table states. + catalog.Table right_degree_table = 7; + // The output indices of current node + repeated uint32 output_indices = 8; + // Left deduped input pk indices. The pk of the left_table and + // The pk of the left_table is [left_join_key | left_inequality_key | left_deduped_input_pk_indices] + // left_inequality_key is not used but for forward compatibility. + repeated uint32 left_deduped_input_pk_indices = 9; + // Right deduped input pk indices. + // The pk of the right_table is [right_join_key | right_inequality_key | right_deduped_input_pk_indices] + // right_inequality_key is not used but for forward compatibility. 
+ repeated uint32 right_deduped_input_pk_indices = 10; + repeated bool null_safe = 11; + optional plan_common.AsOfJoinDesc asof_desc = 12; +} + message TemporalJoinNode { plan_common.JoinType join_type = 1; repeated int32 left_key = 2; diff --git a/proto/stream_service.proto b/proto/stream_service.proto index 54ffc3d5ff79c..ce727ba9cc55c 100644 --- a/proto/stream_service.proto +++ b/proto/stream_service.proto @@ -17,16 +17,6 @@ message BuildActorInfo { map related_subscriptions = 2; } -message DropActorsRequest { - string request_id = 1; - repeated uint32 actor_ids = 2; -} - -message DropActorsResponse { - string request_id = 1; - common.Status status = 2; -} - message InjectBarrierRequest { string request_id = 1; stream_plan.Barrier barrier = 2; @@ -109,7 +99,6 @@ message StreamingControlStreamResponse { } service StreamService { - rpc DropActors(DropActorsRequest) returns (DropActorsResponse); rpc WaitEpochCommit(WaitEpochCommitRequest) returns (WaitEpochCommitResponse); rpc StreamingControlStream(stream StreamingControlStreamRequest) returns (stream StreamingControlStreamResponse); } diff --git a/risedev.yml b/risedev.yml index 3c7f8e0e09be4..22c4569adb610 100644 --- a/risedev.yml +++ b/risedev.yml @@ -16,8 +16,12 @@ profile: # The default configuration will start 1 compute node, 1 meta node and 1 frontend. default: - # Specify a configuration file to override the default settings + # # Specify a configuration file to override the default settings # config-path: src/config/example.toml + # # Specify custom environment variables + # env: + # RUST_LOG: "info,risingwave_storage::hummock=off" + # RW_ENABLE_PRETTY_LOG: "true" steps: # If you want to use the local s3 storage, enable the following line # - use: minio diff --git a/src/batch/src/executor/iceberg_scan.rs b/src/batch/src/executor/iceberg_scan.rs index fca7745284fe3..2f67d8ce005aa 100644 --- a/src/batch/src/executor/iceberg_scan.rs +++ b/src/batch/src/executor/iceberg_scan.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
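+// Equality deletes are applied merge-on-read below: a data row is dropped when its key columns match an equality-delete row that carries a larger sequence number.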
+use std::collections::HashMap; use std::mem; use futures_async_stream::try_stream; @@ -20,8 +21,11 @@ use iceberg::scan::FileScanTask; use iceberg::spec::TableMetadata; use itertools::Itertools; use risingwave_common::array::arrow::IcebergArrowConvert; +use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{Field, Schema}; +use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::DataType; +use risingwave_common::util::iter_util::ZipEqFast; use risingwave_connector::sink::iceberg::IcebergConfig; use risingwave_connector::source::iceberg::{IcebergProperties, IcebergSplit}; use risingwave_connector::source::{ConnectorProperties, SplitImpl, SplitMetaData}; @@ -38,7 +42,8 @@ pub struct IcebergScanExecutor { #[allow(dead_code)] snapshot_id: Option<i64>, table_meta: TableMetadata, - file_scan_tasks: Vec<FileScanTask>, + data_file_scan_tasks: Vec<FileScanTask>, + eq_delete_file_scan_tasks: Vec<FileScanTask>, batch_size: usize, schema: Schema, identity: String, @@ -63,7 +68,8 @@ impl IcebergScanExecutor { iceberg_config: IcebergConfig, snapshot_id: Option<i64>, table_meta: TableMetadata, - file_scan_tasks: Vec<FileScanTask>, + data_file_scan_tasks: Vec<FileScanTask>, + eq_delete_file_scan_tasks: Vec<FileScanTask>, batch_size: usize, schema: Schema, identity: String, @@ -72,7 +78,8 @@ impl IcebergScanExecutor { iceberg_config, snapshot_id, table_meta, - file_scan_tasks, + data_file_scan_tasks, + eq_delete_file_scan_tasks, batch_size, schema, identity, @@ -86,33 +93,136 @@ impl IcebergScanExecutor { .load_table_v2_with_metadata(self.table_meta) .await?; let data_types = self.schema.data_types(); + let executor_schema_names = self.schema.names(); - let file_scan_tasks = mem::take(&mut self.file_scan_tasks); + let mut eq_delete_file_scan_tasks_map: HashMap<OwnedRow, i64> = HashMap::default(); + let eq_delete_file_scan_tasks = mem::take(&mut self.eq_delete_file_scan_tasks); - let file_scan_stream = { - #[try_stream] - async move { - for file_scan_task in file_scan_tasks { - yield file_scan_task; + // Build hash map for equality delete files + // Currently, all equality delete files have the same schema which is guaranteed by `IcebergSplitEnumerator`.
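+ // Each equality-delete row is keyed by its projected key columns; the map value keeps the largest sequence number seen for that key, e.g. a delete row written at sequence number 5 removes matching rows from any data file with a smaller sequence number.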
+ let mut eq_delete_ids: Option<Vec<i32>> = None; + for eq_delete_file_scan_task in eq_delete_file_scan_tasks { + let mut sequence_number = eq_delete_file_scan_task.sequence_number; + + if eq_delete_ids.is_none() { + eq_delete_ids = Some(eq_delete_file_scan_task.project_field_ids.clone()); + } else { + debug_assert_eq!( + eq_delete_ids.as_ref().unwrap(), + &eq_delete_file_scan_task.project_field_ids + ); + } + + let reader = table + .reader_builder() + .with_batch_size(self.batch_size) + .build(); + let delete_file_scan_stream = tokio_stream::once(Ok(eq_delete_file_scan_task)); + + let mut delete_record_batch_stream = reader + .read(Box::pin(delete_file_scan_stream)) + .map_err(BatchError::Iceberg)?; + + while let Some(record_batch) = delete_record_batch_stream.next().await { + let record_batch = record_batch.map_err(BatchError::Iceberg)?; + + let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; + for row in chunk.rows() { + let entry = eq_delete_file_scan_tasks_map + .entry(row.to_owned_row()) + .or_default(); + *entry = *entry.max(&mut sequence_number); } } - }; - - let reader = table - .reader_builder() - .with_batch_size(self.batch_size) - .build(); - - let record_batch_stream = reader - .read(Box::pin(file_scan_stream)) - .map_err(BatchError::Iceberg)?; - - #[for_await] - for record_batch in record_batch_stream { - let record_batch = record_batch.map_err(BatchError::Iceberg)?; - let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; - debug_assert_eq!(chunk.data_types(), data_types); - yield chunk; + } + + let data_file_scan_tasks = mem::take(&mut self.data_file_scan_tasks); + + // Drop rows in the data files that are covered by an entry in the equality-delete map + for data_file_scan_task in data_file_scan_tasks { + let data_sequence_number = data_file_scan_task.sequence_number; + + let data_chunk_column_names: Vec<_> = data_file_scan_task + .project_field_ids + .iter() + .filter_map(|id| { + data_file_scan_task + .schema + .name_by_field_id(*id) + .map(|name| name.to_string()) + }) + .collect(); + + // eq_delete_column_idxes are used to fetch equality delete columns from data files.
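+ // They are resolved by position, e.g. with project_field_ids [1, 2, 3] and equality-delete ids [1, 2] the indexes are [0, 1]; those projected columns form the lookup key into `eq_delete_file_scan_tasks_map`.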
+ let eq_delete_column_idxes = eq_delete_ids.as_ref().map(|eq_delete_ids| { + eq_delete_ids + .iter() + .map(|eq_delete_id| { + data_file_scan_task + .project_field_ids + .iter() + .position(|project_field_id| eq_delete_id == project_field_id) + .expect("eq_delete_id not found in delete_equality_ids") + }) + .collect_vec() + }); + + let reader = table + .reader_builder() + .with_batch_size(self.batch_size) + .build(); + let file_scan_stream = tokio_stream::once(Ok(data_file_scan_task)); + + let mut record_batch_stream = reader + .read(Box::pin(file_scan_stream)) + .map_err(BatchError::Iceberg)?; + + while let Some(record_batch) = record_batch_stream.next().await { + let record_batch = record_batch.map_err(BatchError::Iceberg)?; + + let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; + let chunk = match eq_delete_column_idxes.as_ref() { + Some(delete_column_ids) => { + let visibility = Bitmap::from_iter( + // Project with the schema of the delete file + chunk.project(delete_column_ids).rows().map(|row_ref| { + let row = row_ref.to_owned_row(); + if let Some(delete_sequence_number) = + eq_delete_file_scan_tasks_map.get(&row) + && delete_sequence_number > &data_sequence_number + { + // delete_sequence_number > data_sequence_number means the delete file is written later than data file, + // so it needs to be deleted + false + } else { + true + } + }), + ) + .clone(); + // Keep the schema consistent(chunk and executor) + // Filter out (equality delete) columns that are not in the executor schema + let data = chunk + .columns() + .iter() + .zip_eq_fast(&data_chunk_column_names) + .filter_map(|(array, columns)| { + if executor_schema_names.contains(columns) { + Some(array.clone()) + } else { + None + } + }) + .collect_vec(); + let chunk = DataChunk::new(data, visibility); + debug_assert_eq!(chunk.data_types(), data_types); + chunk + } + // If there is no delete file, the data file is directly output + None => chunk, + }; + yield chunk; + } } } } @@ -171,6 +281,11 @@ impl BoxedExecutorBuilder for IcebergScanExecutorBuilder { Some(split.snapshot_id), split.table_meta.deserialize(), split.files.into_iter().map(|x| x.deserialize()).collect(), + split + .eq_delete_files + .into_iter() + .map(|x| x.deserialize()) + .collect(), source.context.get_config().developer.chunk_size, schema, source.plan_node().get_identity().clone(), diff --git a/src/batch/src/executor/join/distributed_lookup_join.rs b/src/batch/src/executor/join/distributed_lookup_join.rs index 1068ffd7f3349..74d7843013e4d 100644 --- a/src/batch/src/executor/join/distributed_lookup_join.rs +++ b/src/batch/src/executor/join/distributed_lookup_join.rs @@ -17,8 +17,9 @@ use std::mem::swap; use futures::pin_mut; use itertools::Itertools; +use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, Schema}; -use risingwave_common::hash::{HashKey, HashKeyDispatcher}; +use risingwave_common::hash::{HashKey, HashKeyDispatcher, VirtualNode}; use risingwave_common::memory::MemoryContext; use risingwave_common::row::OwnedRow; use risingwave_common::types::{DataType, Datum}; @@ -30,7 +31,7 @@ use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::common::BatchQueryEpoch; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::{TableDistribution, TableIter}; +use risingwave_storage::table::TableIter; use risingwave_storage::{dispatch_state_store, StateStore}; use 
crate::error::Result; @@ -194,7 +195,8 @@ impl BoxedExecutorBuilder for DistributedLookupJoinExecutorBuilder { .collect(); // Lookup Join always contains distribution key, so we don't need vnode bitmap - let vnodes = Some(TableDistribution::all_vnodes()); + // TODO(var-vnode): use vnode count from table desc + let vnodes = Some(Bitmap::ones(VirtualNode::COUNT).into()); dispatch_state_store!(source.context().state_store(), state_store, { let table = StorageTable::new_partial(state_store, column_ids, vnodes, table_desc); let inner_side_builder = InnerSideExecutorBuilder::new( diff --git a/src/batch/src/executor/join/hash_join.rs b/src/batch/src/executor/join/hash_join.rs index 3bfb583d6459d..863e53035626a 100644 --- a/src/batch/src/executor/join/hash_join.rs +++ b/src/batch/src/executor/join/hash_join.rs @@ -162,9 +162,8 @@ impl<'a> Iterator for RowIdIter<'a> { type Item = RowId; fn next(&mut self) -> Option { - self.current_row_id.map(|row_id| { - self.current_row_id = self.next_row_id[row_id]; - row_id + self.current_row_id.inspect(|row_id| { + self.current_row_id = self.next_row_id[*row_id]; }) } } diff --git a/src/batch/src/executor/join/local_lookup_join.rs b/src/batch/src/executor/join/local_lookup_join.rs index a3be00fc39a22..7c7a08af5d873 100644 --- a/src/batch/src/executor/join/local_lookup_join.rs +++ b/src/batch/src/executor/join/local_lookup_join.rs @@ -17,7 +17,7 @@ use std::marker::PhantomData; use anyhow::Context; use itertools::Itertools; -use risingwave_common::bitmap::BitmapBuilder; +use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{ColumnDesc, Field, Schema}; use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::{ @@ -408,12 +408,11 @@ impl BoxedExecutorBuilder for LocalLookupJoinExecutorBuilder { }) .collect(); + // TODO(var-vnode): use vnode count from table desc + let vnodes = Some(Bitmap::ones(VirtualNode::COUNT).into()); let inner_side_builder = InnerSideExecutorBuilder { table_desc: table_desc.clone(), - table_distribution: TableDistribution::new_from_storage_table_desc( - Some(TableDistribution::all_vnodes()), - table_desc, - ), + table_distribution: TableDistribution::new_from_storage_table_desc(vnodes, table_desc), vnode_mapping, outer_side_key_types, inner_side_schema, diff --git a/src/batch/src/executor/log_row_seq_scan.rs b/src/batch/src/executor/log_row_seq_scan.rs index 7106eaec1b760..be2a11b756946 100644 --- a/src/batch/src/executor/log_row_seq_scan.rs +++ b/src/batch/src/executor/log_row_seq_scan.rs @@ -22,13 +22,14 @@ use prometheus::Histogram; use risingwave_common::array::{DataChunk, Op}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnId, Field, Schema}; +use risingwave_common::hash::VirtualNode; use risingwave_common::row::{Row, RowExt}; use risingwave_common::types::ScalarImpl; use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::common::{batch_query_epoch, BatchQueryEpoch}; use risingwave_pb::plan_common::StorageTableDesc; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::{collect_data_chunk, TableDistribution}; +use risingwave_storage::table::collect_data_chunk; use risingwave_storage::{dispatch_state_store, StateStore}; use super::{BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder}; @@ -106,7 +107,8 @@ impl BoxedExecutorBuilder for LogStoreRowSeqScanExecutorBuilder { Some(vnodes) => Some(Bitmap::from(vnodes).into()), // This is 
possible for dml. vnode_bitmap is not filled by scheduler. // Or it's single distribution, e.g., distinct agg. We scan in a single executor. - None => Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + None => Some(Bitmap::ones(VirtualNode::COUNT).into()), }; let chunk_size = source.context.get_config().developer.chunk_size as u32; diff --git a/src/batch/src/executor/row_seq_scan.rs b/src/batch/src/executor/row_seq_scan.rs index b897dbd813787..7c7244d954764 100644 --- a/src/batch/src/executor/row_seq_scan.rs +++ b/src/batch/src/executor/row_seq_scan.rs @@ -21,6 +21,7 @@ use prometheus::Histogram; use risingwave_common::array::DataChunk; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnId, Schema}; +use risingwave_common::hash::VirtualNode; use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::{DataType, Datum}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -32,7 +33,6 @@ use risingwave_pb::plan_common::as_of::AsOfType; use risingwave_pb::plan_common::{as_of, PbAsOf, StorageTableDesc}; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use risingwave_storage::{dispatch_state_store, StateStore}; use crate::error::{BatchError, Result}; @@ -210,7 +210,8 @@ impl BoxedExecutorBuilder for RowSeqScanExecutorBuilder { Some(vnodes) => Some(Bitmap::from(vnodes).into()), // This is possible for dml. vnode_bitmap is not filled by scheduler. // Or it's single distribution, e.g., distinct agg. We scan in a single executor. - None => Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + None => Some(Bitmap::ones(VirtualNode::COUNT).into()), }; let scan_ranges = { diff --git a/src/batch/src/lib.rs b/src/batch/src/lib.rs index 414f27b33b4a7..9b88c3be9cd68 100644 --- a/src/batch/src/lib.rs +++ b/src/batch/src/lib.rs @@ -20,7 +20,6 @@ #![feature(coroutines)] #![feature(proc_macro_hygiene, stmt_expr_attributes)] #![feature(iterator_try_collect)] -#![feature(lint_reasons)] #![feature(is_sorted)] #![recursion_limit = "256"] #![feature(let_chains)] diff --git a/src/batch/src/task/consistent_hash_shuffle_channel.rs b/src/batch/src/task/consistent_hash_shuffle_channel.rs index ad0fdbaa8b70a..32d91a7acc09b 100644 --- a/src/batch/src/task/consistent_hash_shuffle_channel.rs +++ b/src/batch/src/task/consistent_hash_shuffle_channel.rs @@ -59,6 +59,7 @@ fn generate_hash_values( .iter() .map(|idx| *idx as usize) .collect::>(), + consistent_hash_info.vmap.len(), ); let hash_values = vnodes diff --git a/src/batch/src/worker_manager/worker_node_manager.rs b/src/batch/src/worker_manager/worker_node_manager.rs index 80cd2806f2b64..772bc8a4b6da7 100644 --- a/src/batch/src/worker_manager/worker_node_manager.rs +++ b/src/batch/src/worker_manager/worker_node_manager.rs @@ -346,36 +346,26 @@ impl WorkerNodeSelector { if self.enable_barrier_read { self.manager.get_streaming_fragment_mapping(&fragment_id) } else { - let (hint, parallelism) = match self.manager.serving_fragment_mapping(fragment_id) { - Ok(o) => { - if self.manager.worker_node_mask().is_empty() { - // 1. Stable mapping for most cases. - return Ok(o); - } - // If it's a singleton, set max_parallelism=1 for place_vnode. 
- let max_parallelism = o.to_single().map(|_| 1); - (Some(o), max_parallelism) - } - Err(e) => { - if !matches!(e, BatchError::ServingVnodeMappingNotFound(_)) { - return Err(e); - } - // We cannot tell whether it's a singleton, set max_parallelism=1 for place_vnode as if it's a singleton. - let max_parallelism = 1; - tracing::warn!( - fragment_id, - max_parallelism, - "Serving fragment mapping not found, fall back to temporary one." - ); - // Workaround the case that new mapping is not available yet due to asynchronous - // notification. - (None, Some(max_parallelism)) - } - }; - // 2. Temporary mapping that filters out unavailable workers. - let new_workers = self.apply_worker_node_mask(self.manager.list_serving_worker_nodes()); - let masked_mapping = place_vnode(hint.as_ref(), &new_workers, parallelism); - masked_mapping.ok_or_else(|| BatchError::EmptyWorkerNodes) + let mapping = (self.manager.serving_fragment_mapping(fragment_id)).or_else(|_| { + tracing::warn!( + fragment_id, + "Serving fragment mapping not found, fall back to streaming one." + ); + self.manager.get_streaming_fragment_mapping(&fragment_id) + })?; + + // Filter out unavailable workers. + if self.manager.worker_node_mask().is_empty() { + Ok(mapping) + } else { + let workers = self.apply_worker_node_mask(self.manager.list_serving_worker_nodes()); + // If it's a singleton, set max_parallelism=1 for place_vnode. + let max_parallelism = mapping.to_single().map(|_| 1); + let masked_mapping = + place_vnode(Some(&mapping), &workers, max_parallelism, mapping.len()) + .ok_or_else(|| BatchError::EmptyWorkerNodes)?; + Ok(masked_mapping) + } } } diff --git a/src/common/benches/bench_data_chunk_encoding.rs b/src/common/benches/bench_data_chunk_encoding.rs index 96413a4305205..4b09aeaeed5c2 100644 --- a/src/common/benches/bench_data_chunk_encoding.rs +++ b/src/common/benches/bench_data_chunk_encoding.rs @@ -55,7 +55,7 @@ fn bench_data_chunk_encoding(c: &mut Criterion) { for null_ratio in NULL_RATIOS { for chunk_size in CHUNK_SIZES { let chunk = rand_chunk::gen_chunk(&case.data_types, *chunk_size, SEED, *null_ratio); - let mut group = c.benchmark_group(&format!( + let mut group = c.benchmark_group(format!( "data chunk encoding: {}, {} rows, Pr[null]={}", case.name, chunk_size, null_ratio )); diff --git a/src/common/benches/bench_sequencer.rs b/src/common/benches/bench_sequencer.rs index 12e92f1f3332d..591b5fd64ee3a 100644 --- a/src/common/benches/bench_sequencer.rs +++ b/src/common/benches/bench_sequencer.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] - use std::cell::RefCell; use std::hint::black_box; use std::sync::atomic::{AtomicUsize, Ordering}; diff --git a/src/common/common_service/src/lib.rs b/src/common/common_service/src/lib.rs index 2cf9a56e076f3..ecf89a84fce88 100644 --- a/src/common/common_service/src/lib.rs +++ b/src/common/common_service/src/lib.rs @@ -14,7 +14,6 @@ // This is a stub lib.rs. -#![feature(lint_reasons)] #![feature(impl_trait_in_assoc_type)] #![feature(error_generic_member_access)] diff --git a/src/common/metrics/src/guarded_metrics.rs b/src/common/metrics/src/guarded_metrics.rs index 27710748ae359..9b16cc778938c 100644 --- a/src/common/metrics/src/guarded_metrics.rs +++ b/src/common/metrics/src/guarded_metrics.rs @@ -83,6 +83,22 @@ macro_rules! register_guarded_int_gauge_vec_with_registry { }}; } +#[macro_export] +macro_rules! 
register_guarded_uint_gauge_vec_with_registry { + ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr, $REGISTRY:expr $(,)?) => {{ + let inner = prometheus::core::GenericGaugeVec::::new( + prometheus::opts!($NAME, $HELP), + $LABELS_NAMES, + ); + inner.and_then(|inner| { + let inner = $crate::__extract_gauge_builder(inner); + let label_guarded = $crate::LabelGuardedUintGaugeVec::new(inner, { $LABELS_NAMES }); + let result = ($REGISTRY).register(Box::new(label_guarded.clone())); + result.map(move |()| label_guarded) + }) + }}; +} + #[macro_export] macro_rules! register_guarded_int_counter_vec_with_registry { ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr, $REGISTRY:expr $(,)?) => {{ @@ -131,6 +147,8 @@ pub type LabelGuardedIntCounterVec = LabelGuardedMetricVec, N>; pub type LabelGuardedIntGaugeVec = LabelGuardedMetricVec, N>; +pub type LabelGuardedUintGaugeVec = + LabelGuardedMetricVec, N>; pub type LabelGuardedGaugeVec = LabelGuardedMetricVec, N>; diff --git a/src/common/src/array/arrow/arrow_iceberg.rs b/src/common/src/array/arrow/arrow_iceberg.rs index ff23bc102ee6b..80c0a3dab1667 100644 --- a/src/common/src/array/arrow/arrow_iceberg.rs +++ b/src/common/src/array/arrow/arrow_iceberg.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cell::RefCell; +use std::collections::HashMap; use std::ops::{Div, Mul}; use std::sync::Arc; @@ -138,12 +140,8 @@ impl ToArrow for IcebergArrowConvert { let scale = e.scale() as i8; let diff_scale = abs(max_scale - scale); let value = match scale { - _ if scale < max_scale => { - value.mul(10_i32.pow(diff_scale as u32) as i128) - } - _ if scale > max_scale => { - value.div(10_i32.pow(diff_scale as u32) as i128) - } + _ if scale < max_scale => value.mul(10_i128.pow(diff_scale as u32)), + _ if scale > max_scale => value.div(10_i128.pow(diff_scale as u32)), _ => value, }; Some(value) @@ -171,6 +169,94 @@ impl ToArrow for IcebergArrowConvert { impl FromArrow for IcebergArrowConvert {} +/// Iceberg sink with `create_table_if_not_exists` option will use this struct to convert the +/// iceberg data type to arrow data type. Specifically, it will add the field id to the +/// arrow field metadata, because iceberg-rust and icelake need the field id to be set. +/// +/// Note: this is different from [`IcebergArrowConvert`], which is used to read from/write to +/// an _existing_ iceberg table. In that case, we just need to make sure the data is compatible to the existing schema. +/// But to _create a new table_, we need to meet more requirements of iceberg. 
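+///
+/// A minimal usage sketch (illustrative only; the concrete column names and the `?` error
+/// handling are assumptions, not part of this change):
+///
+/// ```ignore
+/// let convert = IcebergCreateTableArrowConvert::default();
+/// // Each converted field is assigned the next field id (1, 2, ...), stored in the Arrow
+/// // field metadata under "PARQUET:field_id" (for iceberg-rust) and "column_id" (for icelake).
+/// let id = convert.to_arrow_field("id", &DataType::Int64)?;
+/// let price = convert.to_arrow_field("price", &DataType::Decimal)?;
+/// assert_eq!(id.metadata()["PARQUET:field_id"], "1");
+/// assert_eq!(price.metadata()["PARQUET:field_id"], "2");
+/// ```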
+#[derive(Default)] +pub struct IcebergCreateTableArrowConvert { + next_field_id: RefCell, +} + +impl IcebergCreateTableArrowConvert { + pub fn to_arrow_field( + &self, + name: &str, + data_type: &DataType, + ) -> Result { + ToArrow::to_arrow_field(self, name, data_type) + } + + fn add_field_id(&self, arrow_field: &mut arrow_schema::Field) { + *self.next_field_id.borrow_mut() += 1; + let field_id = *self.next_field_id.borrow(); + + let mut metadata = HashMap::new(); + // for iceberg-rust + metadata.insert("PARQUET:field_id".to_string(), field_id.to_string()); + // for icelake + metadata.insert("column_id".to_string(), field_id.to_string()); + arrow_field.set_metadata(metadata); + } +} + +impl ToArrow for IcebergCreateTableArrowConvert { + #[inline] + fn decimal_type_to_arrow(&self, name: &str) -> arrow_schema::Field { + // To create a iceberg table, we need a decimal type with precision and scale to be set + // We choose 28 here + // The decimal type finally will be converted to an iceberg decimal type. + // Iceberg decimal(P,S) + // Fixed-point decimal; precision P, scale S Scale is fixed, precision must be 38 or less. + let data_type = arrow_schema::DataType::Decimal128(28, 10); + + let mut arrow_field = arrow_schema::Field::new(name, data_type, true); + self.add_field_id(&mut arrow_field); + arrow_field + } + + /// Convert RisingWave data type to Arrow data type. + /// + /// This function returns a `Field` instead of `DataType` because some may be converted to + /// extension types which require additional metadata in the field. + fn to_arrow_field( + &self, + name: &str, + value: &DataType, + ) -> Result { + let data_type = match value { + // using the inline function + DataType::Boolean => self.bool_type_to_arrow(), + DataType::Int16 => self.int16_type_to_arrow(), + DataType::Int32 => self.int32_type_to_arrow(), + DataType::Int64 => self.int64_type_to_arrow(), + DataType::Int256 => self.int256_type_to_arrow(), + DataType::Float32 => self.float32_type_to_arrow(), + DataType::Float64 => self.float64_type_to_arrow(), + DataType::Date => self.date_type_to_arrow(), + DataType::Time => self.time_type_to_arrow(), + DataType::Timestamp => self.timestamp_type_to_arrow(), + DataType::Timestamptz => self.timestamptz_type_to_arrow(), + DataType::Interval => self.interval_type_to_arrow(), + DataType::Varchar => self.varchar_type_to_arrow(), + DataType::Bytea => self.bytea_type_to_arrow(), + DataType::Serial => self.serial_type_to_arrow(), + DataType::Decimal => return Ok(self.decimal_type_to_arrow(name)), + DataType::Jsonb => return Ok(self.jsonb_type_to_arrow(name)), + DataType::Struct(fields) => self.struct_type_to_arrow(fields)?, + DataType::List(datatype) => self.list_type_to_arrow(datatype)?, + DataType::Map(datatype) => self.map_type_to_arrow(datatype)?, + }; + + let mut arrow_field = arrow_schema::Field::new(name, data_type, true); + self.add_field_id(&mut arrow_field); + Ok(arrow_field) + } +} + #[cfg(test)] mod test { use std::sync::Arc; @@ -207,4 +293,30 @@ mod test { ) as ArrayRef; assert_eq!(&arrow_array, &expect_array); } + + #[test] + fn decimal_with_large_scale() { + let array = DecimalArray::from_iter([ + None, + Some(Decimal::NaN), + Some(Decimal::PositiveInf), + Some(Decimal::NegativeInf), + Some(Decimal::Normalized("123.4".parse().unwrap())), + Some(Decimal::Normalized("123.456".parse().unwrap())), + ]); + let ty = DataType::Decimal128(28, 10); + let arrow_array = IcebergArrowConvert.decimal_to_arrow(&ty, &array).unwrap(); + let expect_array = Arc::new( + 
Decimal128Array::from(vec![ + None, + None, + Some(9999999999999999999999999999), + Some(-9999999999999999999999999999), + Some(1234000000000), + Some(1234560000000), + ]) + .with_data_type(ty), + ) as ArrayRef; + assert_eq!(&arrow_array, &expect_array); + } } diff --git a/src/common/src/array/arrow/arrow_impl.rs b/src/common/src/array/arrow/arrow_impl.rs index 7d69b50afed49..8fa3e2abb6b5f 100644 --- a/src/common/src/array/arrow/arrow_impl.rs +++ b/src/common/src/array/arrow/arrow_impl.rs @@ -448,12 +448,17 @@ pub trait ToArrow { #[inline] fn map_type_to_arrow(&self, map_type: &MapType) -> Result { let sorted = false; - let list_type = map_type.clone().into_list(); + // "key" is always non-null + let key = self + .to_arrow_field("key", map_type.key())? + .with_nullable(false); + let value = self.to_arrow_field("value", map_type.value())?; Ok(arrow_schema::DataType::Map( Arc::new(arrow_schema::Field::new( "entries", - self.list_type_to_arrow(&list_type)?, - true, + arrow_schema::DataType::Struct([Arc::new(key), Arc::new(value)].into()), + // "entries" is always non-null + false, )), sorted, )) @@ -514,6 +519,12 @@ pub trait FromArrow { LargeBinary => self.from_large_binary()?, List(field) => DataType::List(Box::new(self.from_field(field)?)), Struct(fields) => DataType::Struct(self.from_fields(fields)?), + Map(field, _is_sorted) => { + let entries = self.from_field(field)?; + DataType::Map(MapType::try_from_entries(entries).map_err(|e| { + ArrayError::from_arrow(format!("invalid arrow map field: {field:?}, err: {e}")) + })?) + } t => { return Err(ArrayError::from_arrow(format!( "unsupported arrow data type: {t:?}" @@ -588,6 +599,7 @@ pub trait FromArrow { LargeBinary => self.from_large_binary_array(array.as_any().downcast_ref().unwrap()), List(_) => self.from_list_array(array.as_any().downcast_ref().unwrap()), Struct(_) => self.from_struct_array(array.as_any().downcast_ref().unwrap()), + Map(_, _) => self.from_map_array(array.as_any().downcast_ref().unwrap()), t => Err(ArrayError::from_arrow(format!( "unsupported arrow data type: {t:?}", ))), @@ -754,6 +766,21 @@ pub trait FromArrow { (0..array.len()).map(|i| array.is_valid(i)).collect(), ))) } + + fn from_map_array(&self, array: &arrow_array::MapArray) -> Result { + use arrow_array::Array; + let struct_array = self.from_struct_array(array.entries())?; + let list_array = ListArray { + value: Box::new(struct_array), + bitmap: match array.nulls() { + Some(nulls) => nulls.iter().collect(), + None => Bitmap::ones(array.len()), + }, + offsets: array.offsets().iter().map(|o| *o as u32).collect(), + }; + + Ok(ArrayImpl::Map(MapArray { inner: list_array })) + } } impl From<&Bitmap> for arrow_buffer::NullBuffer { diff --git a/src/common/src/array/arrow/arrow_udf.rs b/src/common/src/array/arrow/arrow_udf.rs index e461f49e576a6..a5296ca21cab8 100644 --- a/src/common/src/array/arrow/arrow_udf.rs +++ b/src/common/src/array/arrow/arrow_udf.rs @@ -125,6 +125,7 @@ impl FromArrow for UdfArrowConvert { #[cfg(test)] mod tests { + use super::*; use crate::array::*; @@ -205,4 +206,120 @@ mod tests { .unwrap(); assert_eq!(rw_array.as_list(), &array); } + + #[test] + fn map() { + let map_type = MapType::from_kv(DataType::Varchar, DataType::Int32); + let rw_map_type = DataType::Map(map_type.clone()); + let mut builder = MapArrayBuilder::with_type(3, rw_map_type.clone()); + builder.append_owned(Some( + MapValue::try_from_kv( + ListValue::from_str("{a,b,c}", &DataType::List(Box::new(DataType::Varchar))) + .unwrap(), + ListValue::from_str("{1,2,3}", 
&DataType::List(Box::new(DataType::Int32))).unwrap(), + ) + .unwrap(), + )); + builder.append_owned(None); + builder.append_owned(Some( + MapValue::try_from_kv( + ListValue::from_str("{a,c}", &DataType::List(Box::new(DataType::Varchar))).unwrap(), + ListValue::from_str("{1,3}", &DataType::List(Box::new(DataType::Int32))).unwrap(), + ) + .unwrap(), + )); + let rw_array = builder.finish(); + + let arrow_map_type = UdfArrowConvert::default() + .map_type_to_arrow(&map_type) + .unwrap(); + expect_test::expect![[r#" + Map( + Field { + name: "entries", + data_type: Struct( + [ + Field { + name: "key", + data_type: Utf8, + nullable: false, + dict_id: 0, + dict_is_ordered: false, + metadata: {}, + }, + Field { + name: "value", + data_type: Int32, + nullable: true, + dict_id: 0, + dict_is_ordered: false, + metadata: {}, + }, + ], + ), + nullable: false, + dict_id: 0, + dict_is_ordered: false, + metadata: {}, + }, + false, + ) + "#]] + .assert_debug_eq(&arrow_map_type); + let rw_map_type_new = UdfArrowConvert::default() + .from_field(&arrow_schema::Field::new( + "map", + arrow_map_type.clone(), + true, + )) + .unwrap(); + assert_eq!(rw_map_type, rw_map_type_new); + let arrow = UdfArrowConvert::default() + .map_to_arrow(&arrow_map_type, &rw_array) + .unwrap(); + expect_test::expect![[r#" + MapArray + [ + StructArray + [ + -- child 0: "key" (Utf8) + StringArray + [ + "a", + "b", + "c", + ] + -- child 1: "value" (Int32) + PrimitiveArray + [ + 1, + 2, + 3, + ] + ], + null, + StructArray + [ + -- child 0: "key" (Utf8) + StringArray + [ + "a", + "c", + ] + -- child 1: "value" (Int32) + PrimitiveArray + [ + 1, + 3, + ] + ], + ] + "#]] + .assert_debug_eq(&arrow); + + let rw_array_new = UdfArrowConvert::default() + .from_map_array(arrow.as_any().downcast_ref().unwrap()) + .unwrap(); + assert_eq!(&rw_array, rw_array_new.as_map()); + } } diff --git a/src/common/src/array/arrow/mod.rs b/src/common/src/array/arrow/mod.rs index fd9f55ee09f7e..d519d62f9935a 100644 --- a/src/common/src/array/arrow/mod.rs +++ b/src/common/src/array/arrow/mod.rs @@ -17,7 +17,7 @@ mod arrow_iceberg; mod arrow_udf; pub use arrow_deltalake::DeltaLakeConvert; -pub use arrow_iceberg::IcebergArrowConvert; +pub use arrow_iceberg::{IcebergArrowConvert, IcebergCreateTableArrowConvert}; pub use arrow_udf::{FromArrow, ToArrow, UdfArrowConvert}; use crate::types::Interval; diff --git a/src/common/src/bitmap.rs b/src/common/src/bitmap.rs index 7ef6bf039f47d..ae07105164408 100644 --- a/src/common/src/bitmap.rs +++ b/src/common/src/bitmap.rs @@ -685,6 +685,12 @@ impl From<&PbBuffer> for Bitmap { } } +impl From for Bitmap { + fn from(buf: PbBuffer) -> Self { + Self::from(&buf) + } +} + /// Bitmap iterator. pub struct BitmapIter<'a> { bits: Option<&'a [usize]>, diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 88ea110869b79..e2b4dd7b0f97c 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -33,7 +33,6 @@ use serde_default::DefaultFromSerde; use serde_json::Value; use crate::for_all_params; -use crate::hash::VirtualNode; /// Use the maximum value for HTTP/2 connection window size to avoid deadlock among multiplexed /// streams on the same connection. @@ -427,16 +426,13 @@ impl<'de> Deserialize<'de> for DefaultParallelism { ))) } } - Parallelism::Int(i) => Ok(DefaultParallelism::Default(if i > VirtualNode::COUNT { - Err(serde::de::Error::custom(format!( - "default parallelism should be not great than {}", - VirtualNode::COUNT - )))? 
- } else { + Parallelism::Int(i) => Ok(DefaultParallelism::Default( + // Note: we won't check whether this exceeds the maximum parallelism (i.e., vnode count) + // here because it requires extra context. The check will be done when scheduling jobs. NonZeroUsize::new(i).ok_or_else(|| { - serde::de::Error::custom("default parallelism should be greater than 0") - })? - })), + serde::de::Error::custom("default parallelism should not be 0") + })?, + )), } } } @@ -466,6 +462,16 @@ pub struct MetaDeveloperConfig { #[serde(default = "default::developer::max_get_task_probe_times")] pub max_get_task_probe_times: usize, + + /// Max number of actor allowed per parallelism (default = 100). + /// CREATE MV/Table will be noticed when the number of actors exceeds this limit. + #[serde(default = "default::developer::actor_cnt_per_worker_parallelism_soft_limit")] + pub actor_cnt_per_worker_parallelism_soft_limit: usize, + + /// Max number of actor allowed per parallelism (default = 400). + /// CREATE MV/Table will be rejected when the number of actors exceeds this limit. + #[serde(default = "default::developer::actor_cnt_per_worker_parallelism_hard_limit")] + pub actor_cnt_per_worker_parallelism_hard_limit: usize, } /// The section `[server]` in `risingwave.toml`. @@ -693,6 +699,9 @@ pub struct StorageConfig { #[serde(default)] pub prefetch_buffer_capacity_mb: Option, + #[serde(default)] + pub max_cached_recent_versions_number: Option, + /// max prefetch block number #[serde(default = "default::storage::max_prefetch_block_number")] pub max_prefetch_block_number: usize, @@ -1859,6 +1868,14 @@ pub mod default { 5 } + pub fn actor_cnt_per_worker_parallelism_soft_limit() -> usize { + 100 + } + + pub fn actor_cnt_per_worker_parallelism_hard_limit() -> usize { + 400 + } + pub fn memory_controller_threshold_aggressive() -> f64 { 0.9 } diff --git a/src/common/src/hash/consistent_hash/bitmap.rs b/src/common/src/hash/consistent_hash/bitmap.rs index 773231ba36a89..eee6a64a2b42c 100644 --- a/src/common/src/hash/consistent_hash/bitmap.rs +++ b/src/common/src/hash/consistent_hash/bitmap.rs @@ -15,6 +15,7 @@ use std::ops::RangeInclusive; use crate::bitmap::Bitmap; +use crate::hash::table_distribution::SINGLETON_VNODE; use crate::hash::VirtualNode; /// An extension trait for `Bitmap` to support virtual node operations. @@ -36,4 +37,17 @@ impl Bitmap { self.high_ranges() .map(|r| (VirtualNode::from_index(*r.start())..=VirtualNode::from_index(*r.end()))) } + + /// Returns whether only the [`SINGLETON_VNODE`] is set in the bitmap. + /// + /// Note that this method returning `true` does not imply that the bitmap was created by + /// [`VnodeBitmapExt::singleton`], or that the bitmap has length 1. + pub fn is_singleton(&self) -> bool { + self.count_ones() == 1 && self.iter_vnodes().next().unwrap() == SINGLETON_VNODE + } + + /// Creates a bitmap with length 1 and the single bit set. + pub fn singleton() -> Self { + Self::ones(1) + } } diff --git a/src/common/src/hash/consistent_hash/mapping.rs b/src/common/src/hash/consistent_hash/mapping.rs index a462acb291853..0ab8f9e18fd2e 100644 --- a/src/common/src/hash/consistent_hash/mapping.rs +++ b/src/common/src/hash/consistent_hash/mapping.rs @@ -105,26 +105,26 @@ impl VnodeMapping { /// /// For example, if `items` is `[0, 1, 2]`, and the total vnode count is 10, we'll generate /// mapping like `[0, 0, 0, 0, 1, 1, 1, 2, 2, 2]`. 
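+    ///
+    /// If `vnode_count` is not divisible by the number of items, the first
+    /// `vnode_count % items.len()` items each receive one extra vnode (as in the example above,
+    /// where item `0` is mapped to 4 vnodes while the others get 3).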
- pub fn new_uniform(items: impl ExactSizeIterator) -> Self { + pub fn new_uniform(items: impl ExactSizeIterator, vnode_count: usize) -> Self { // If the number of items is greater than the total vnode count, no vnode will be mapped to // some items and the mapping will be invalid. - assert!(items.len() <= VirtualNode::COUNT); + assert!(items.len() <= vnode_count); let mut original_indices = Vec::with_capacity(items.len()); let mut data = Vec::with_capacity(items.len()); - let hash_shard_size = VirtualNode::COUNT / items.len(); - let mut one_more_count = VirtualNode::COUNT % items.len(); + let hash_shard_size = vnode_count / items.len(); + let mut one_more_count = vnode_count % items.len(); let mut init_bound = 0; for item in items { - let vnode_count = if one_more_count > 0 { + let count = if one_more_count > 0 { one_more_count -= 1; hash_shard_size + 1 } else { hash_shard_size }; - init_bound += vnode_count; + init_bound += count; original_indices.push(init_bound as u32 - 1); data.push(item); @@ -141,10 +141,11 @@ impl VnodeMapping { /// Create a vnode mapping where all vnodes are mapped to the same single item. pub fn new_single(item: T::Item) -> Self { - Self::new_uniform(std::iter::once(item)) + // TODO(var-vnode): always 1 correct? + Self::new_uniform(std::iter::once(item), 1) } - /// The length of the vnode in this mapping, typically [`VirtualNode::COUNT`]. + /// The length (or count) of the vnode in this mapping. pub fn len(&self) -> usize { self.original_indices .last() @@ -204,12 +205,13 @@ impl VnodeMapping { /// Convert this vnode mapping to a mapping from items to bitmaps, where each bitmap represents /// the vnodes mapped to the item. pub fn to_bitmaps(&self) -> HashMap { + let vnode_count = self.len(); let mut vnode_bitmaps = HashMap::new(); for (vnode, item) in self.iter_with_vnode() { vnode_bitmaps .entry(item) - .or_insert_with(|| BitmapBuilder::zeroed(VirtualNode::COUNT)) + .or_insert_with(|| BitmapBuilder::zeroed(vnode_count)) .set(vnode.to_index(), true); } @@ -222,10 +224,11 @@ impl VnodeMapping { /// Create a vnode mapping from the given mapping from items to bitmaps, where each bitmap /// represents the vnodes mapped to the item. pub fn from_bitmaps(bitmaps: &HashMap) -> Self { - let mut items = vec![None; VirtualNode::COUNT]; + let vnode_count = bitmaps.values().next().expect("empty bitmaps").len(); + let mut items = vec![None; vnode_count]; for (&item, bitmap) in bitmaps { - assert_eq!(bitmap.len(), VirtualNode::COUNT); + assert_eq!(bitmap.len(), vnode_count); for idx in bitmap.iter_ones() { if let Some(prev) = items[idx].replace(item) { panic!("mapping at index `{idx}` is set to both `{prev:?}` and `{item:?}`"); @@ -241,9 +244,8 @@ impl VnodeMapping { Self::from_expanded(&items) } - /// Create a vnode mapping from the expanded slice of items with length [`VirtualNode::COUNT`]. + /// Create a vnode mapping from the expanded slice of items. pub fn from_expanded(items: &[T::Item]) -> Self { - assert_eq!(items.len(), VirtualNode::COUNT); let (original_indices, data) = compress_data(items); Self { original_indices, @@ -251,7 +253,7 @@ impl VnodeMapping { } } - /// Convert this vnode mapping to a expanded vector of items with length [`VirtualNode::COUNT`]. + /// Convert this vnode mapping to a expanded vector of items. 
pub fn to_expanded(&self) -> ExpandedMapping { self.iter().collect() } @@ -353,8 +355,8 @@ impl ActorMapping { impl WorkerSlotMapping { /// Create a uniform worker mapping from the given worker ids - pub fn build_from_ids(worker_slot_ids: &[WorkerSlotId]) -> Self { - Self::new_uniform(worker_slot_ids.iter().cloned()) + pub fn build_from_ids(worker_slot_ids: &[WorkerSlotId], vnode_count: usize) -> Self { + Self::new_uniform(worker_slot_ids.iter().cloned(), vnode_count) } /// Create a worker mapping from the protobuf representation. @@ -403,18 +405,18 @@ mod tests { type TestMapping = VnodeMapping; type Test2Mapping = VnodeMapping; - const COUNTS: &[usize] = &[1, 3, 12, 42, VirtualNode::COUNT]; + const COUNTS: &[usize] = &[1, 3, 12, 42, VirtualNode::COUNT_FOR_TEST]; fn uniforms() -> impl Iterator { COUNTS .iter() - .map(|&count| TestMapping::new_uniform(0..count as u32)) + .map(|&count| TestMapping::new_uniform(0..count as u32, VirtualNode::COUNT_FOR_TEST)) } fn randoms() -> impl Iterator { COUNTS.iter().map(|&count| { let raw = repeat_with(|| rand::thread_rng().gen_range(0..count as u32)) - .take(VirtualNode::COUNT) + .take(VirtualNode::COUNT_FOR_TEST) .collect_vec(); TestMapping::from_expanded(&raw) }) @@ -427,7 +429,7 @@ mod tests { #[test] fn test_uniform() { for vnode_mapping in uniforms() { - assert_eq!(vnode_mapping.len(), VirtualNode::COUNT); + assert_eq!(vnode_mapping.len(), VirtualNode::COUNT_FOR_TEST); let item_count = vnode_mapping.iter_unique().count(); let mut check: HashMap> = HashMap::new(); diff --git a/src/common/src/hash/consistent_hash/vnode.rs b/src/common/src/hash/consistent_hash/vnode.rs index f528544689f31..685f99d6cf4f4 100644 --- a/src/common/src/hash/consistent_hash/vnode.rs +++ b/src/common/src/hash/consistent_hash/vnode.rs @@ -30,26 +30,45 @@ use crate::util::row_id::extract_vnode_id_from_row_id; pub struct VirtualNode(VirtualNodeInner); /// The internal representation of a virtual node id. +/// +/// Note: not all bits of the inner representation might be used. type VirtualNodeInner = u16; -static_assertions::const_assert!(VirtualNodeInner::BITS >= VirtualNode::BITS as u32); -impl From for VirtualNode { - fn from(hash_code: Crc32HashCode) -> Self { +/// `vnode_count` must be provided to convert a hash code to a virtual node. +/// +/// Use [`Crc32HashCodeToVnodeExt::to_vnode`] instead. +impl !From for VirtualNode {} + +#[easy_ext::ext(Crc32HashCodeToVnodeExt)] +impl Crc32HashCode { + /// Converts the hash code to a virtual node, based on the given total count of vnodes. + fn to_vnode(self, vnode_count: usize) -> VirtualNode { // Take the least significant bits of the hash code. // TODO: should we use the most significant bits? - let inner = (hash_code.value() % Self::COUNT as u64) as VirtualNodeInner; + let inner = (self.value() % vnode_count as u64) as VirtualNodeInner; VirtualNode(inner) } } impl VirtualNode { - /// The number of bits used to represent a virtual node. - /// - /// Note: Not all bits of the inner representation are used. One should rely on this constant - /// to determine the count of virtual nodes. - pub const BITS: usize = 8; /// The total count of virtual nodes. - pub const COUNT: usize = 1 << Self::BITS; + // TODO(var-vnode): remove this and only keep `COUNT_FOR_TEST` + pub const COUNT: usize = 1 << 8; + /// The maximum value of the virtual node. 
+ // TODO(var-vnode): remove this and only keep `MAX_FOR_TEST` + pub const MAX: VirtualNode = VirtualNode::from_index(Self::COUNT - 1); +} + +impl VirtualNode { + /// The total count of virtual nodes, for testing purposes. + pub const COUNT_FOR_TEST: usize = Self::COUNT; + /// The maximum value of the virtual node, for testing purposes. + pub const MAX_FOR_TEST: VirtualNode = Self::MAX; +} + +impl VirtualNode { + /// The maximum count of virtual nodes that fits in [`VirtualNodeInner`]. + pub const MAX_COUNT: usize = 1 << VirtualNodeInner::BITS; /// The size of a virtual node in bytes, in memory or serialized representation. pub const SIZE: usize = std::mem::size_of::(); } @@ -58,8 +77,6 @@ impl VirtualNode { pub type AllVirtualNodeIter = std::iter::Map, fn(usize) -> VirtualNode>; impl VirtualNode { - /// The maximum value of the virtual node. - pub const MAX: VirtualNode = VirtualNode::from_index(Self::COUNT - 1); /// We may use `VirtualNode` as a datum in a stream, or store it as a column. /// Hence this reifies it as a RW datatype. pub const RW_TYPE: DataType = DataType::Int16; @@ -68,7 +85,7 @@ impl VirtualNode { /// Creates a virtual node from the `usize` index. pub const fn from_index(index: usize) -> Self { - debug_assert!(index < Self::COUNT); + debug_assert!(index < Self::MAX_COUNT); Self(index as _) } @@ -79,7 +96,6 @@ impl VirtualNode { /// Creates a virtual node from the given scalar representation. Used by `VNODE` expression. pub const fn from_scalar(scalar: i16) -> Self { - debug_assert!((scalar as usize) < Self::COUNT); Self(scalar as _) } @@ -99,7 +115,6 @@ impl VirtualNode { /// Creates a virtual node from the given big-endian bytes representation. pub const fn from_be_bytes(bytes: [u8; Self::SIZE]) -> Self { let inner = VirtualNodeInner::from_be_bytes(bytes); - debug_assert!((inner as usize) < Self::COUNT); Self(inner) } @@ -109,8 +124,8 @@ impl VirtualNode { } /// Iterates over all virtual nodes. - pub fn all() -> AllVirtualNodeIter { - (0..Self::COUNT).map(Self::from_index) + pub fn all(vnode_count: usize) -> AllVirtualNodeIter { + (0..vnode_count).map(Self::from_index) } } @@ -119,7 +134,11 @@ impl VirtualNode { // chunk. When only one column is provided and its type is `Serial`, we consider the column to // be the one that contains RowId, and use a special method to skip the calculation of Hash // and directly extract the `VirtualNode` from `RowId`. - pub fn compute_chunk(data_chunk: &DataChunk, keys: &[usize]) -> Vec { + pub fn compute_chunk( + data_chunk: &DataChunk, + keys: &[usize], + vnode_count: usize, + ) -> Vec { if let Ok(idx) = keys.iter().exactly_one() && let ArrayImpl::Serial(serial_array) = &**data_chunk.column_at(*idx) { @@ -135,7 +154,7 @@ impl VirtualNode { // This process doesn’t guarantee the order of rows, producing indeterminate results in some cases, // such as when `distinct on` is used without an `order by`. let (row, _) = data_chunk.row_at(idx); - row.hash(Crc32FastBuilder).into() + row.hash(Crc32FastBuilder).to_vnode(vnode_count) } }) .collect(); @@ -144,19 +163,29 @@ impl VirtualNode { data_chunk .get_hash_values(keys, Crc32FastBuilder) .into_iter() - .map(|hash| hash.into()) + .map(|hash| hash.to_vnode(vnode_count)) .collect() } + /// Equivalent to [`Self::compute_chunk`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. 
+ pub fn compute_chunk_for_test(data_chunk: &DataChunk, keys: &[usize]) -> Vec { + Self::compute_chunk(data_chunk, keys, Self::COUNT_FOR_TEST) + } + // `compute_row` is used to calculate the `VirtualNode` for the corresponding columns in a // `Row`. Similar to `compute_chunk`, it also contains special handling for serial columns. - pub fn compute_row(row: impl Row, indices: &[usize]) -> VirtualNode { + pub fn compute_row(row: impl Row, indices: &[usize], vnode_count: usize) -> VirtualNode { let project = row.project(indices); if let Ok(Some(ScalarRefImpl::Serial(s))) = project.iter().exactly_one().as_ref() { return extract_vnode_id_from_row_id(s.as_row_id()); } - project.hash(Crc32FastBuilder).into() + project.hash(Crc32FastBuilder).to_vnode(vnode_count) + } + + /// Equivalent to [`Self::compute_row`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. + pub fn compute_row_for_test(row: impl Row, indices: &[usize]) -> VirtualNode { + Self::compute_row(row, indices, Self::COUNT_FOR_TEST) } } @@ -179,7 +208,7 @@ mod tests { ); let chunk = DataChunk::from_pretty(chunk.as_str()); - let vnodes = VirtualNode::compute_chunk(&chunk, &[0]); + let vnodes = VirtualNode::compute_chunk_for_test(&chunk, &[0]); assert_eq!( vnodes.as_slice(), @@ -195,7 +224,7 @@ mod tests { Some(ScalarImpl::Int64(12345)), ]); - let vnode = VirtualNode::compute_row(&row, &[0]); + let vnode = VirtualNode::compute_row_for_test(&row, &[0]); assert_eq!(vnode, VirtualNode::from_index(100)); } @@ -216,7 +245,7 @@ mod tests { ); let chunk = DataChunk::from_pretty(chunk.as_str()); - let vnodes = VirtualNode::compute_chunk(&chunk, &[0]); + let vnodes = VirtualNode::compute_chunk_for_test(&chunk, &[0]); assert_eq!( vnodes.as_slice(), diff --git a/src/common/src/hash/table_distribution.rs b/src/common/src/hash/table_distribution.rs index 9be9cd2abafb2..5275aca04adb3 100644 --- a/src/common/src/hash/table_distribution.rs +++ b/src/common/src/hash/table_distribution.rs @@ -13,30 +13,34 @@ // limitations under the License. use std::mem::replace; -use std::ops::Deref; use std::sync::{Arc, LazyLock}; use itertools::Itertools; use risingwave_pb::plan_common::StorageTableDesc; -use tracing::warn; use crate::array::{Array, DataChunk, PrimitiveArray}; -use crate::bitmap::{Bitmap, BitmapBuilder}; +use crate::bitmap::Bitmap; use crate::hash::VirtualNode; use crate::row::Row; use crate::util::iter_util::ZipEqFast; -/// For tables without distribution (singleton), the `DEFAULT_VNODE` is encoded. -pub const DEFAULT_VNODE: VirtualNode = VirtualNode::ZERO; +/// For tables without distribution (singleton), the `SINGLETON_VNODE` is encoded. +pub const SINGLETON_VNODE: VirtualNode = VirtualNode::ZERO; + +use super::VnodeBitmapExt; #[derive(Debug, Clone)] enum ComputeVnode { Singleton, DistKeyIndices { + /// Virtual nodes that the table is partitioned into. + vnodes: Arc, /// Indices of distribution key for computing vnode, based on the pk columns of the table. dist_key_in_pk_indices: Vec, }, VnodeColumnIndex { + /// Virtual nodes that the table is partitioned into. + vnodes: Arc, /// Index of vnode column. vnode_col_idx_in_pk: usize, }, @@ -47,13 +51,8 @@ enum ComputeVnode { pub struct TableDistribution { /// The way to compute vnode provided primary key compute_vnode: ComputeVnode, - - /// Virtual nodes that the table is partitioned into. 
- vnodes: Arc, } -pub const SINGLETON_VNODE: VirtualNode = DEFAULT_VNODE; - impl TableDistribution { pub fn new_from_storage_table_desc( vnodes: Option>, @@ -75,69 +74,32 @@ impl TableDistribution { ) -> Self { let compute_vnode = if let Some(vnode_col_idx_in_pk) = vnode_col_idx_in_pk { ComputeVnode::VnodeColumnIndex { + vnodes: vnodes.unwrap_or_else(|| Bitmap::singleton().into()), vnode_col_idx_in_pk, } } else if !dist_key_in_pk_indices.is_empty() { ComputeVnode::DistKeyIndices { + vnodes: vnodes.expect("vnodes must be `Some` as dist key indices are set"), dist_key_in_pk_indices, } } else { ComputeVnode::Singleton }; - let vnodes = vnodes.unwrap_or_else(Self::singleton_vnode_bitmap); - if let ComputeVnode::Singleton = &compute_vnode { - if &vnodes != Self::singleton_vnode_bitmap_ref() && &vnodes != Self::all_vnodes_ref() { - warn!( - ?vnodes, - "singleton distribution get non-singleton vnode bitmap" - ); - } - } - - Self { - compute_vnode, - vnodes, - } + Self { compute_vnode } } pub fn is_singleton(&self) -> bool { matches!(&self.compute_vnode, ComputeVnode::Singleton) } - pub fn singleton_vnode_bitmap_ref() -> &'static Arc { - /// A bitmap that only the default vnode is set. - static SINGLETON_VNODES: LazyLock> = LazyLock::new(|| { - let mut vnodes = BitmapBuilder::zeroed(VirtualNode::COUNT); - vnodes.set(SINGLETON_VNODE.to_index(), true); - vnodes.finish().into() - }); - - SINGLETON_VNODES.deref() - } - - pub fn singleton_vnode_bitmap() -> Arc { - Self::singleton_vnode_bitmap_ref().clone() - } - - pub fn all_vnodes_ref() -> &'static Arc { - /// A bitmap that all vnodes are set. - static ALL_VNODES: LazyLock> = - LazyLock::new(|| Bitmap::ones(VirtualNode::COUNT).into()); - &ALL_VNODES - } - - pub fn all_vnodes() -> Arc { - Self::all_vnodes_ref().clone() - } - /// Distribution that accesses all vnodes, mainly used for tests. - pub fn all(dist_key_in_pk_indices: Vec) -> Self { + pub fn all(dist_key_in_pk_indices: Vec, vnode_count: usize) -> Self { Self { compute_vnode: ComputeVnode::DistKeyIndices { + vnodes: Bitmap::ones(vnode_count).into(), dist_key_in_pk_indices, }, - vnodes: Self::all_vnodes(), } } @@ -145,20 +107,39 @@ impl TableDistribution { pub fn singleton() -> Self { Self { compute_vnode: ComputeVnode::Singleton, - vnodes: Self::singleton_vnode_bitmap(), } } pub fn update_vnode_bitmap(&mut self, new_vnodes: Arc) -> Arc { - if self.is_singleton() && &new_vnodes != Self::singleton_vnode_bitmap_ref() { - warn!(?new_vnodes, "update vnode on singleton distribution"); + match &mut self.compute_vnode { + ComputeVnode::Singleton => { + if !new_vnodes.is_singleton() { + panic!( + "update vnode bitmap on singleton distribution to non-singleton: {:?}", + new_vnodes + ); + } + self.vnodes().clone() // not updated + } + + ComputeVnode::DistKeyIndices { vnodes, .. } + | ComputeVnode::VnodeColumnIndex { vnodes, .. } => { + assert_eq!(vnodes.len(), new_vnodes.len()); + replace(vnodes, new_vnodes) + } } - assert_eq!(self.vnodes.len(), new_vnodes.len()); - replace(&mut self.vnodes, new_vnodes) } + /// Get vnode bitmap if distributed, or a dummy [`Bitmap::singleton()`] if singleton. pub fn vnodes(&self) -> &Arc { - &self.vnodes + static SINGLETON_VNODES: LazyLock> = + LazyLock::new(|| Bitmap::singleton().into()); + + match &self.compute_vnode { + ComputeVnode::DistKeyIndices { vnodes, .. } => vnodes, + ComputeVnode::VnodeColumnIndex { vnodes, .. } => vnodes, + ComputeVnode::Singleton => &SINGLETON_VNODES, + } } /// Get vnode value with given primary key. 
@@ -166,11 +147,13 @@ impl TableDistribution { match &self.compute_vnode { ComputeVnode::Singleton => SINGLETON_VNODE, ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, - } => compute_vnode(pk, dist_key_in_pk_indices, &self.vnodes), + } => compute_vnode(pk, dist_key_in_pk_indices, vnodes), ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, - } => get_vnode_from_row(pk, *vnode_col_idx_in_pk, &self.vnodes), + } => get_vnode_from_row(pk, *vnode_col_idx_in_pk, vnodes), } } @@ -178,22 +161,20 @@ impl TableDistribution { match &self.compute_vnode { ComputeVnode::Singleton => Some(SINGLETON_VNODE), ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, } => dist_key_in_pk_indices .iter() .all(|&d| d < pk_prefix.len()) - .then(|| compute_vnode(pk_prefix, dist_key_in_pk_indices, &self.vnodes)), + .then(|| compute_vnode(pk_prefix, dist_key_in_pk_indices, vnodes)), ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, } => { if *vnode_col_idx_in_pk >= pk_prefix.len() { None } else { - Some(get_vnode_from_row( - pk_prefix, - *vnode_col_idx_in_pk, - &self.vnodes, - )) + Some(get_vnode_from_row(pk_prefix, *vnode_col_idx_in_pk, vnodes)) } } } @@ -203,7 +184,7 @@ impl TableDistribution { /// Get vnode value with `indices` on the given `row`. pub fn compute_vnode(row: impl Row, indices: &[usize], vnodes: &Bitmap) -> VirtualNode { assert!(!indices.is_empty()); - let vnode = VirtualNode::compute_row(&row, indices); + let vnode = VirtualNode::compute_row(&row, indices, vnodes.len()); check_vnode_is_set(vnode, vnodes); tracing::debug!(target: "events::storage::storage_table", "compute vnode: {:?} key {:?} => {}", row, indices, vnode); @@ -230,6 +211,7 @@ impl TableDistribution { vec![SINGLETON_VNODE; chunk.capacity()] } ComputeVnode::DistKeyIndices { + vnodes, dist_key_in_pk_indices, } => { let dist_key_indices = dist_key_in_pk_indices @@ -237,19 +219,20 @@ impl TableDistribution { .map(|idx| pk_indices[*idx]) .collect_vec(); - VirtualNode::compute_chunk(chunk, &dist_key_indices) + VirtualNode::compute_chunk(chunk, &dist_key_indices, vnodes.len()) .into_iter() .zip_eq_fast(chunk.visibility().iter()) .map(|(vnode, vis)| { // Ignore the invisible rows. 
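+                        // (A vnode value is still produced for invisible rows below; it is just
+                        // never validated against the bitmap.)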
if vis { - check_vnode_is_set(vnode, &self.vnodes); + check_vnode_is_set(vnode, vnodes); } vnode }) .collect() } ComputeVnode::VnodeColumnIndex { + vnodes, vnode_col_idx_in_pk, } => { let array: &PrimitiveArray = @@ -262,7 +245,7 @@ impl TableDistribution { let vnode = VirtualNode::from_scalar(vnode); if vis { assert!(exist); - check_vnode_is_set(vnode, &self.vnodes); + check_vnode_is_set(vnode, vnodes); } vnode }) diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs index 8d47d0c621646..e3417853b0201 100644 --- a/src/common/src/lib.rs +++ b/src/common/src/lib.rs @@ -23,7 +23,6 @@ #![feature(test)] #![feature(trusted_len)] #![feature(allocator_api)] -#![feature(lint_reasons)] #![feature(coroutines)] #![feature(map_try_insert)] #![feature(error_generic_member_access)] @@ -76,7 +75,7 @@ pub mod memory; pub use risingwave_common_metrics::{ monitor, register_guarded_gauge_vec_with_registry, register_guarded_histogram_vec_with_registry, register_guarded_int_counter_vec_with_registry, - register_guarded_int_gauge_vec_with_registry, + register_guarded_int_gauge_vec_with_registry, register_guarded_uint_gauge_vec_with_registry, }; pub use { risingwave_common_metrics as metrics, risingwave_common_secret as secret, diff --git a/src/common/src/session_config/mod.rs b/src/common/src/session_config/mod.rs index ffdbe6753acb5..163aa18799390 100644 --- a/src/common/src/session_config/mod.rs +++ b/src/common/src/session_config/mod.rs @@ -292,6 +292,12 @@ pub struct SessionConfig { #[parameter(default = "hex", check_hook = check_bytea_output)] bytea_output: String, + + /// Bypass checks on cluster limits + /// + /// When enabled, `CREATE MATERIALIZED VIEW` will not fail if the cluster limit is hit. + #[parameter(default = false)] + bypass_cluster_limits: bool, } fn check_timezone(val: &str) -> Result<(), String> { diff --git a/src/common/src/types/jsonb.rs b/src/common/src/types/jsonb.rs index fa80069080ff4..6363864fd73e2 100644 --- a/src/common/src/types/jsonb.rs +++ b/src/common/src/types/jsonb.rs @@ -20,7 +20,9 @@ use jsonbb::{Value, ValueRef}; use postgres_types::{accepts, to_sql_checked, FromSql, IsNull, ToSql, Type}; use risingwave_common_estimate_size::EstimateSize; -use super::{Datum, IntoOrdered, ListValue, ScalarImpl, StructRef, ToOwnedDatum, F64}; +use super::{ + Datum, IntoOrdered, ListValue, MapType, MapValue, ScalarImpl, StructRef, ToOwnedDatum, F64, +}; use crate::types::{DataType, Scalar, ScalarRef, StructType, StructValue}; use crate::util::iter_util::ZipEqDebug; @@ -464,6 +466,28 @@ impl<'a> JsonbRef<'a> { Ok(StructValue::new(fields)) } + pub fn to_map(self, ty: &MapType) -> Result { + let object = self + .0 + .as_object() + .ok_or_else(|| format!("cannot convert to map from a jsonb {}", self.type_name()))?; + if !matches!(ty.key(), DataType::Varchar) { + return Err("cannot convert jsonb to a map with non-string keys".to_string()); + } + + let mut keys: Vec = Vec::with_capacity(object.len()); + let mut values: Vec = Vec::with_capacity(object.len()); + for (k, v) in object.iter() { + let v = Self(v).to_datum(ty.value())?; + keys.push(Some(ScalarImpl::Utf8(k.to_owned().into()))); + values.push(v); + } + MapValue::try_from_kv( + ListValue::from_datum_iter(ty.key(), keys), + ListValue::from_datum_iter(ty.value(), values), + ) + } + /// Expands the top-level JSON object to a row having the struct type of the `base` argument. 
pub fn populate_struct( self, diff --git a/src/common/src/util/cluster_limit.rs b/src/common/src/util/cluster_limit.rs new file mode 100644 index 0000000000000..048ea4fdab305 --- /dev/null +++ b/src/common/src/util/cluster_limit.rs @@ -0,0 +1,134 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::{self, Display, Formatter}; + +use risingwave_pb::meta::actor_count_per_parallelism::PbWorkerActorCount; +use risingwave_pb::meta::cluster_limit::PbLimit; +use risingwave_pb::meta::{PbActorCountPerParallelism, PbClusterLimit}; +pub enum ClusterLimit { + ActorCount(ActorCountPerParallelism), +} + +impl From for PbClusterLimit { + fn from(limit: ClusterLimit) -> Self { + match limit { + ClusterLimit::ActorCount(actor_count_per_parallelism) => PbClusterLimit { + limit: Some(PbLimit::ActorCount(actor_count_per_parallelism.into())), + }, + } + } +} + +impl From for ClusterLimit { + fn from(pb_limit: PbClusterLimit) -> Self { + match pb_limit.limit.unwrap() { + PbLimit::ActorCount(actor_count_per_parallelism) => { + ClusterLimit::ActorCount(actor_count_per_parallelism.into()) + } + } + } +} + +#[derive(Debug)] +pub struct WorkerActorCount { + pub actor_count: usize, + pub parallelism: usize, +} + +impl From for PbWorkerActorCount { + fn from(worker_actor_count: WorkerActorCount) -> Self { + PbWorkerActorCount { + actor_count: worker_actor_count.actor_count as u64, + parallelism: worker_actor_count.parallelism as u64, + } + } +} + +impl From for WorkerActorCount { + fn from(pb_worker_actor_count: PbWorkerActorCount) -> Self { + WorkerActorCount { + actor_count: pb_worker_actor_count.actor_count as usize, + parallelism: pb_worker_actor_count.parallelism as usize, + } + } +} + +pub struct ActorCountPerParallelism { + pub worker_id_to_actor_count: HashMap, + pub hard_limit: usize, + pub soft_limit: usize, +} + +impl From for PbActorCountPerParallelism { + fn from(actor_count_per_parallelism: ActorCountPerParallelism) -> Self { + PbActorCountPerParallelism { + worker_id_to_actor_count: actor_count_per_parallelism + .worker_id_to_actor_count + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + hard_limit: actor_count_per_parallelism.hard_limit as u64, + soft_limit: actor_count_per_parallelism.soft_limit as u64, + } + } +} + +impl From for ActorCountPerParallelism { + fn from(pb_actor_count_per_parallelism: PbActorCountPerParallelism) -> Self { + ActorCountPerParallelism { + worker_id_to_actor_count: pb_actor_count_per_parallelism + .worker_id_to_actor_count + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + hard_limit: pb_actor_count_per_parallelism.hard_limit as usize, + soft_limit: pb_actor_count_per_parallelism.soft_limit as usize, + } + } +} + +impl ActorCountPerParallelism { + pub fn exceed_hard_limit(&self) -> bool { + self.worker_id_to_actor_count + .values() + .any(|v| v.actor_count > self.hard_limit.saturating_mul(v.parallelism)) + } + + pub fn exceed_soft_limit(&self) -> bool { + 
self.worker_id_to_actor_count + .values() + .any(|v| v.actor_count > self.soft_limit.saturating_mul(v.parallelism)) + } + + pub fn exceed_limit(&self) -> bool { + self.exceed_soft_limit() || self.exceed_hard_limit() + } +} + +impl Display for ActorCountPerParallelism { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let worker_id_to_actor_count_str: Vec<_> = self + .worker_id_to_actor_count + .iter() + .map(|(k, v)| format!("{} -> {:?}", k, v)) + .collect(); + write!( + f, + "ActorCountPerParallelism {{ critical limit: {:?}, recommended limit: {:?}. worker_id_to_actor_count: {:?} }}", + self.hard_limit, self.soft_limit, worker_id_to_actor_count_str + ) + } +} diff --git a/src/common/src/util/mod.rs b/src/common/src/util/mod.rs index 20dac5906c91d..bfa15c8327037 100644 --- a/src/common/src/util/mod.rs +++ b/src/common/src/util/mod.rs @@ -42,3 +42,4 @@ pub mod tracing; pub mod value_encoding; pub mod worker_util; pub use tokio_util; +pub mod cluster_limit; diff --git a/src/common/src/util/row_id.rs b/src/common/src/util/row_id.rs index 508f418903413..7f22c17e925e4 100644 --- a/src/common/src/util/row_id.rs +++ b/src/common/src/util/row_id.rs @@ -52,6 +52,7 @@ pub struct RowIdGenerator { pub type RowId = i64; +// TODO(var-vnode): how should we handle this for different virtual node counts? #[inline] pub fn extract_vnode_id_from_row_id(id: RowId) -> VirtualNode { let vnode_id = ((id >> VNODE_ID_SHIFT_BITS) & (VNODE_ID_UPPER_BOUND as i64 - 1)) as u32; diff --git a/src/common/src/util/scan_range.rs b/src/common/src/util/scan_range.rs index fd056f1790444..cfe209cf2c22a 100644 --- a/src/common/src/util/scan_range.rs +++ b/src/common/src/util/scan_range.rs @@ -159,7 +159,7 @@ mod tests { let pk = vec![1, 3, 2]; let dist_key_idx_in_pk = crate::catalog::get_dist_key_in_pk_indices(&dist_key, &pk).unwrap(); - let dist = TableDistribution::all(dist_key_idx_in_pk); + let dist = TableDistribution::all(dist_key_idx_in_pk, VirtualNode::COUNT_FOR_TEST); let mut scan_range = ScanRange::full_table_scan(); assert!(scan_range.try_compute_vnode(&dist).is_none()); @@ -173,7 +173,7 @@ mod tests { Some(ScalarImpl::from(514)), ]); - let vnode = VirtualNode::compute_row(&row, &[0, 1]); + let vnode = VirtualNode::compute_row_for_test(&row, &[0, 1]); assert_eq!(scan_range.try_compute_vnode(&dist), Some(vnode)); } @@ -185,7 +185,7 @@ mod tests { let pk = vec![1, 3, 2]; let dist_key_idx_in_pk = crate::catalog::get_dist_key_in_pk_indices(&dist_key, &pk).unwrap(); - let dist = TableDistribution::all(dist_key_idx_in_pk); + let dist = TableDistribution::all(dist_key_idx_in_pk, VirtualNode::COUNT_FOR_TEST); let mut scan_range = ScanRange::full_table_scan(); assert!(scan_range.try_compute_vnode(&dist).is_none()); @@ -203,7 +203,7 @@ mod tests { Some(ScalarImpl::from(114514)), ]); - let vnode = VirtualNode::compute_row(&row, &[2, 1]); + let vnode = VirtualNode::compute_row_for_test(&row, &[2, 1]); assert_eq!(scan_range.try_compute_vnode(&dist), Some(vnode)); } diff --git a/src/common/src/vnode_mapping/vnode_placement.rs b/src/common/src/vnode_mapping/vnode_placement.rs index 5619ffc6e0f96..1f9235bb862ae 100644 --- a/src/common/src/vnode_mapping/vnode_placement.rs +++ b/src/common/src/vnode_mapping/vnode_placement.rs @@ -30,7 +30,12 @@ pub fn place_vnode( hint_worker_slot_mapping: Option<&WorkerSlotMapping>, workers: &[WorkerNode], max_parallelism: Option, + vnode_count: usize, ) -> Option { + if let Some(mapping) = hint_worker_slot_mapping { + assert_eq!(mapping.len(), vnode_count); + } + // Get all serving worker 
slots from all available workers, grouped by worker id and ordered // by worker slot id in each group. let mut worker_slots: LinkedList<_> = workers @@ -44,7 +49,7 @@ pub fn place_vnode( // `max_parallelism` and total number of virtual nodes. let serving_parallelism = std::cmp::min( worker_slots.iter().map(|slots| slots.len()).sum(), - std::cmp::min(max_parallelism.unwrap_or(usize::MAX), VirtualNode::COUNT), + std::cmp::min(max_parallelism.unwrap_or(usize::MAX), vnode_count), ); // Select `serving_parallelism` worker slots in a round-robin fashion, to distribute workload @@ -79,14 +84,14 @@ pub fn place_vnode( is_temp: bool, } - let (expected, mut remain) = VirtualNode::COUNT.div_rem(&selected_slots.len()); + let (expected, mut remain) = vnode_count.div_rem(&selected_slots.len()); let mut balances: HashMap = HashMap::default(); for slot in &selected_slots { let mut balance = Balance { slot: *slot, balance: -(expected as i32), - builder: BitmapBuilder::zeroed(VirtualNode::COUNT), + builder: BitmapBuilder::zeroed(vnode_count), is_temp: false, }; @@ -102,7 +107,7 @@ pub fn place_vnode( let mut temp_slot = Balance { slot: WorkerSlotId::new(0u32, usize::MAX), /* This id doesn't matter for `temp_slot`. It's distinguishable via `is_temp`. */ balance: 0, - builder: BitmapBuilder::zeroed(VirtualNode::COUNT), + builder: BitmapBuilder::zeroed(vnode_count), is_temp: true, }; match hint_worker_slot_mapping { @@ -123,7 +128,7 @@ pub fn place_vnode( } None => { // No hint is provided, assign all vnodes to `temp_pu`. - for vnode in VirtualNode::all() { + for vnode in VirtualNode::all(vnode_count) { temp_slot.balance += 1; temp_slot.builder.set(vnode.to_index(), true); } @@ -158,7 +163,7 @@ pub fn place_vnode( let mut dst = balances.pop_back().unwrap(); let n = std::cmp::min(src.balance.abs(), dst.balance.abs()); let mut moved = 0; - for idx in 0..VirtualNode::COUNT { + for idx in 0..vnode_count { if moved >= n { break; } @@ -189,7 +194,7 @@ pub fn place_vnode( for (worker_slot, bitmap) in results { worker_result .entry(worker_slot) - .or_insert(BitmapBuilder::zeroed(VirtualNode::COUNT).finish()) + .or_insert(Bitmap::zeros(vnode_count)) .bitor_assign(&bitmap); } @@ -204,10 +209,24 @@ mod tests { use risingwave_pb::common::WorkerNode; use crate::hash::VirtualNode; - use crate::vnode_mapping::vnode_placement::place_vnode; + + /// [`super::place_vnode`] with [`VirtualNode::COUNT_FOR_TEST`] as the vnode count. 
+ fn place_vnode( + hint_worker_slot_mapping: Option<&WorkerSlotMapping>, + workers: &[WorkerNode], + max_parallelism: Option, + ) -> Option { + super::place_vnode( + hint_worker_slot_mapping, + workers, + max_parallelism, + VirtualNode::COUNT_FOR_TEST, + ) + } + #[test] fn test_place_vnode() { - assert_eq!(VirtualNode::COUNT, 256); + assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); let serving_property = Property { is_unschedulable: false, @@ -220,7 +239,7 @@ mod tests { assert_eq!(wm1.len(), 256); assert_eq!(wm2.len(), 256); let mut count: usize = 0; - for idx in 0..VirtualNode::COUNT { + for idx in 0..VirtualNode::COUNT_FOR_TEST { let vnode = VirtualNode::from_index(idx); if wm1.get(vnode) == wm2.get(vnode) { count += 1; diff --git a/src/compute/src/lib.rs b/src/compute/src/lib.rs index d91fb56b1cb88..1336a84980cea 100644 --- a/src/compute/src/lib.rs +++ b/src/compute/src/lib.rs @@ -16,7 +16,6 @@ #![feature(coroutines)] #![feature(type_alias_impl_trait)] #![feature(let_chains)] -#![feature(lint_reasons)] #![feature(impl_trait_in_assoc_type)] #![cfg_attr(coverage, feature(coverage_attribute))] @@ -103,8 +102,9 @@ pub struct ComputeNodeOpts { pub role: Role, /// Used for control the metrics level, similar to log level. - /// 0 = disable metrics - /// >0 = enable metrics + /// + /// level = 0: disable metrics + /// level > 0: enable metrics #[clap(long, hide = true, env = "RW_METRICS_LEVEL")] #[override_opts(path = server.metrics_level)] pub metrics_level: Option, diff --git a/src/compute/src/rpc/service/stream_service.rs b/src/compute/src/rpc/service/stream_service.rs index eb055a174b3ea..6253cfe74c730 100644 --- a/src/compute/src/rpc/service/stream_service.rs +++ b/src/compute/src/rpc/service/stream_service.rs @@ -40,20 +40,6 @@ impl StreamService for StreamServiceImpl { type StreamingControlStreamStream = impl Stream>; - #[cfg_attr(coverage, coverage(off))] - async fn drop_actors( - &self, - request: Request, - ) -> std::result::Result, Status> { - let req = request.into_inner(); - let actors = req.actor_ids; - self.mgr.drop_actors(actors).await?; - Ok(Response::new(DropActorsResponse { - request_id: req.request_id, - status: None, - })) - } - #[cfg_attr(coverage, coverage(off))] async fn wait_epoch_commit( &self, diff --git a/src/config/docs.md b/src/config/docs.md index 47905d71e5e0c..bcce61d8bb456 100644 --- a/src/config/docs.md +++ b/src/config/docs.md @@ -119,6 +119,7 @@ This page is automatically generated by `./risedev generate-example-config` | enable_fast_compaction | | true | | high_priority_ratio_in_percent | DEPRECATED: This config will be deprecated in the future version, use `storage.cache.block_cache_eviction.high_priority_ratio_in_percent` with `storage.cache.block_cache_eviction.algorithm = "Lru"` instead. | | | imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. 
| 0 | +| max_cached_recent_versions_number | | | | max_concurrent_compaction_task_number | | 16 | | max_prefetch_block_number | max prefetch block number | 16 | | max_preload_io_retry_times | | 3 | diff --git a/src/config/example.toml b/src/config/example.toml index c81b35163eafa..f3c127cdc7825 100644 --- a/src/config/example.toml +++ b/src/config/example.toml @@ -81,6 +81,8 @@ meta_enable_trivial_move = true meta_enable_check_task_level_overlap = false meta_max_trivial_move_task_count_per_loop = 256 meta_max_get_task_probe_times = 5 +meta_actor_cnt_per_worker_parallelism_soft_limit = 100 +meta_actor_cnt_per_worker_parallelism_hard_limit = 400 [batch] enable_barrier_read = false diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml index d87e89c1cf65d..3801508a7aa19 100644 --- a/src/connector/Cargo.toml +++ b/src/connector/Cargo.toml @@ -76,7 +76,7 @@ jni = { version = "0.21.1", features = ["invocation"] } jsonbb = { workspace = true } jsonwebtoken = "9.2.0" maplit = "1.0.2" -moka = { version = "0.12", features = ["future"] } +moka = { version = "0.12.0", features = ["future"] } mongodb = { version = "2.8.2", features = ["tokio-runtime"] } mysql_async = { version = "0.34", default-features = false, features = [ "default", @@ -103,7 +103,7 @@ pg_bigdecimal = { git = "https://github.com/risingwavelabs/rust-pg_bigdecimal", postgres-openssl = "0.5.0" prometheus = { version = "0.13", features = ["process"] } prost = { workspace = true, features = ["no-recursion-limit"] } -prost-reflect = "0.14" +prost-reflect = { version = "0.14", features = ["serde"] } prost-types = "0.13" protobuf-native = "0.2.2" pulsar = { version = "6.3", default-features = false, features = [ diff --git a/src/connector/codec/src/decoder/mod.rs b/src/connector/codec/src/decoder/mod.rs index 814e06a166c6c..bbfdbf0a90d79 100644 --- a/src/connector/codec/src/decoder/mod.rs +++ b/src/connector/codec/src/decoder/mod.rs @@ -38,6 +38,9 @@ pub enum AccessError { #[error("Unsupported additional column `{name}`")] UnsupportedAdditionalColumn { name: String }, + #[error("Fail to convert protobuf Any into jsonb: {0}")] + ProtobufAnyToJson(#[source] serde_json::Error), + /// Errors that are not categorized into variants above. #[error("{message}")] Uncategorized { message: String }, diff --git a/src/connector/codec/src/lib.rs b/src/connector/codec/src/lib.rs index cbf0ad14046f7..2119c1ece4e57 100644 --- a/src/connector/codec/src/lib.rs +++ b/src/connector/codec/src/lib.rs @@ -21,7 +21,6 @@ #![feature(stmt_expr_attributes)] #![feature(box_patterns)] #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(box_into_inner)] #![feature(type_alias_impl_trait)] diff --git a/src/connector/src/connector_common/common.rs b/src/connector/src/connector_common/common.rs index b522ae2eda560..9f4211aedd4d9 100644 --- a/src/connector/src/connector_common/common.rs +++ b/src/connector/src/connector_common/common.rs @@ -192,14 +192,26 @@ pub struct KafkaCommon { #[serde(rename = "properties.ssl.ca.location")] ssl_ca_location: Option, + /// CA certificate string (PEM format) for verifying the broker's key. + #[serde(rename = "properties.ssl.ca.pem")] + ssl_ca_pem: Option, + /// Path to client's certificate file (PEM). #[serde(rename = "properties.ssl.certificate.location")] ssl_certificate_location: Option, + /// Client's public key string (PEM format) used for authentication. + #[serde(rename = "properties.ssl.certificate.pem")] + ssl_certificate_pem: Option, + /// Path to client's private key file (PEM). 
#[serde(rename = "properties.ssl.key.location")] ssl_key_location: Option, + /// Client's private key string (PEM format) used for authentication. + #[serde(rename = "properties.ssl.key.pem")] + ssl_key_pem: Option, + /// Passphrase of client's private key. #[serde(rename = "properties.ssl.key.password")] ssl_key_password: Option, @@ -325,12 +337,21 @@ impl KafkaCommon { if let Some(ssl_ca_location) = self.ssl_ca_location.as_ref() { config.set("ssl.ca.location", ssl_ca_location); } + if let Some(ssl_ca_pem) = self.ssl_ca_pem.as_ref() { + config.set("ssl.ca.pem", ssl_ca_pem); + } if let Some(ssl_certificate_location) = self.ssl_certificate_location.as_ref() { config.set("ssl.certificate.location", ssl_certificate_location); } + if let Some(ssl_certificate_pem) = self.ssl_certificate_pem.as_ref() { + config.set("ssl.certificate.pem", ssl_certificate_pem); + } if let Some(ssl_key_location) = self.ssl_key_location.as_ref() { config.set("ssl.key.location", ssl_key_location); } + if let Some(ssl_key_pem) = self.ssl_key_pem.as_ref() { + config.set("ssl.key.pem", ssl_key_pem); + } if let Some(ssl_key_password) = self.ssl_key_password.as_ref() { config.set("ssl.key.password", ssl_key_password); } diff --git a/src/connector/src/lib.rs b/src/connector/src/lib.rs index 6ee28a2161aa1..f66b5116c110b 100644 --- a/src/connector/src/lib.rs +++ b/src/connector/src/lib.rs @@ -19,7 +19,6 @@ #![feature(stmt_expr_attributes)] #![feature(box_patterns)] #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(box_into_inner)] #![feature(type_alias_impl_trait)] diff --git a/src/connector/src/parser/additional_columns.rs b/src/connector/src/parser/additional_columns.rs index c30f5f74ba390..645220b401c5a 100644 --- a/src/connector/src/parser/additional_columns.rs +++ b/src/connector/src/parser/additional_columns.rs @@ -24,15 +24,15 @@ use risingwave_pb::plan_common::additional_column::ColumnType as AdditionalColum use risingwave_pb::plan_common::{ AdditionalCollectionName, AdditionalColumn, AdditionalColumnFilename, AdditionalColumnHeader, AdditionalColumnHeaders, AdditionalColumnKey, AdditionalColumnOffset, - AdditionalColumnPartition, AdditionalColumnTimestamp, AdditionalDatabaseName, - AdditionalSchemaName, AdditionalTableName, + AdditionalColumnPartition, AdditionalColumnPayload, AdditionalColumnTimestamp, + AdditionalDatabaseName, AdditionalSchemaName, AdditionalTableName, }; use crate::error::ConnectorResult; use crate::source::cdc::MONGODB_CDC_CONNECTOR; use crate::source::{ AZBLOB_CONNECTOR, GCS_CONNECTOR, KAFKA_CONNECTOR, KINESIS_CONNECTOR, OPENDAL_S3_CONNECTOR, - POSIX_FS_CONNECTOR, PULSAR_CONNECTOR, S3_CONNECTOR, + POSIX_FS_CONNECTOR, PULSAR_CONNECTOR, }; // Hidden additional columns connectors which do not support `include` syntax. 
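// Sketch (annotation, not part of the patch): the new `properties.ssl.ca.pem`,
// `properties.ssl.certificate.pem` and `properties.ssl.key.pem` options added above let a
// Kafka connector carry its TLS material inline in the WITH clause instead of pointing at
// files on the compute nodes. `set_security_properties` forwards them verbatim to
// librdkafka's `ssl.ca.pem`, `ssl.certificate.pem` and `ssl.key.pem` settings, roughly as
// in this standalone sketch (rdkafka crate assumed):
fn apply_inline_tls(
    config: &mut rdkafka::ClientConfig,
    ca_pem: Option<&str>,
    cert_pem: Option<&str>,
    key_pem: Option<&str>,
) {
    // Each option is only set when the user provided it, mirroring the optional fields
    // on `KafkaCommon`.
    if let Some(pem) = ca_pem {
        config.set("ssl.ca.pem", pem);
    }
    if let Some(pem) = cert_pem {
        config.set("ssl.certificate.pem", pem);
    }
    if let Some(pem) = key_pem {
        config.set("ssl.key.pem", pem);
    }
}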
@@ -44,21 +44,36 @@ pub static COMPATIBLE_ADDITIONAL_COLUMNS: LazyLock ColumnDesc::named_with_additional_column( + column_name, + column_id, + DataType::Jsonb, + AdditionalColumn { + column_type: Some(AdditionalColumnType::Payload(AdditionalColumnPayload {})), + }, + ), "offset" => ColumnDesc::named_with_additional_column( column_name, column_id, diff --git a/src/connector/src/parser/mod.rs b/src/connector/src/parser/mod.rs index 4b14654bf518d..a49390c2752f4 100644 --- a/src/connector/src/parser/mod.rs +++ b/src/connector/src/parser/mod.rs @@ -488,6 +488,11 @@ impl SourceStreamChunkRowWriter<'_> { .map(|ele| ScalarRefImpl::Utf8(ele.split_id)), )); } + (_, &Some(AdditionalColumnType::Payload(_))) => { + // ingest the whole payload as a single column + // do special logic in `KvEvent::access_field` + parse_field(desc) + } (_, _) => { // For normal columns, call the user provided closure. parse_field(desc) diff --git a/src/connector/src/parser/plain_parser.rs b/src/connector/src/parser/plain_parser.rs index f1ac65d79a654..e9c9436fd295f 100644 --- a/src/connector/src/parser/plain_parser.rs +++ b/src/connector/src/parser/plain_parser.rs @@ -297,10 +297,9 @@ mod tests { .unwrap() .into_iter() .filter(|c| c.cardinality() > 0) - .map(|c| { + .inspect(|c| { // 5 data messages in a single chunk assert_eq!(5, c.cardinality()); - c }) .collect_vec(); diff --git a/src/connector/src/parser/protobuf/parser.rs b/src/connector/src/parser/protobuf/parser.rs index 8be25074f6295..bbd1d3f0da1e3 100644 --- a/src/connector/src/parser/protobuf/parser.rs +++ b/src/connector/src/parser/protobuf/parser.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use anyhow::Context; use itertools::Itertools; use prost_reflect::{ @@ -22,19 +20,16 @@ use prost_reflect::{ }; use risingwave_common::array::{ListValue, StructValue}; use risingwave_common::types::{ - DataType, Datum, DatumCow, Decimal, JsonbRef, JsonbVal, ScalarImpl, ScalarRefImpl, ToDatumRef, - ToOwnedDatum, F32, F64, + DataType, DatumCow, Decimal, JsonbVal, ScalarImpl, ToOwnedDatum, F32, F64, }; use risingwave_common::{bail, try_match_expand}; use risingwave_pb::plan_common::{AdditionalColumn, ColumnDesc, ColumnDescVersion}; use thiserror::Error; -use thiserror_ext::{AsReport, Macro}; +use thiserror_ext::Macro; use crate::error::ConnectorResult; use crate::parser::unified::protobuf::ProtobufAccess; -use crate::parser::unified::{ - bail_uncategorized, uncategorized, AccessError, AccessImpl, AccessResult, -}; +use crate::parser::unified::{uncategorized, AccessError, AccessImpl, AccessResult}; use crate::parser::util::bytes_from_url; use crate::parser::{AccessBuilder, EncodingProperties}; use crate::schema::schema_registry::{extract_schema_id, handle_sr_list, Client, WireFormatError}; @@ -44,7 +39,6 @@ use crate::schema::SchemaLoader; pub struct ProtobufAccessBuilder { confluent_wire_type: bool, message_descriptor: MessageDescriptor, - descriptor_pool: Arc, } impl AccessBuilder for ProtobufAccessBuilder { @@ -59,10 +53,7 @@ impl AccessBuilder for ProtobufAccessBuilder { let message = DynamicMessage::decode(self.message_descriptor.clone(), payload) .context("failed to parse message")?; - Ok(AccessImpl::Protobuf(ProtobufAccess::new( - message, - Arc::clone(&self.descriptor_pool), - ))) + Ok(AccessImpl::Protobuf(ProtobufAccess::new(message))) } } @@ -71,13 +62,11 @@ impl ProtobufAccessBuilder { let ProtobufParserConfig { confluent_wire_type, message_descriptor, - 
descriptor_pool, } = config; Ok(Self { confluent_wire_type, message_descriptor, - descriptor_pool, }) } } @@ -86,8 +75,6 @@ impl ProtobufAccessBuilder { pub struct ProtobufParserConfig { confluent_wire_type: bool, pub(crate) message_descriptor: MessageDescriptor, - /// Note that the pub(crate) here is merely for testing - pub(crate) descriptor_pool: Arc, } impl ProtobufParserConfig { @@ -132,7 +119,6 @@ impl ProtobufParserConfig { Ok(Self { message_descriptor, confluent_wire_type: protobuf_config.use_schema_registry, - descriptor_pool: Arc::new(pool), }) } @@ -216,141 +202,10 @@ fn detect_loop_and_push( Ok(()) } -fn extract_any_info(dyn_msg: &DynamicMessage) -> (String, Value) { - debug_assert!( - dyn_msg.fields().count() == 2, - "Expected only two fields for Any Type MessageDescriptor" - ); - - let type_url = dyn_msg - .get_field_by_name("type_url") - .expect("Expect type_url in dyn_msg") - .to_string() - .split('/') - .nth(1) - .map(|part| part[..part.len() - 1].to_string()) - .unwrap_or_default(); - - let payload = dyn_msg - .get_field_by_name("value") - .expect("Expect value (payload) in dyn_msg") - .as_ref() - .clone(); - - (type_url, payload) -} - -/// TODO: Resolve the potential naming conflict in the map -/// i.e., If the two anonymous type shares the same key (e.g., "Int32"), -/// the latter will overwrite the former one in `serde_json::Map`. -/// Possible solution, maintaining a global id map, for the same types -/// In the same level of fields, add the unique id at the tail of the name. -/// e.g., "Int32.1" & "Int32.2" in the above example -fn recursive_parse_json( - fields: &[Datum], - full_name_vec: Option>, - full_name: Option, -) -> serde_json::Value { - // Note that the key is of no order - let mut ret: serde_json::Map = serde_json::Map::new(); - - // The hidden type hint for user's convenience - // i.e., `"_type": message.full_name()` - if let Some(full_name) = full_name { - ret.insert("_type".to_string(), serde_json::Value::String(full_name)); - } - - for (idx, field) in fields.iter().enumerate() { - let mut key; - if let Some(k) = full_name_vec.as_ref() { - key = k[idx].to_string(); - } else { - key = "".to_string(); - } - - match field.clone() { - Some(ScalarImpl::Int16(v)) => { - if key.is_empty() { - key = "Int16".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Int32(v)) => { - if key.is_empty() { - key = "Int32".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Int64(v)) => { - if key.is_empty() { - key = "Int64".to_string(); - } - ret.insert(key, serde_json::Value::Number(serde_json::Number::from(v))); - } - Some(ScalarImpl::Bool(v)) => { - if key.is_empty() { - key = "Bool".to_string(); - } - ret.insert(key, serde_json::Value::Bool(v)); - } - Some(ScalarImpl::Bytea(v)) => { - if key.is_empty() { - key = "Bytea".to_string(); - } - let s = String::from_utf8(v.to_vec()).unwrap(); - ret.insert(key, serde_json::Value::String(s)); - } - Some(ScalarImpl::Float32(v)) => { - if key.is_empty() { - key = "Int16".to_string(); - } - ret.insert( - key, - serde_json::Value::Number( - serde_json::Number::from_f64(v.into_inner() as f64).unwrap(), - ), - ); - } - Some(ScalarImpl::Float64(v)) => { - if key.is_empty() { - key = "Float64".to_string(); - } - ret.insert( - key, - serde_json::Value::Number( - serde_json::Number::from_f64(v.into_inner()).unwrap(), - ), - ); - } - Some(ScalarImpl::Utf8(v)) => { - if key.is_empty() { - key = "Utf8".to_string(); 
- } - ret.insert(key, serde_json::Value::String(v.to_string())); - } - Some(ScalarImpl::Struct(v)) => { - if key.is_empty() { - key = "Struct".to_string(); - } - ret.insert(key, recursive_parse_json(v.fields(), None, None)); - } - Some(ScalarImpl::Jsonb(v)) => { - if key.is_empty() { - key = "Jsonb".to_string(); - } - ret.insert(key, v.take()); - } - r#type => panic!("Not yet support ScalarImpl type: {:?}", r#type), - } - } - - serde_json::Value::Object(ret) -} - pub fn from_protobuf_value<'a>( field_desc: &FieldDescriptor, value: &'a Value, - descriptor_pool: &Arc, + type_expected: &DataType, ) -> AccessResult> { let kind = field_desc.kind(); @@ -382,91 +237,46 @@ pub fn from_protobuf_value<'a>( } Value::Message(dyn_msg) => { if dyn_msg.descriptor().full_name() == "google.protobuf.Any" { - // If the fields are not presented, default value is an empty string - if !dyn_msg.has_field_by_name("type_url") || !dyn_msg.has_field_by_name("value") { - borrowed!(JsonbRef::empty_string()); - } - - // Sanity check - debug_assert!( - dyn_msg.has_field_by_name("type_url") && dyn_msg.has_field_by_name("value"), - "`type_url` & `value` must exist in fields of `dyn_msg`" - ); - - // The message is of type `Any` - let (type_url, payload) = extract_any_info(dyn_msg); - - let payload_field_desc = dyn_msg.descriptor().get_field_by_name("value").unwrap(); - - let payload = from_protobuf_value(&payload_field_desc, &payload, descriptor_pool)?; - let Some(ScalarRefImpl::Bytea(payload)) = payload.to_datum_ref() else { - bail_uncategorized!("expected bytes for dynamic message payload"); - }; - - // Get the corresponding schema from the descriptor pool - let msg_desc = descriptor_pool - .get_message_by_name(&type_url) - .ok_or_else(|| { - uncategorized!("message `{type_url}` not found in descriptor pool") - })?; - - let f = msg_desc - .clone() - .fields() - .map(|f| f.name().to_string()) - .collect::>(); - - let full_name = msg_desc.clone().full_name().to_string(); - - // Decode the payload based on the `msg_desc` - let decoded_value = DynamicMessage::decode(msg_desc, payload).unwrap(); - let decoded_value = from_protobuf_value( - field_desc, - &Value::Message(decoded_value), - descriptor_pool, - )? 
- .to_owned_datum() - .unwrap(); - - // Extract the struct value - let ScalarImpl::Struct(v) = decoded_value else { - panic!("Expect ScalarImpl::Struct"); + ScalarImpl::Jsonb(JsonbVal::from( + serde_json::to_value(dyn_msg).map_err(AccessError::ProtobufAnyToJson)?, + )) + } else { + let desc = dyn_msg.descriptor(); + let DataType::Struct(st) = type_expected else { + return Err(AccessError::TypeError { + expected: type_expected.to_string(), + got: desc.full_name().to_string(), + value: value.to_string(), // Protobuf TEXT + }); }; - ScalarImpl::Jsonb(JsonbVal::from(serde_json::json!(recursive_parse_json( - v.fields(), - Some(f), - Some(full_name), - )))) - } else { - let mut rw_values = Vec::with_capacity(dyn_msg.descriptor().fields().len()); - // fields is a btree map in descriptor - // so it's order is the same as datatype - for field_desc in dyn_msg.descriptor().fields() { - // missing field - if !dyn_msg.has_field(&field_desc) - && field_desc.cardinality() == Cardinality::Required - { - return Err(AccessError::Undefined { - name: field_desc.name().to_owned(), - path: dyn_msg.descriptor().full_name().to_owned(), - }); - } - // use default value if dyn_msg doesn't has this field + let mut rw_values = Vec::with_capacity(st.len()); + for (name, expected_field_type) in st.iter() { + let Some(field_desc) = desc.get_field_by_name(name) else { + // Field deleted in protobuf. Fallback to SQL NULL (of proper RW type). + rw_values.push(None); + continue; + }; let value = dyn_msg.get_field(&field_desc); rw_values.push( - from_protobuf_value(&field_desc, &value, descriptor_pool)?.to_owned_datum(), + from_protobuf_value(&field_desc, &value, expected_field_type)? + .to_owned_datum(), ); } ScalarImpl::Struct(StructValue::new(rw_values)) } } Value::List(values) => { - let data_type = protobuf_type_mapping(field_desc, &mut vec![]) - .map_err(|e| uncategorized!("{}", e.to_report_string()))?; - let mut builder = data_type.as_list().create_array_builder(values.len()); + let DataType::List(element_type) = type_expected else { + return Err(AccessError::TypeError { + expected: type_expected.to_string(), + got: format!("repeated {:?}", kind), + value: value.to_string(), // Protobuf TEXT + }); + }; + let mut builder = element_type.create_array_builder(values.len()); for value in values { - builder.append(from_protobuf_value(field_desc, value, descriptor_pool)?); + builder.append(from_protobuf_value(field_desc, value, element_type)?); } ScalarImpl::List(ListValue::new(builder.finish())) } @@ -498,25 +308,18 @@ fn protobuf_type_mapping( } Kind::Uint64 | Kind::Fixed64 => DataType::Decimal, Kind::String => DataType::Varchar, - Kind::Message(m) => { - let fields = m - .fields() - .map(|f| protobuf_type_mapping(&f, parse_trace)) - .try_collect()?; - let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); - - // Note that this part is useful for actual parsing - // Since RisingWave will parse message to `ScalarImpl::Jsonb` - // Please do NOT modify it - if field_names.len() == 2 - && field_names.contains(&"value".to_string()) - && field_names.contains(&"type_url".to_string()) - { - DataType::Jsonb - } else { + Kind::Message(m) => match m.full_name() { + // Well-Known Types are identified by their full name + "google.protobuf.Any" => DataType::Jsonb, + _ => { + let fields = m + .fields() + .map(|f| protobuf_type_mapping(&f, parse_trace)) + .try_collect()?; + let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); DataType::new_struct(fields, field_names) } - } + }, Kind::Enum(_) => 
DataType::Varchar, Kind::Bytes => DataType::Bytea, }; @@ -597,6 +400,7 @@ mod test { use risingwave_pb::data::data_type::PbTypeName; use risingwave_pb::plan_common::{PbEncodeType, PbFormatType}; use serde_json::json; + use thiserror_ext::AsReport as _; use super::*; use crate::parser::protobuf::recursive::all_types::{EnumType, ExampleOneof, NestedMessage}; @@ -904,7 +708,8 @@ mod test { } fn pb_eq(a: &ProtobufAccess, field_name: &str, value: ScalarImpl) { - let dummy_type = DataType::Varchar; + let field = a.descriptor().get_field_by_name(field_name).unwrap(); + let dummy_type = protobuf_type_mapping(&field, &mut vec![]).unwrap(); let d = a.access_owned(&[field_name], &dummy_type).unwrap().unwrap(); assert_eq!(d, value, "field: {} value: {:?}", field_name, d); } @@ -964,49 +769,35 @@ mod test { println!("Current conf: {:#?}", conf); println!("---------------------------"); - let value = + let message = DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA).unwrap(); - println!("Test ANY_GEN_PROTO_DATA, current value: {:#?}", value); + println!("Test ANY_GEN_PROTO_DATA, current value: {:#?}", message); println!("---------------------------"); - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } + let ret = from_protobuf_value(&field, &value, &DataType::Jsonb) + .unwrap() + .to_owned_datum(); + println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); + println!("---------------------------"); - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "_type": "test.StringValue", - "value": "John Doe" - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), + match ret { + Some(ScalarImpl::Jsonb(jv)) => { + assert_eq!( + jv, + JsonbVal::from(json!({ + "@type": "type.googleapis.com/test.StringValue", + "value": "John Doe" + })) + ); } + _ => panic!("Expected ScalarImpl::Jsonb"), } Ok(()) @@ -1027,49 +818,35 @@ mod test { println!("Current conf: {:#?}", conf); println!("---------------------------"); - let value = + let message = DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_1).unwrap(); - println!("Current Value: {:#?}", value); + println!("Current Value: {:#?}", message); println!("---------------------------"); - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); - 
match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } + let ret = from_protobuf_value(&field, &value, &DataType::Jsonb) + .unwrap() + .to_owned_datum(); + println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); + println!("---------------------------"); - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "_type": "test.Int32Value", - "value": 114514 - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), + match ret { + Some(ScalarImpl::Jsonb(jv)) => { + assert_eq!( + jv, + JsonbVal::from(json!({ + "@type": "type.googleapis.com/test.Int32Value", + "value": 114514 + })) + ); } + _ => panic!("Expected ScalarImpl::Jsonb"), } Ok(()) @@ -1098,60 +875,80 @@ mod test { println!("Current conf: {:#?}", conf); println!("---------------------------"); - let value = DynamicMessage::decode( + let message = DynamicMessage::decode( conf.message_descriptor.clone(), ANY_RECURSIVE_GEN_PROTO_DATA, ) .unwrap(); - println!("Current Value: {:#?}", value); + println!("Current Value: {:#?}", message); + println!("---------------------------"); + + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); + + let ret = from_protobuf_value(&field, &value, &DataType::Jsonb) + .unwrap() + .to_owned_datum(); + println!("Decoded Value for ANY_RECURSIVE_GEN_PROTO_DATA: {:#?}", ret); println!("---------------------------"); - // This is of no use - let field = value.fields().next().unwrap().0; + match ret { + Some(ScalarImpl::Jsonb(jv)) => { + assert_eq!( + jv, + JsonbVal::from(json!({ + "@type": "type.googleapis.com/test.AnyValue", + "anyValue1": { + "@type": "type.googleapis.com/test.StringValue", + "value": "114514", + }, + "anyValue2": { + "@type": "type.googleapis.com/test.Int32Value", + "value": 114514, + } + })) + ); + } + _ => panic!("Expected ScalarImpl::Jsonb"), + } - if let Some(ret) = - from_protobuf_value(&field, &Value::Message(value), &conf.descriptor_pool) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_RECURSIVE_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); + Ok(()) + } - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; + // id: 12345 + // any_value: { + // type_url: "type.googleapis.com/test.StringXalue" + // value: "\n\010John Doe" + // } + static ANY_GEN_PROTO_DATA_INVALID: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x58\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; - let fields = struct_value.fields(); + #[tokio::test] + async fn test_any_invalid() -> crate::error::ConnectorResult<()> { + let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } + let message = + DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_INVALID) + .unwrap(); - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "_type": "test.AnyValue", - "any_value_1": { - "_type": "test.StringValue", - "value": "114514", - }, - "any_value_2": { - "_type": 
"test.Int32Value", - "value": 114514, - } - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), - } - } + let field = conf + .message_descriptor + .get_field_by_name("any_value") + .unwrap(); + let value = message.get_field(&field); + + let err = from_protobuf_value(&field, &value, &DataType::Jsonb).unwrap_err(); + + let expected = expect_test::expect![[r#" + Fail to convert protobuf Any into jsonb + + Caused by: + message 'test.StringXalue' not found + "#]]; + expected.assert_eq(err.to_report_string_pretty().as_str()); Ok(()) } diff --git a/src/connector/src/parser/unified/json.rs b/src/connector/src/parser/unified/json.rs index ca709e2eebc73..8ee8f9fe9386f 100644 --- a/src/connector/src/parser/unified/json.rs +++ b/src/connector/src/parser/unified/json.rs @@ -646,6 +646,7 @@ impl<'a> JsonAccess<'a> { impl Access for JsonAccess<'_> { fn access<'a>(&'a self, path: &[&str], type_expected: &DataType) -> AccessResult> { let mut value = &self.value; + for (idx, &key) in path.iter().enumerate() { if let Some(sub_value) = if self.options.ignoring_keycase { json_object_get_case_insensitive(value, key) diff --git a/src/connector/src/parser/unified/kv_event.rs b/src/connector/src/parser/unified/kv_event.rs index 7e52d2f4c3c24..6ab7925b9bb48 100644 --- a/src/connector/src/parser/unified/kv_event.rs +++ b/src/connector/src/parser/unified/kv_event.rs @@ -79,6 +79,9 @@ where pub fn access_field(&self, desc: &SourceColumnDesc) -> AccessResult> { match desc.additional_column.column_type { Some(AdditionalColumnType::Key(_)) => self.access_key(&[&desc.name], &desc.data_type), + // hack here: Get the whole payload as a single column + // use a special mark empty slice as path to represent the whole payload + Some(AdditionalColumnType::Payload(_)) => self.access_value(&[], &desc.data_type), None => self.access_value(&[&desc.name], &desc.data_type), _ => unreachable!(), } diff --git a/src/connector/src/parser/unified/mod.rs b/src/connector/src/parser/unified/mod.rs index 8045ce0132401..fdfe3aae6aaee 100644 --- a/src/connector/src/parser/unified/mod.rs +++ b/src/connector/src/parser/unified/mod.rs @@ -17,9 +17,7 @@ use auto_impl::auto_impl; use risingwave_common::types::{DataType, DatumCow}; use risingwave_connector_codec::decoder::avro::AvroAccess; -pub use risingwave_connector_codec::decoder::{ - bail_uncategorized, uncategorized, Access, AccessError, AccessResult, -}; +pub use risingwave_connector_codec::decoder::{uncategorized, Access, AccessError, AccessResult}; use self::bytes::BytesAccess; use self::json::JsonAccess; diff --git a/src/connector/src/parser/unified/protobuf.rs b/src/connector/src/parser/unified/protobuf.rs index 02febc22db247..3ebeebca44373 100644 --- a/src/connector/src/parser/unified/protobuf.rs +++ b/src/connector/src/parser/unified/protobuf.rs @@ -13,9 +13,9 @@ // limitations under the License. 
use std::borrow::Cow; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; -use prost_reflect::{DescriptorPool, DynamicMessage, ReflectMessage}; +use prost_reflect::{DynamicMessage, ReflectMessage}; use risingwave_common::log::LogSuppresser; use risingwave_common::types::{DataType, DatumCow, ToOwnedDatum}; use thiserror_ext::AsReport; @@ -26,24 +26,21 @@ use crate::parser::unified::uncategorized; pub struct ProtobufAccess { message: DynamicMessage, - descriptor_pool: Arc, } impl ProtobufAccess { - pub fn new(message: DynamicMessage, descriptor_pool: Arc) -> Self { - Self { - message, - descriptor_pool, - } + pub fn new(message: DynamicMessage) -> Self { + Self { message } + } + + #[cfg(test)] + pub fn descriptor(&self) -> prost_reflect::MessageDescriptor { + self.message.descriptor() } } impl Access for ProtobufAccess { - fn access<'a>( - &'a self, - path: &[&str], - _type_expected: &DataType, - ) -> AccessResult> { + fn access<'a>(&'a self, path: &[&str], type_expected: &DataType) -> AccessResult> { debug_assert_eq!(1, path.len()); let field_desc = self .message @@ -59,10 +56,10 @@ impl Access for ProtobufAccess { })?; match self.message.get_field(&field_desc) { - Cow::Borrowed(value) => from_protobuf_value(&field_desc, value, &self.descriptor_pool), + Cow::Borrowed(value) => from_protobuf_value(&field_desc, value, type_expected), // `Owned` variant occurs only if there's no such field and the default value is returned. - Cow::Owned(value) => from_protobuf_value(&field_desc, &value, &self.descriptor_pool) + Cow::Owned(value) => from_protobuf_value(&field_desc, &value, type_expected) // enforce `Owned` variant to avoid returning a reference to a temporary value .map(|d| d.to_owned_datum().into()), } diff --git a/src/connector/src/sink/big_query.rs b/src/connector/src/sink/big_query.rs index 22146e86d0d1d..235b1ff5b6539 100644 --- a/src/connector/src/sink/big_query.rs +++ b/src/connector/src/sink/big_query.rs @@ -12,19 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use core::pin::Pin; use core::time::Duration; -use std::collections::{BTreeMap, HashMap}; -use std::sync::Arc; +use std::collections::{BTreeMap, HashMap, VecDeque}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; +use futures::future::pending; +use futures::prelude::Future; +use futures::{Stream, StreamExt}; +use futures_async_stream::try_stream; use gcp_bigquery_client::error::BQError; use gcp_bigquery_client::model::query_request::QueryRequest; use gcp_bigquery_client::model::table::Table; use gcp_bigquery_client::model::table_field_schema::TableFieldSchema; use gcp_bigquery_client::model::table_schema::TableSchema; use gcp_bigquery_client::Client; -use google_cloud_bigquery::grpc::apiv1::bigquery_client::StreamingWriteClient; use google_cloud_bigquery::grpc::apiv1::conn_pool::{WriteConnectionManager, DOMAIN}; use google_cloud_gax::conn::{ConnectionOptions, Environment}; use google_cloud_gax::grpc::Request; @@ -32,7 +34,7 @@ use google_cloud_googleapis::cloud::bigquery::storage::v1::append_rows_request:: ProtoData, Rows as AppendRowsRequestRows, }; use google_cloud_googleapis::cloud::bigquery::storage::v1::{ - AppendRowsRequest, ProtoRows, ProtoSchema, + AppendRowsRequest, AppendRowsResponse, ProtoRows, ProtoSchema, }; use google_cloud_pubsub::client::google_cloud_auth; use google_cloud_pubsub::client::google_cloud_auth::credentials::CredentialsFile; @@ -42,32 +44,35 @@ use prost_types::{ FileDescriptorSet, }; use risingwave_common::array::{Op, StreamChunk}; -use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{Field, Schema}; use risingwave_common::types::DataType; use serde_derive::Deserialize; use serde_with::{serde_as, DisplayFromStr}; use simd_json::prelude::ArrayTrait; +use tokio::sync::mpsc; +use tonic::{async_trait, Response, Status}; use url::Url; use uuid::Uuid; use with_options::WithOptions; use yup_oauth2::ServiceAccountKey; use super::encoder::{ProtoEncoder, ProtoHeader, RowEncoder, SerTo}; -use super::writer::LogSinkerOf; -use super::{SinkError, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT}; +use super::log_store::{LogStoreReadItem, TruncateOffset}; +use super::{ + LogSinker, SinkError, SinkLogReader, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, +}; use crate::aws_utils::load_file_descriptor_from_s3; use crate::connector_common::AwsAuthProps; -use crate::sink::writer::SinkWriterExt; -use crate::sink::{ - DummySinkCommitCoordinator, Result, Sink, SinkParam, SinkWriter, SinkWriterParam, -}; +use crate::sink::{DummySinkCommitCoordinator, Result, Sink, SinkParam, SinkWriterParam}; pub const BIGQUERY_SINK: &str = "bigquery"; pub const CHANGE_TYPE: &str = "_CHANGE_TYPE"; const DEFAULT_GRPC_CHANNEL_NUMS: usize = 4; const CONNECT_TIMEOUT: Option = Some(Duration::from_secs(30)); const CONNECTION_TIMEOUT: Option = None; +const BIGQUERY_SEND_FUTURE_BUFFER_MAX_SIZE: usize = 65536; +// < 10MB, we set 8MB +const MAX_ROW_SIZE: usize = 8 * 1024 * 1024; #[serde_as] #[derive(Deserialize, Debug, Clone, WithOptions)] @@ -82,23 +87,100 @@ pub struct BigQueryCommon { pub dataset: String, #[serde(rename = "bigquery.table")] pub table: String, - #[serde(rename = "bigquery.max_batch_rows", default = "default_max_batch_rows")] - #[serde_as(as = "DisplayFromStr")] - pub max_batch_rows: usize, - #[serde(rename = "bigquery.retry_times", default = "default_retry_times")] - #[serde_as(as = "DisplayFromStr")] - pub retry_times: usize, #[serde(default)] // default false #[serde_as(as = "DisplayFromStr")] pub auto_create: bool, } -fn 
default_max_batch_rows() -> usize { - 1024 +struct BigQueryFutureManager { + // `offset_queue` holds the Some corresponding to each future. + // When TruncateOffset is barrier, the num is 0, we don't need to wait for the return of `resp_stream`. + // When TruncateOffset is chunk: + // 1. chunk has no rows. we didn't send, the num is 0, we don't need to wait for the return of `resp_stream`. + // 2. chunk is less than `MAX_ROW_SIZE`, we only sent once, the num is 1 and we only have to wait once for `resp_stream`. + // 3. chunk is less than `MAX_ROW_SIZE`, we only sent n, the num is n and we need to wait n times for r. + offset_queue: VecDeque<(TruncateOffset, usize)>, + resp_stream: Pin> + Send>>, } +impl BigQueryFutureManager { + pub fn new( + max_future_num: usize, + resp_stream: impl Stream> + Send + 'static, + ) -> Self { + let offset_queue = VecDeque::with_capacity(max_future_num); + Self { + offset_queue, + resp_stream: Box::pin(resp_stream), + } + } + + pub fn add_offset(&mut self, offset: TruncateOffset, resp_num: usize) { + self.offset_queue.push_back((offset, resp_num)); + } -fn default_retry_times() -> usize { - 5 + pub async fn next_offset(&mut self) -> Result { + if let Some((_offset, remaining_resp_num)) = self.offset_queue.front_mut() { + if *remaining_resp_num == 0 { + return Ok(self.offset_queue.pop_front().unwrap().0); + } + while *remaining_resp_num > 0 { + self.resp_stream + .next() + .await + .ok_or_else(|| SinkError::BigQuery(anyhow::anyhow!("end of stream")))??; + *remaining_resp_num -= 1; + } + Ok(self.offset_queue.pop_front().unwrap().0) + } else { + pending().await + } + } +} +pub struct BigQueryLogSinker { + writer: BigQuerySinkWriter, + bigquery_future_manager: BigQueryFutureManager, + future_num: usize, +} +impl BigQueryLogSinker { + pub fn new( + writer: BigQuerySinkWriter, + resp_stream: impl Stream> + Send + 'static, + future_num: usize, + ) -> Self { + Self { + writer, + bigquery_future_manager: BigQueryFutureManager::new(future_num, resp_stream), + future_num, + } + } +} + +#[async_trait] +impl LogSinker for BigQueryLogSinker { + async fn consume_log_and_sink(mut self, log_reader: &mut impl SinkLogReader) -> Result { + loop { + tokio::select!( + offset = self.bigquery_future_manager.next_offset() => { + log_reader.truncate(offset?)?; + } + item_result = log_reader.next_item(), if self.bigquery_future_manager.offset_queue.len() <= self.future_num => { + let (epoch, item) = item_result?; + match item { + LogStoreReadItem::StreamChunk { chunk_id, chunk } => { + let resp_num = self.writer.write_chunk(chunk)?; + self.bigquery_future_manager + .add_offset(TruncateOffset::Chunk { epoch, chunk_id },resp_num); + } + LogStoreReadItem::Barrier { .. 
} => { + self.bigquery_future_manager + .add_offset(TruncateOffset::Barrier { epoch },0); + } + LogStoreReadItem::UpdateVnodeBitmap(_) => {} + } + } + ) + } + } } impl BigQueryCommon { @@ -116,14 +198,13 @@ impl BigQueryCommon { async fn build_writer_client( &self, aws_auth_props: &AwsAuthProps, - ) -> Result { + ) -> Result<(StorageWriterClient, impl Stream>)> { let auth_json = self.get_auth_json_from_path(aws_auth_props).await?; let credentials_file = CredentialsFile::new_from_str(&auth_json) .await .map_err(|e| SinkError::BigQuery(e.into()))?; - let client = StorageWriterClient::new(credentials_file).await?; - Ok(client) + StorageWriterClient::new(credentials_file).await } async fn get_auth_json_from_path(&self, aws_auth_props: &AwsAuthProps) -> Result { @@ -342,19 +423,23 @@ impl BigQuerySink { impl Sink for BigQuerySink { type Coordinator = DummySinkCommitCoordinator; - type LogSinker = LogSinkerOf; + type LogSinker = BigQueryLogSinker; const SINK_NAME: &'static str = BIGQUERY_SINK; - async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result { - Ok(BigQuerySinkWriter::new( + async fn new_log_sinker(&self, _writer_param: SinkWriterParam) -> Result { + let (writer, resp_stream) = BigQuerySinkWriter::new( self.config.clone(), self.schema.clone(), self.pk_indices.clone(), self.is_append_only, ) - .await? - .into_log_sinker(writer_param.sink_metrics)) + .await?; + Ok(BigQueryLogSinker::new( + writer, + resp_stream, + BIGQUERY_SEND_FUTURE_BUFFER_MAX_SIZE, + )) } async fn validate(&self) -> Result<()> { @@ -446,8 +531,6 @@ pub struct BigQuerySinkWriter { message_descriptor: MessageDescriptor, write_stream: String, proto_field: Option, - write_rows: Vec, - write_rows_count: usize, } impl TryFrom for BigQuerySink { @@ -471,8 +554,8 @@ impl BigQuerySinkWriter { schema: Schema, pk_indices: Vec, is_append_only: bool, - ) -> Result { - let client = config + ) -> Result<(Self, impl Stream>)> { + let (client, resp_stream) = config .common .build_writer_client(&config.aws_auth_props) .await?; @@ -519,25 +602,26 @@ impl BigQuerySinkWriter { message_descriptor.clone(), ProtoHeader::None, )?; - Ok(Self { - write_stream: format!( - "projects/{}/datasets/{}/tables/{}/streams/_default", - config.common.project, config.common.dataset, config.common.table - ), - config, - schema, - pk_indices, - client, - is_append_only, - row_encoder, - message_descriptor, - proto_field, - writer_pb_schema: ProtoSchema { - proto_descriptor: Some(descriptor_proto), + Ok(( + Self { + write_stream: format!( + "projects/{}/datasets/{}/tables/{}/streams/_default", + config.common.project, config.common.dataset, config.common.table + ), + config, + schema, + pk_indices, + client, + is_append_only, + row_encoder, + message_descriptor, + proto_field, + writer_pb_schema: ProtoSchema { + proto_descriptor: Some(descriptor_proto), + }, }, - write_rows: vec![], - write_rows_count: 0, - }) + resp_stream, + )) } fn append_only(&mut self, chunk: StreamChunk) -> Result>> { @@ -588,82 +672,96 @@ impl BigQuerySinkWriter { Ok(serialized_rows) } - async fn write_rows(&mut self) -> Result<()> { - if self.write_rows.is_empty() { - return Ok(()); - } - let mut errs = Vec::with_capacity(self.config.common.retry_times); - for _ in 0..self.config.common.retry_times { - match self - .client - .append_rows(self.write_rows.clone(), self.write_stream.clone()) - .await - { - Ok(_) => { - self.write_rows_count = 0; - self.write_rows.clear(); - return Ok(()); - } - Err(e) => errs.push(e), - } - } - Err(SinkError::BigQuery(anyhow::anyhow!( - 
"Insert error {:?}", - errs - ))) - } -} - -#[async_trait] -impl SinkWriter for BigQuerySinkWriter { - async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> { + fn write_chunk(&mut self, chunk: StreamChunk) -> Result { let serialized_rows = if self.is_append_only { self.append_only(chunk)? } else { self.upsert(chunk)? }; - if !serialized_rows.is_empty() { - self.write_rows_count += serialized_rows.len(); + if serialized_rows.is_empty() { + return Ok(0); + } + let mut result = Vec::new(); + let mut result_inner = Vec::new(); + let mut size_count = 0; + for i in serialized_rows { + size_count += i.len(); + if size_count > MAX_ROW_SIZE { + result.push(result_inner); + result_inner = Vec::new(); + size_count = i.len(); + } + result_inner.push(i); + } + if !result_inner.is_empty() { + result.push(result_inner); + } + let len = result.len(); + for serialized_rows in result { let rows = AppendRowsRequestRows::ProtoRows(ProtoData { writer_schema: Some(self.writer_pb_schema.clone()), rows: Some(ProtoRows { serialized_rows }), }); - self.write_rows.push(rows); - - if self.write_rows_count >= self.config.common.max_batch_rows { - self.write_rows().await?; - } + self.client.append_rows(rows, self.write_stream.clone())?; } - Ok(()) - } - - async fn begin_epoch(&mut self, _epoch: u64) -> Result<()> { - Ok(()) - } - - async fn abort(&mut self) -> Result<()> { - Ok(()) + Ok(len) } +} - async fn barrier(&mut self, is_checkpoint: bool) -> Result<()> { - if is_checkpoint { - self.write_rows().await?; +#[try_stream(ok = (), error = SinkError)] +pub async fn resp_to_stream( + resp_stream: impl Future< + Output = std::result::Result< + Response>, + Status, + >, + > + + 'static + + Send, +) { + let mut resp_stream = resp_stream + .await + .map_err(|e| SinkError::BigQuery(e.into()))? + .into_inner(); + loop { + match resp_stream + .message() + .await + .map_err(|e| SinkError::BigQuery(e.into()))? 
+ { + Some(append_rows_response) => { + if !append_rows_response.row_errors.is_empty() { + return Err(SinkError::BigQuery(anyhow::anyhow!( + "bigquery insert error {:?}", + append_rows_response.row_errors + ))); + } + if let Some(google_cloud_googleapis::cloud::bigquery::storage::v1::append_rows_response::Response::Error(status)) = append_rows_response.response{ + return Err(SinkError::BigQuery(anyhow::anyhow!( + "bigquery insert error {:?}", + status + ))); + } + yield (); + } + None => { + return Err(SinkError::BigQuery(anyhow::anyhow!( + "bigquery insert error: end of resp stream", + ))); + } } - Ok(()) - } - - async fn update_vnode_bitmap(&mut self, _vnode_bitmap: Arc) -> Result<()> { - Ok(()) } } struct StorageWriterClient { - client: StreamingWriteClient, #[expect(dead_code)] environment: Environment, + request_sender: mpsc::UnboundedSender, } impl StorageWriterClient { - pub async fn new(credentials: CredentialsFile) -> Result { + pub async fn new( + credentials: CredentialsFile, + ) -> Result<(Self, impl Stream>)> { let ts_grpc = google_cloud_auth::token::DefaultTokenSourceProvider::new_with_credentials( Self::bigquery_grpc_auth_config(), Box::new(credentials), @@ -683,49 +781,34 @@ impl StorageWriterClient { ) .await .map_err(|e| SinkError::BigQuery(e.into()))?; - let client = conn.conn(); - Ok(StorageWriterClient { - client, - environment, - }) + let mut client = conn.conn(); + + let (tx, rx) = mpsc::unbounded_channel(); + let stream = tokio_stream::wrappers::UnboundedReceiverStream::new(rx); + + let resp = async move { client.append_rows(Request::new(stream)).await }; + let resp_stream = resp_to_stream(resp); + + Ok(( + StorageWriterClient { + environment, + request_sender: tx, + }, + resp_stream, + )) } - pub async fn append_rows( - &mut self, - rows: Vec, - write_stream: String, - ) -> Result<()> { - let mut resp_count = rows.len(); - let append_req: Vec = rows - .into_iter() - .map(|row| AppendRowsRequest { - write_stream: write_stream.clone(), - offset: None, - trace_id: Uuid::new_v4().hyphenated().to_string(), - missing_value_interpretations: HashMap::default(), - rows: Some(row), - }) - .collect(); - let mut resp = self - .client - .append_rows(Request::new(tokio_stream::iter(append_req))) - .await - .map_err(|e| SinkError::BigQuery(e.into()))? - .into_inner(); - while let Some(append_rows_response) = resp - .message() - .await - .map_err(|e| SinkError::BigQuery(e.into()))? 
- { - resp_count -= 1; - if !append_rows_response.row_errors.is_empty() { - return Err(SinkError::BigQuery(anyhow::anyhow!( - "Insert error {:?}", - append_rows_response.row_errors - ))); - } - } - assert_eq!(resp_count,0,"bigquery sink insert error: the number of response inserted is not equal to the number of request"); + pub fn append_rows(&mut self, row: AppendRowsRequestRows, write_stream: String) -> Result<()> { + let append_req = AppendRowsRequest { + write_stream: write_stream.clone(), + offset: None, + trace_id: Uuid::new_v4().hyphenated().to_string(), + missing_value_interpretations: HashMap::default(), + rows: Some(row), + }; + self.request_sender + .send(append_req) + .map_err(|e| SinkError::BigQuery(e.into()))?; Ok(()) } diff --git a/src/connector/src/sink/clickhouse.rs b/src/connector/src/sink/clickhouse.rs index 6b3e78f6a7b9d..07db42790f581 100644 --- a/src/connector/src/sink/clickhouse.rs +++ b/src/connector/src/sink/clickhouse.rs @@ -25,7 +25,6 @@ use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; use risingwave_common::row::Row; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::{DataType, Decimal, ScalarRefImpl, Serial}; use serde::ser::{SerializeSeq, SerializeStruct}; use serde::Serialize; @@ -38,12 +37,10 @@ use with_options::WithOptions; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::writer::SinkWriter; use super::{DummySinkCommitCoordinator, SinkWriterParam}; use crate::error::ConnectorResult; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::{ Result, Sink, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, }; @@ -497,29 +494,6 @@ impl Sink for ClickHouseSink { const SINK_NAME: &'static str = CLICKHOUSE_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: Clickhouse config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { // For upsert clickhouse sink, the primary key must be defined. 
if !self.is_append_only && self.pk_indices.is_empty() { diff --git a/src/connector/src/sink/coordinate.rs b/src/connector/src/sink/coordinate.rs index c069167870101..fcfb8c0877d6b 100644 --- a/src/connector/src/sink/coordinate.rs +++ b/src/connector/src/sink/coordinate.rs @@ -15,10 +15,12 @@ use std::sync::Arc; use anyhow::anyhow; +use futures::FutureExt; use risingwave_common::array::StreamChunk; use risingwave_common::bitmap::Bitmap; use risingwave_pb::connector_service::SinkMetadata; use risingwave_rpc_client::CoordinatorStreamHandle; +use thiserror_ext::AsReport; use tracing::warn; use super::SinkCoordinationRpcClientEnum; @@ -81,6 +83,23 @@ impl>> SinkWriter for Coordi } async fn update_vnode_bitmap(&mut self, vnode_bitmap: Arc) -> Result<()> { + self.coordinator_stream_handle + .update_vnode_bitmap(&vnode_bitmap) + .await?; self.inner.update_vnode_bitmap(vnode_bitmap).await } } + +impl>> Drop for CoordinatedSinkWriter { + fn drop(&mut self) { + match self.coordinator_stream_handle.stop().now_or_never() { + None => { + warn!("unable to send stop due to channel full") + } + Some(Err(e)) => { + warn!(e = ?e.as_report(), "failed to stop the coordinator"); + } + Some(Ok(_)) => {} + } + } +} diff --git a/src/connector/src/sink/decouple_checkpoint_log_sink.rs b/src/connector/src/sink/decouple_checkpoint_log_sink.rs index 4ba57e3adda7a..59e3335eb36db 100644 --- a/src/connector/src/sink/decouple_checkpoint_log_sink.rs +++ b/src/connector/src/sink/decouple_checkpoint_log_sink.rs @@ -20,10 +20,12 @@ use async_trait::async_trait; use crate::sink::log_store::{LogStoreReadItem, TruncateOffset}; use crate::sink::writer::SinkWriter; use crate::sink::{LogSinker, Result, SinkLogReader, SinkMetrics}; -pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL: u64 = 10; +pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE: u64 = 10; +pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE: u64 = 1; +pub const COMMIT_CHECKPOINT_INTERVAL: &str = "commit_checkpoint_interval"; pub fn default_commit_checkpoint_interval() -> u64 { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE } /// The `LogSinker` implementation used for commit-decoupled sinks (such as `Iceberg`, `DeltaLake` and `StarRocks`). @@ -65,7 +67,7 @@ impl> LogSinker for DecoupleCheckpointLogSink EpochBegun { curr_epoch: u64 }, /// Mark that the consumer has just received a barrier - BarrierReceived { prev_epoch: u64 }, + BarrierReceived { prev_epoch: u64, committed: bool }, } let mut state = LogConsumerState::Uninitialized; @@ -75,15 +77,34 @@ impl> LogSinker for DecoupleCheckpointLogSink loop { let (epoch, item): (u64, LogStoreReadItem) = log_reader.next_item().await?; - if let LogStoreReadItem::UpdateVnodeBitmap(_) = &item { - match &state { - LogConsumerState::BarrierReceived { .. 
} => {} + if let LogStoreReadItem::UpdateVnodeBitmap(vnode_bitmap) = &item { + match &mut state { + LogConsumerState::BarrierReceived { + committed, + prev_epoch, + } => { + if !*committed { + // force commit on update vnode bitmap + let start_time = Instant::now(); + sink_writer.barrier(true).await?; + sink_metrics + .sink_commit_duration_metrics + .observe(start_time.elapsed().as_millis() as f64); + log_reader.truncate(TruncateOffset::Barrier { epoch: *prev_epoch })?; + current_checkpoint = 0; + *committed = true; + } + sink_writer + .update_vnode_bitmap(vnode_bitmap.clone()) + .await?; + } _ => unreachable!( "update vnode bitmap can be accepted only right after \ barrier, but current state is {:?}", state ), } + continue; } // begin_epoch when not previously began state = match state { @@ -100,7 +121,7 @@ impl> LogSinker for DecoupleCheckpointLogSink ); LogConsumerState::EpochBegun { curr_epoch: epoch } } - LogConsumerState::BarrierReceived { prev_epoch } => { + LogConsumerState::BarrierReceived { prev_epoch, .. } => { assert!( epoch > prev_epoch, "new epoch {} should be greater than prev epoch {}", @@ -123,7 +144,7 @@ impl> LogSinker for DecoupleCheckpointLogSink LogConsumerState::EpochBegun { curr_epoch } => curr_epoch, _ => unreachable!("epoch must have begun before handling barrier"), }; - if is_checkpoint { + let committed = if is_checkpoint { current_checkpoint += 1; if current_checkpoint >= commit_checkpoint_interval.get() { let start_time = Instant::now(); @@ -133,16 +154,22 @@ impl> LogSinker for DecoupleCheckpointLogSink .observe(start_time.elapsed().as_millis() as f64); log_reader.truncate(TruncateOffset::Barrier { epoch })?; current_checkpoint = 0; + true } else { sink_writer.barrier(false).await?; + false } } else { sink_writer.barrier(false).await?; + false + }; + state = LogConsumerState::BarrierReceived { + prev_epoch, + committed, } - state = LogConsumerState::BarrierReceived { prev_epoch } } - LogStoreReadItem::UpdateVnodeBitmap(vnode_bitmap) => { - sink_writer.update_vnode_bitmap(vnode_bitmap).await?; + LogStoreReadItem::UpdateVnodeBitmap(_) => { + unreachable!("should have been handle earlier") } } } diff --git a/src/connector/src/sink/deltalake.rs b/src/connector/src/sink/deltalake.rs index 2dedffa3469e3..494adb2dd6fed 100644 --- a/src/connector/src/sink/deltalake.rs +++ b/src/connector/src/sink/deltalake.rs @@ -31,7 +31,6 @@ use risingwave_common::array::StreamChunk; use risingwave_common::bail; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::DataType; use risingwave_common::util::iter_util::ZipEqFast; use risingwave_pb::connector_service::sink_metadata::Metadata::Serialized; @@ -41,11 +40,9 @@ use serde_derive::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use with_options::WithOptions; -use super::catalog::desc::SinkDesc; use super::coordinate::CoordinatedSinkWriter; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::writer::SinkWriter; use super::{ @@ -285,29 +282,6 @@ impl Sink for DeltaLakeSink { const SINK_NAME: &'static str = DELTALAKE_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - 
.unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: DeltaLake config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result { let inner = DeltaLakeSinkWriter::new( self.config.clone(), diff --git a/src/connector/src/sink/file_sink/opendal_sink.rs b/src/connector/src/sink/file_sink/opendal_sink.rs index 1fd461015b4ba..65ec46f494345 100644 --- a/src/connector/src/sink/file_sink/opendal_sink.rs +++ b/src/connector/src/sink/file_sink/opendal_sink.rs @@ -97,9 +97,6 @@ impl Sink for FileSink { const SINK_NAME: &'static str = S::SINK_NAME; async fn validate(&self) -> Result<()> { - risingwave_common::license::Feature::FileSink - .check_available() - .map_err(|e| anyhow::anyhow!(e))?; if !self.is_append_only { return Err(SinkError::Config(anyhow!( "File sink only supports append-only mode at present. \ diff --git a/src/connector/src/sink/google_pubsub.rs b/src/connector/src/sink/google_pubsub.rs index ea0e0e4776318..ff9079591a2f5 100644 --- a/src/connector/src/sink/google_pubsub.rs +++ b/src/connector/src/sink/google_pubsub.rs @@ -14,11 +14,7 @@ use std::collections::BTreeMap; -use anyhow::{anyhow, Context}; -use futures::future::try_join_all; -use futures::prelude::future::FutureExt; -use futures::prelude::TryFuture; -use futures::TryFutureExt; +use anyhow::anyhow; use google_cloud_gax::conn::Environment; use google_cloud_googleapis::pubsub::v1::PubsubMessage; use google_cloud_pubsub::apiv1; @@ -26,7 +22,7 @@ use google_cloud_pubsub::client::google_cloud_auth::credentials::CredentialsFile use google_cloud_pubsub::client::google_cloud_auth::project; use google_cloud_pubsub::client::google_cloud_auth::token::DefaultTokenSourceProvider; use google_cloud_pubsub::client::{Client, ClientConfig}; -use google_cloud_pubsub::publisher::{Awaiter, Publisher}; +use google_cloud_pubsub::publisher::Publisher; use risingwave_common::array::StreamChunk; use risingwave_common::catalog::Schema; use serde_derive::Deserialize; @@ -46,19 +42,33 @@ use crate::dispatch_sink_formatter_str_key_impl; pub const PUBSUB_SINK: &str = "google_pubsub"; const PUBSUB_SEND_FUTURE_BUFFER_MAX_SIZE: usize = 65536; -fn may_delivery_future(awaiter: Vec) -> GooglePubSubSinkDeliveryFuture { - try_join_all(awaiter.into_iter().map(|awaiter| { - awaiter.get().map(|result| { - result - .context("Google Pub/Sub sink error") - .map_err(SinkError::GooglePubSub) - .map(|_| ()) - }) - })) - .map_ok(|_: Vec<()>| ()) - .boxed() +mod delivery_future { + use anyhow::Context; + use futures::future::try_join_all; + use futures::{FutureExt, TryFuture, TryFutureExt}; + use google_cloud_pubsub::publisher::Awaiter; + + use crate::sink::SinkError; + + pub type GooglePubSubSinkDeliveryFuture = + impl TryFuture + Unpin + 'static; + + pub(super) fn may_delivery_future(awaiter: Vec) -> GooglePubSubSinkDeliveryFuture { + try_join_all(awaiter.into_iter().map(|awaiter| { + awaiter.get().map(|result| { + result + .context("Google Pub/Sub sink error") + .map_err(SinkError::GooglePubSub) + .map(|_| ()) + }) + })) + .map_ok(|_: Vec<()>| ()) + .boxed() + } } +use delivery_future::*; + #[serde_as] #[derive(Clone, Debug, Deserialize, 
WithOptions)] pub struct GooglePubSubConfig { @@ -172,9 +182,6 @@ struct GooglePubSubPayloadWriter<'w> { add_future: DeliveryFutureManagerAddFuture<'w, GooglePubSubSinkDeliveryFuture>, } -pub type GooglePubSubSinkDeliveryFuture = - impl TryFuture + Unpin + 'static; - impl GooglePubSubSinkWriter { pub async fn new( config: GooglePubSubConfig, diff --git a/src/connector/src/sink/iceberg/jni_catalog.rs b/src/connector/src/sink/iceberg/jni_catalog.rs index b80a6a305870f..6529ea733428d 100644 --- a/src/connector/src/sink/iceberg/jni_catalog.rs +++ b/src/connector/src/sink/iceberg/jni_catalog.rs @@ -288,7 +288,7 @@ impl CatalogV2 for JniCatalog { "Failed to crete iceberg table.", ) .with_source(e) - }) + })? } /// Load table from the catalog. @@ -338,7 +338,7 @@ impl CatalogV2 for JniCatalog { "Failed to load iceberg table.", ) .with_source(e) - }) + })? } /// Drop a table from the catalog. diff --git a/src/connector/src/sink/iceberg/mod.rs b/src/connector/src/sink/iceberg/mod.rs index b68e74b1f5d95..9e87694539f0c 100644 --- a/src/connector/src/sink/iceberg/mod.rs +++ b/src/connector/src/sink/iceberg/mod.rs @@ -43,11 +43,10 @@ use icelake::io_v2::{ DataFileWriterBuilder, EqualityDeltaWriterBuilder, IcebergWriterBuilder, DELETE_OP, INSERT_OP, }; use icelake::transaction::Transaction; -use icelake::types::{data_file_from_json, data_file_to_json, Any, DataFile, COLUMN_ID_META_KEY}; +use icelake::types::{data_file_from_json, data_file_to_json, Any, DataFile}; use icelake::{Table, TableIdentifier}; use itertools::Itertools; -use parquet::arrow::PARQUET_FIELD_ID_META_KEY; -use risingwave_common::array::arrow::IcebergArrowConvert; +use risingwave_common::array::arrow::{IcebergArrowConvert, IcebergCreateTableArrowConvert}; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bail; use risingwave_common::bitmap::Bitmap; @@ -65,10 +64,8 @@ use with_options::WithOptions; use self::mock_catalog::MockCatalog; use self::prometheus::monitored_base_file_writer::MonitoredBaseFileWriterBuilder; use self::prometheus::monitored_position_delete_writer::MonitoredPositionDeleteWriterBuilder; -use super::catalog::desc::SinkDesc; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::{ Sink, SinkError, SinkWriterParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, @@ -76,7 +73,7 @@ use super::{ use crate::error::ConnectorResult; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::writer::SinkWriter; -use crate::sink::{Result, SinkCommitCoordinator, SinkDecouple, SinkParam}; +use crate::sink::{Result, SinkCommitCoordinator, SinkParam}; use crate::{ deserialize_bool_from_string, deserialize_optional_bool_from_string, deserialize_optional_string_seq_from_string, @@ -672,7 +669,7 @@ impl IcebergConfig { .file_io(storage_catalog.file_io().clone()) // Only support readonly table for storage catalog now. .readonly(true) - .build()) + .build()?) 
} _ => self.load_table_v2().await, } @@ -747,30 +744,20 @@ impl IcebergSink { bail!("database name must be set if you want to create table") }; + let iceberg_create_table_arrow_convert = IcebergCreateTableArrowConvert::default(); // convert risingwave schema -> arrow schema -> iceberg schema let arrow_fields = self .param .columns .iter() .map(|column| { - let mut arrow_field = IcebergArrowConvert + Ok(iceberg_create_table_arrow_convert .to_arrow_field(&column.name, &column.data_type) .map_err(|e| SinkError::Iceberg(anyhow!(e))) .context(format!( "failed to convert {}: {} to arrow type", &column.name, &column.data_type - ))?; - let mut metadata = HashMap::new(); - metadata.insert( - PARQUET_FIELD_ID_META_KEY.to_string(), - column.column_id.get_id().to_string(), - ); - metadata.insert( - COLUMN_ID_META_KEY.to_string(), - column.column_id.get_id().to_string(), - ); - arrow_field.set_metadata(metadata); - Ok(arrow_field) + ))?) }) .collect::>>()?; let arrow_schema = arrow_schema_iceberg::Schema::new(arrow_fields); @@ -843,31 +830,6 @@ impl Sink for IcebergSink { const SINK_NAME: &'static str = ICEBERG_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - desc.properties - .get("commit_checkpoint_interval") - .map(|interval| { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - }); - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if let Some(commit_checkpoint_interval) = commit_checkpoint_interval - && commit_checkpoint_interval > 1 - { - return Err(SinkError::Config(anyhow!( - "config conflict: Iceberg config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { if "glue".eq_ignore_ascii_case(self.config.catalog_type()) { risingwave_common::license::Feature::IcebergSinkWithGlue @@ -1375,15 +1337,21 @@ pub fn try_matches_arrow_schema( (ArrowDataType::Decimal128(_, _), ArrowDataType::Decimal128(_, _)) => true, (ArrowDataType::Binary, ArrowDataType::LargeBinary) => true, (ArrowDataType::LargeBinary, ArrowDataType::Binary) => true, - (left, right) => left == right, + // cases where left != right (metadata, field name mismatch) + // + // all nested types: in iceberg `field_id` will always be present, but RW doesn't have it: + // {"PARQUET:field_id": ".."} + // + // map: The standard name in arrow is "entries", "key", "value". 
+ // in iceberg-rs, it's called "key_value" + (left, right) => left.equals_datatype(right), }; if !compatible { - bail!("Field {}'s type not compatible, risingwave converted data type {}, iceberg's data type: {}", + bail!("field {}'s type is incompatible\nRisingWave converted data type: {}\niceberg's data type: {}", arrow_field.name(), converted_arrow_data_type, arrow_field.data_type() ); } } - Ok(()) } @@ -1393,7 +1361,7 @@ mod test { use risingwave_common::catalog::Field; - use crate::sink::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL; + use crate::sink::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE; use crate::sink::iceberg::IcebergConfig; use crate::source::DataType; @@ -1476,7 +1444,7 @@ mod test { .into_iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - commit_checkpoint_interval: DEFAULT_COMMIT_CHECKPOINT_INTERVAL, + commit_checkpoint_interval: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE, create_table_if_not_exists: false, }; diff --git a/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs b/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs index d85d712c41ac3..463b1f3c9dbd4 100644 --- a/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs +++ b/src/connector/src/sink/iceberg/prometheus/monitored_partition_writer.rs @@ -27,7 +27,6 @@ pub struct MonitoredFanoutPartitionedWriterBuilder { } impl MonitoredFanoutPartitionedWriterBuilder { - #[expect(dead_code)] pub fn new( inner: FanoutPartitionedWriterBuilder, partition_num: LabelGuardedIntGauge<2>, diff --git a/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs b/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs index dc44434e5d9c2..aebb5939ff143 100644 --- a/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs +++ b/src/connector/src/sink/iceberg/prometheus/monitored_write_writer.rs @@ -28,7 +28,6 @@ pub struct MonitoredWriteWriterBuilder { impl MonitoredWriteWriterBuilder { /// Create writer context. - #[expect(dead_code)] pub fn new( inner: B, write_qps: LabelGuardedIntCounter<2>, diff --git a/src/connector/src/sink/iceberg/storage_catalog.rs b/src/connector/src/sink/iceberg/storage_catalog.rs index 01adb510882a2..18e2ff0e036ff 100644 --- a/src/connector/src/sink/iceberg/storage_catalog.rs +++ b/src/connector/src/sink/iceberg/storage_catalog.rs @@ -249,11 +249,11 @@ impl Catalog for StorageCatalog { let version_hint_output = self.file_io.new_output(&version_hint_path)?; version_hint_output.write("1".into()).await?; - Ok(Table::builder() + Table::builder() .metadata(table_metadata) .identifier(table_ident) .file_io(self.file_io.clone()) - .build()) + .build() } /// Load table from the catalog. @@ -283,13 +283,13 @@ impl Catalog for StorageCatalog { let metadata_file_content = metadata_file.read().await?; let table_metadata = serde_json::from_slice::(&metadata_file_content)?; - Ok(Table::builder() + Table::builder() .metadata(table_metadata) .identifier(table.clone()) .file_io(self.file_io.clone()) // Only support readonly table for storage catalog now. .readonly(true) - .build()) + .build() } /// Drop a table from the catalog. 
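Note on the `try_matches_arrow_schema` change above: the fallback comparison switches from `==` to `equals_datatype`, which ignores nested field names and field metadata (such as iceberg's `PARQUET:field_id`, or a map entries field named `key_value` instead of arrow's conventional `entries`). The snippet below is a minimal, standalone sketch of that difference; it uses the upstream `arrow-schema` crate directly rather than the `arrow_schema_iceberg` re-export used in this codebase, and the helper name `map_type` and the two variables are purely illustrative.

    use std::sync::Arc;
    use arrow_schema::{DataType, Field, Fields};

    // Build a Map type whose entries struct uses the given field name.
    fn map_type(entries_name: &str) -> DataType {
        let entries = Field::new(
            entries_name,
            DataType::Struct(Fields::from(vec![
                Field::new("key", DataType::Utf8, false),
                Field::new("value", DataType::Int64, true),
            ])),
            false,
        );
        DataType::Map(Arc::new(entries), /* keys_sorted */ false)
    }

    fn main() {
        let rw_side = map_type("entries");        // arrow's conventional entries name
        let iceberg_side = map_type("key_value"); // name seen on the iceberg side per the comment above

        // `==` also compares nested field names and metadata, so these are not equal.
        assert_ne!(rw_side, iceberg_side);

        // `equals_datatype` only compares the logical shape of the two types,
        // which is what the relaxed compatibility check relies on.
        assert!(rw_side.equals_datatype(&iceberg_side));
    }

This is the kind of mismatch the relaxed check is meant to tolerate: types that line up structurally but differ in nested naming or field-id metadata.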
diff --git a/src/connector/src/sink/mod.rs b/src/connector/src/sink/mod.rs index dafbc856207a9..b453af53cca41 100644 --- a/src/connector/src/sink/mod.rs +++ b/src/connector/src/sink/mod.rs @@ -53,6 +53,13 @@ use ::deltalake::DeltaTableError; use ::redis::RedisError; use anyhow::anyhow; use async_trait::async_trait; +use clickhouse::CLICKHOUSE_SINK; +use decouple_checkpoint_log_sink::{ + COMMIT_CHECKPOINT_INTERVAL, DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE, + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE, +}; +use deltalake::DELTALAKE_SINK; +use iceberg::ICEBERG_SINK; use opendal::Error as OpendalError; use risingwave_common::array::ArrayError; use risingwave_common::bitmap::Bitmap; @@ -66,6 +73,7 @@ use risingwave_pb::catalog::PbSinkType; use risingwave_pb::connector_service::{PbSinkParam, SinkMetadata, TableSchema}; use risingwave_rpc_client::error::RpcError; use risingwave_rpc_client::MetaClient; +use starrocks::STARROCKS_SINK; use thiserror::Error; use thiserror_ext::AsReport; pub use tracing; @@ -366,13 +374,54 @@ impl SinkWriterParam { } } +fn is_sink_support_commit_checkpoint_interval(sink_name: &str) -> bool { + matches!( + sink_name, + ICEBERG_SINK | CLICKHOUSE_SINK | STARROCKS_SINK | DELTALAKE_SINK + ) +} pub trait Sink: TryFrom { const SINK_NAME: &'static str; type LogSinker: LogSinker; type Coordinator: SinkCommitCoordinator; + fn set_default_commit_checkpoint_interval( + desc: &mut SinkDesc, + user_specified: &SinkDecouple, + ) -> Result<()> { + if is_sink_support_commit_checkpoint_interval(Self::SINK_NAME) { + match desc.properties.get(COMMIT_CHECKPOINT_INTERVAL) { + Some(commit_checkpoint_interval) => { + let commit_checkpoint_interval = commit_checkpoint_interval + .parse::() + .map_err(|e| SinkError::Config(anyhow!(e)))?; + if matches!(user_specified, SinkDecouple::Disable) + && commit_checkpoint_interval > 1 + { + return Err(SinkError::Config(anyhow!("config conflict: `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled"))); + } + } + None => match user_specified { + SinkDecouple::Default | SinkDecouple::Enable => { + desc.properties.insert( + COMMIT_CHECKPOINT_INTERVAL.to_string(), + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE.to_string(), + ); + } + SinkDecouple::Disable => { + desc.properties.insert( + COMMIT_CHECKPOINT_INTERVAL.to_string(), + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE.to_string(), + ); + } + }, + } + } + Ok(()) + } + /// `user_specified` is the value of `sink_decouple` config. 
- fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(user_specified: &SinkDecouple) -> Result { match user_specified { SinkDecouple::Default | SinkDecouple::Enable => Ok(true), SinkDecouple::Disable => Ok(false), diff --git a/src/connector/src/sink/redis.rs b/src/connector/src/sink/redis.rs index 49207e668e41b..763d7e9bba49a 100644 --- a/src/connector/src/sink/redis.rs +++ b/src/connector/src/sink/redis.rs @@ -288,7 +288,7 @@ impl RedisSinkPayloadWriter { return Ok(()); } } - self.pipe.query(self.conn.as_mut().unwrap()).await?; + self.pipe.query::<()>(self.conn.as_mut().unwrap()).await?; self.pipe.clear(); Ok(()) } diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs index 6fcef5d41b654..aa8ca0625d05f 100644 --- a/src/connector/src/sink/remote.rs +++ b/src/connector/src/sink/remote.rs @@ -23,7 +23,6 @@ use async_trait::async_trait; use await_tree::InstrumentAwait; use futures::future::select; use futures::TryStreamExt; -use itertools::Itertools; use jni::JavaVM; use prost::Message; use risingwave_common::array::StreamChunk; @@ -60,7 +59,6 @@ use tracing::warn; use super::elasticsearch::{is_es_sink, StreamChunkConverter, ES_OPTION_DELIMITER}; use crate::error::ConnectorResult; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::log_store::{LogStoreReadItem, LogStoreResult, TruncateOffset}; use crate::sink::writer::{LogSinkerOf, SinkWriter, SinkWriterExt}; @@ -116,7 +114,7 @@ def_remote_sink!(); pub trait RemoteSinkTrait: Send + Sync + 'static { const SINK_NAME: &'static str; - fn default_sink_decouple(_desc: &SinkDesc) -> bool { + fn default_sink_decouple() -> bool { true } } @@ -144,9 +142,9 @@ impl Sink for RemoteSink { const SINK_NAME: &'static str = R::SINK_NAME; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(user_specified: &SinkDecouple) -> Result { match user_specified { - SinkDecouple::Default => Ok(R::default_sink_decouple(desc)), + SinkDecouple::Default => Ok(R::default_sink_decouple()), SinkDecouple::Enable => Ok(true), SinkDecouple::Disable => Ok(false), } @@ -175,7 +173,7 @@ async fn validate_remote_sink(param: &SinkParam, sink_name: &str) -> ConnectorRe bail!("Es sink only supports single pk or pk with delimiter option"); } // FIXME: support struct and array in stream sink - param.columns.iter().map(|col| { + param.columns.iter().try_for_each(|col| { match &col.data_type { DataType::Int16 | DataType::Int32 @@ -218,7 +216,7 @@ async fn validate_remote_sink(param: &SinkParam, sink_name: &str) -> ConnectorRe "remote sink supports Int16, Int32, Int64, Float32, Float64, Boolean, Decimal, Time, Date, Interval, Jsonb, Timestamp, Timestamptz, Bytea, List and Varchar, (Es sink support Struct) got {:?}: {:?}", col.name, col.data_type, - )))}}).try_collect()?; + )))}})?; let jvm = JVM.get_or_init()?; let sink_param = param.to_proto(); diff --git a/src/connector/src/sink/starrocks.rs b/src/connector/src/sink/starrocks.rs index 21a4fc371b940..5c3e724721d18 100644 --- a/src/connector/src/sink/starrocks.rs +++ b/src/connector/src/sink/starrocks.rs @@ -24,7 +24,6 @@ use mysql_async::Opts; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::DataType; use risingwave_pb::connector_service::sink_metadata::Metadata::Serialized; use 
risingwave_pb::connector_service::sink_metadata::SerializedMetadata; @@ -38,7 +37,7 @@ use tokio::task::JoinHandle; use url::form_urlencoded; use with_options::WithOptions; -use super::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL; +use super::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE; use super::doris_starrocks_connector::{ HeaderBuilder, InserterInner, StarrocksTxnRequestBuilder, STARROCKS_DELETE_SIGN, STARROCKS_SUCCESS_STATUS, @@ -48,7 +47,6 @@ use super::{ SinkCommitCoordinator, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, }; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::decouple_checkpoint_log_sink::DecoupleCheckpointLogSinkerOf; use crate::sink::{Result, Sink, SinkWriter, SinkWriterParam}; @@ -118,7 +116,7 @@ pub struct StarrocksConfig { } fn default_commit_checkpoint_interval() -> u64 { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE } impl StarrocksConfig { @@ -264,29 +262,6 @@ impl Sink for StarrocksSink { const SINK_NAME: &'static str = STARROCKS_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: Starrocks config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { if !self.is_append_only && self.pk_indices.is_empty() { return Err(SinkError::Config(anyhow!( diff --git a/src/connector/src/sink/trivial.rs b/src/connector/src/sink/trivial.rs index 5c5e093c8e0f0..e19f99943338c 100644 --- a/src/connector/src/sink/trivial.rs +++ b/src/connector/src/sink/trivial.rs @@ -17,7 +17,6 @@ use std::marker::PhantomData; use async_trait::async_trait; use risingwave_common::session_config::sink_decouple::SinkDecouple; -use super::catalog::desc::SinkDesc; use crate::sink::log_store::{LogStoreReadItem, TruncateOffset}; use crate::sink::{ DummySinkCommitCoordinator, LogSinker, Result, Sink, SinkError, SinkLogReader, SinkParam, @@ -67,7 +66,7 @@ impl Sink for TrivialSink { const SINK_NAME: &'static str = T::SINK_NAME; // Disable sink decoupling for all trivial sinks because it introduces overhead without any benefit - fn is_sink_decouple(_desc: &SinkDesc, _user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(_user_specified: &SinkDecouple) -> Result { Ok(false) } diff --git a/src/connector/src/source/base.rs b/src/connector/src/source/base.rs index 38c2f25eb0336..d2b5aa1e88b4c 100644 --- a/src/connector/src/source/base.rs +++ b/src/connector/src/source/base.rs @@ -370,6 +370,33 @@ pub trait SplitReader: Sized + Send { ) -> crate::error::ConnectorResult; fn into_stream(self) -> BoxChunkSourceStream; + + fn backfill_info(&self) -> HashMap { + HashMap::new() + } +} + +/// Information used to determine whether we should start and finish source backfill. 
+/// +/// XXX: if a connector cannot provide the latest offsets (but we want to make it shareable), +/// perhaps we should ban blocking DDL for it. +#[derive(Debug, Clone)] +pub enum BackfillInfo { + HasDataToBackfill { + /// The last available offsets for each split (**inclusive**). + /// + /// This will be used to determine whether source backfill is finished when + /// there are no _new_ messages coming from upstream `SourceExecutor`. Otherwise, + /// blocking DDL cannot finish until new messages come. + /// + /// When there are upstream messages, we will use the latest offsets from the upstream. + latest_offset: String, + }, + /// If there are no messages in the split at all, we don't need to start backfill. + /// In this case, there will be no message from the backfill stream too. + /// If we started backfill, we cannot finish it until new messages come. + /// So we mark this a special case for optimization. + NoDataToBackfill, } for_all_sources!(impl_connector_properties); diff --git a/src/connector/src/source/cdc/external/mod.rs b/src/connector/src/source/cdc/external/mod.rs index be1c891b8d078..7a73f9b9bce98 100644 --- a/src/connector/src/source/cdc/external/mod.rs +++ b/src/connector/src/source/cdc/external/mod.rs @@ -237,7 +237,12 @@ pub struct ExternalTableConfig { /// Choices include `disabled`, `preferred`, and `required`. /// This field is optional. #[serde(rename = "ssl.mode", default = "Default::default")] - pub sslmode: SslMode, + #[serde(alias = "debezium.database.sslmode")] + pub ssl_mode: SslMode, + + #[serde(rename = "ssl.root.cert")] + #[serde(alias = "debezium.database.sslrootcert")] + pub ssl_root_cert: Option, } impl ExternalTableConfig { @@ -253,7 +258,7 @@ impl ExternalTableConfig { } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, PartialEq, Deserialize)] #[serde(rename_all = "lowercase")] pub enum SslMode { #[serde(alias = "disable")] @@ -262,6 +267,14 @@ pub enum SslMode { Preferred, #[serde(alias = "require")] Required, + /// verify that the server is trustworthy by checking the certificate chain + /// up to the root certificate stored on the client. + #[serde(alias = "verify-ca")] + VerifyCa, + /// Besides verify the certificate, will also verify that the serverhost name + /// matches the name stored in the server certificate. 
+ #[serde(alias = "verify-full")] + VerifyFull, } impl Default for SslMode { @@ -277,6 +290,8 @@ impl fmt::Display for SslMode { SslMode::Disabled => "disabled", SslMode::Preferred => "preferred", SslMode::Required => "required", + SslMode::VerifyCa => "verify-ca", + SslMode::VerifyFull => "verify-full", }) } } diff --git a/src/connector/src/source/cdc/external/mysql.rs b/src/connector/src/source/cdc/external/mysql.rs index 0e7ec02cfac27..59971f8761068 100644 --- a/src/connector/src/source/cdc/external/mysql.rs +++ b/src/connector/src/source/cdc/external/mysql.rs @@ -85,9 +85,12 @@ impl MySqlExternalTable { .host(&config.host) .port(config.port.parse::().unwrap()) .database(&config.database) - .ssl_mode(match config.sslmode { + .ssl_mode(match config.ssl_mode { SslMode::Disabled | SslMode::Preferred => sqlx::mysql::MySqlSslMode::Disabled, SslMode::Required => sqlx::mysql::MySqlSslMode::Required, + _ => { + return Err(anyhow!("unsupported SSL mode").into()); + } }); let connection = MySqlPool::connect_with(options).await?; @@ -308,9 +311,10 @@ impl MySqlExternalTableReader { .tcp_port(config.port.parse::().unwrap()) .db_name(Some(config.database)); - opts_builder = match config.sslmode { + opts_builder = match config.ssl_mode { SslMode::Disabled | SslMode::Preferred => opts_builder.ssl_opts(None), - SslMode::Required => { + // verify-ca and verify-full are same as required for mysql now + SslMode::Required | SslMode::VerifyCa | SslMode::VerifyFull => { let ssl_without_verify = mysql_async::SslOpts::default() .with_danger_accept_invalid_certs(true) .with_danger_skip_domain_validation(true); @@ -529,7 +533,8 @@ mod tests { database: "mydb".to_string(), schema: "".to_string(), table: "part".to_string(), - sslmode: Default::default(), + ssl_mode: Default::default(), + ssl_root_cert: None, }; let table = MySqlExternalTable::connect(config).await.unwrap(); diff --git a/src/connector/src/source/cdc/external/postgres.rs b/src/connector/src/source/cdc/external/postgres.rs index ca0caf46d6125..9123c7451b74e 100644 --- a/src/connector/src/source/cdc/external/postgres.rs +++ b/src/connector/src/source/cdc/external/postgres.rs @@ -86,18 +86,26 @@ pub struct PostgresExternalTable { impl PostgresExternalTable { pub async fn connect(config: ExternalTableConfig) -> ConnectorResult { tracing::debug!("connect to postgres external table"); - let options = PgConnectOptions::new() + let mut options = PgConnectOptions::new() .username(&config.username) .password(&config.password) .host(&config.host) .port(config.port.parse::().unwrap()) .database(&config.database) - .ssl_mode(match config.sslmode { + .ssl_mode(match config.ssl_mode { SslMode::Disabled => PgSslMode::Disable, SslMode::Preferred => PgSslMode::Prefer, SslMode::Required => PgSslMode::Require, + SslMode::VerifyCa => PgSslMode::VerifyCa, + SslMode::VerifyFull => PgSslMode::VerifyFull, }); + if config.ssl_mode == SslMode::VerifyCa || config.ssl_mode == SslMode::VerifyFull { + if let Some(ref root_cert) = config.ssl_root_cert { + options = options.ssl_root_cert(root_cert.as_str()); + } + } + let connection = PgPool::connect_with(options).await?; let schema_discovery = SchemaDiscovery::new(connection, config.schema.as_str()); // fetch column schema and primary key @@ -288,8 +296,14 @@ impl PostgresExternalTableReader { .port(config.port.parse::().unwrap()) .dbname(&config.database); + let (_verify_ca, verify_hostname) = match config.ssl_mode { + SslMode::VerifyCa => (true, false), + SslMode::VerifyFull => (true, true), + _ => (false, false), + }; + 
#[cfg(not(madsim))] - let connector = match config.sslmode { + let connector = match config.ssl_mode { SslMode::Disabled => { pg_config.ssl_mode(tokio_postgres::config::SslMode::Disable); MaybeMakeTlsConnector::NoTls(NoTls) @@ -315,6 +329,24 @@ impl PostgresExternalTableReader { builder.set_verify(SslVerifyMode::NONE); MaybeMakeTlsConnector::Tls(MakeTlsConnector::new(builder.build())) } + + SslMode::VerifyCa | SslMode::VerifyFull => { + pg_config.ssl_mode(tokio_postgres::config::SslMode::Require); + let mut builder = SslConnector::builder(SslMethod::tls())?; + if let Some(ssl_root_cert) = config.ssl_root_cert { + builder.set_ca_file(ssl_root_cert).map_err(|e| { + anyhow!(format!("bad ssl root cert error: {}", e.to_report_string())) + })?; + } + let mut connector = MakeTlsConnector::new(builder.build()); + if !verify_hostname { + connector.set_callback(|config, _| { + config.set_verify_hostname(false); + Ok(()) + }); + } + MaybeMakeTlsConnector::Tls(connector) + } }; #[cfg(madsim)] let connector = NoTls; @@ -482,7 +514,8 @@ mod tests { database: "mydb".to_string(), schema: "public".to_string(), table: "mytest".to_string(), - sslmode: Default::default(), + ssl_mode: Default::default(), + ssl_root_cert: None, }; let table = PostgresExternalTable::connect(config).await.unwrap(); diff --git a/src/connector/src/source/filesystem/opendal_source/azblob_source.rs b/src/connector/src/source/filesystem/opendal_source/azblob_source.rs index 2ee050f21f812..8c6dac01ab87b 100644 --- a/src/connector/src/source/filesystem/opendal_source/azblob_source.rs +++ b/src/connector/src/source/filesystem/opendal_source/azblob_source.rs @@ -66,7 +66,6 @@ impl OpendalEnumerator { }; let compression_format = azblob_properties.compression_format; - Ok(Self { op, prefix, diff --git a/src/connector/src/source/filesystem/opendal_source/gcs_source.rs b/src/connector/src/source/filesystem/opendal_source/gcs_source.rs index 768f19fc36722..9a6d883f3c922 100644 --- a/src/connector/src/source/filesystem/opendal_source/gcs_source.rs +++ b/src/connector/src/source/filesystem/opendal_source/gcs_source.rs @@ -60,7 +60,6 @@ impl OpendalEnumerator { }; let compression_format = gcs_properties.compression_format; - Ok(Self { op, prefix, diff --git a/src/connector/src/source/filesystem/opendal_source/mod.rs b/src/connector/src/source/filesystem/opendal_source/mod.rs index cbb3c2a9c7b85..cea4972def92c 100644 --- a/src/connector/src/source/filesystem/opendal_source/mod.rs +++ b/src/connector/src/source/filesystem/opendal_source/mod.rs @@ -47,6 +47,10 @@ pub struct FsSourceCommon { #[serde(rename = "refresh.interval.sec")] #[serde_as(as = "Option")] pub refresh_interval_sec: Option, + + #[serde(rename = "recursive_scan", default)] + #[serde_as(as = "Option")] + pub recursive_scan: Option, } #[derive(Clone, Debug, Deserialize, PartialEq, WithOptions)] diff --git a/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs b/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs index 7396eac2ea38e..a9cb4b6c3f7f0 100644 --- a/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs +++ b/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs @@ -66,13 +66,13 @@ impl SplitEnumerator for OpendalEnumerator { } impl OpendalEnumerator { - pub async fn list(&self) -> ConnectorResult { + pub async fn list(&self, recursive_scan: bool) -> ConnectorResult { let prefix = self.prefix.as_deref().unwrap_or("/"); let object_lister = self .op .lister_with(prefix) - .recursive(false) + 
.recursive(recursive_scan) .metakey(Metakey::ContentLength | Metakey::LastModified) .await?; let stream = stream::unfold(object_lister, |mut object_lister| async move { diff --git a/src/connector/src/source/iceberg/mod.rs b/src/connector/src/source/iceberg/mod.rs index f101ff9ed6d4b..845ffb66804d3 100644 --- a/src/connector/src/source/iceberg/mod.rs +++ b/src/connector/src/source/iceberg/mod.rs @@ -21,6 +21,7 @@ use async_trait::async_trait; use futures_async_stream::for_await; use iceberg::scan::FileScanTask; use iceberg::spec::TableMetadata; +use iceberg::table::Table; use itertools::Itertools; pub use parquet_file_reader::*; use risingwave_common::bail; @@ -28,7 +29,7 @@ use risingwave_common::catalog::Schema; use risingwave_common::types::JsonbVal; use serde::{Deserialize, Serialize}; -use crate::error::ConnectorResult; +use crate::error::{ConnectorError, ConnectorResult}; use crate::parser::ParserConfig; use crate::sink::iceberg::IcebergConfig; use crate::source::{ @@ -144,6 +145,7 @@ pub struct IcebergSplit { pub snapshot_id: i64, pub table_meta: TableMetadataJsonStr, pub files: Vec, + pub eq_delete_files: Vec, } impl SplitMetaData for IcebergSplit { @@ -206,6 +208,19 @@ impl IcebergSplitEnumerator { bail!("Batch parallelism is 0. Cannot split the iceberg files."); } let table = self.config.load_table_v2().await?; + + let current_snapshot = table.metadata().current_snapshot(); + if current_snapshot.is_none() { + // If there is no snapshot, we will return a mock `IcebergSplit` with empty files. + return Ok(vec![IcebergSplit { + split_id: 0, + snapshot_id: 0, // unused + table_meta: TableMetadataJsonStr::serialize(table.metadata()), + files: vec![], + eq_delete_files: vec![], + }]); + } + let snapshot_id = match time_traval_info { Some(IcebergTimeTravelInfo::Version(version)) => { let Some(snapshot) = table.metadata().snapshot_by_id(version) else { @@ -217,10 +232,13 @@ impl IcebergSplitEnumerator { let snapshot = table .metadata() .snapshots() - .filter(|snapshot| snapshot.timestamp().timestamp_millis() <= timestamp) - .max_by_key(|snapshot| snapshot.timestamp().timestamp_millis()); + .map(|snapshot| snapshot.timestamp().map(|ts| ts.timestamp_millis())) + .collect::, _>>()? 
+ .into_iter() + .filter(|&snapshot_millis| snapshot_millis <= timestamp) + .max_by_key(|&snapshot_millis| snapshot_millis); match snapshot { - Some(snapshot) => snapshot.snapshot_id(), + Some(snapshot) => snapshot, None => { // convert unix time to human readable time let time = chrono::DateTime::from_timestamp_millis(timestamp); @@ -232,17 +250,20 @@ impl IcebergSplitEnumerator { } } } - None => match table.metadata().current_snapshot() { - Some(snapshot) => snapshot.snapshot_id(), - None => bail!("Cannot find the current snapshot id in the iceberg table."), - }, + None => { + assert!(current_snapshot.is_some()); + current_snapshot.unwrap().snapshot_id() + } }; - let mut files = vec![]; + let require_names = Self::get_require_field_names(&table, snapshot_id, schema).await?; + + let mut data_files = vec![]; + let mut eq_delete_files = vec![]; let scan = table .scan() .snapshot_id(snapshot_id) - .select(schema.names()) + .select(require_names) .build() .map_err(|e| anyhow!(e))?; @@ -250,16 +271,27 @@ impl IcebergSplitEnumerator { #[for_await] for task in file_scan_stream { - let task = task.map_err(|e| anyhow!(e))?; - files.push(IcebergFileScanTaskJsonStr::serialize(&task)); + let mut task: FileScanTask = task.map_err(|e| anyhow!(e))?; + match task.data_file_content { + iceberg::spec::DataContentType::Data => { + data_files.push(IcebergFileScanTaskJsonStr::serialize(&task)); + } + iceberg::spec::DataContentType::EqualityDeletes => { + task.project_field_ids = task.equality_ids.clone(); + eq_delete_files.push(IcebergFileScanTaskJsonStr::serialize(&task)); + } + iceberg::spec::DataContentType::PositionDeletes => { + bail!("Position delete file is not supported") + } + } } let table_meta = TableMetadataJsonStr::serialize(table.metadata()); let split_num = batch_parallelism; // evenly split the files into splits based on the parallelism. 
- let split_size = files.len() / split_num; - let remaining = files.len() % split_num; + let split_size = data_files.len() / split_num; + let remaining = data_files.len() % split_num; let mut splits = vec![]; for i in 0..split_num { let start = i * split_size; @@ -268,20 +300,62 @@ impl IcebergSplitEnumerator { split_id: i as i64, snapshot_id, table_meta: table_meta.clone(), - files: files[start..end].to_vec(), + files: data_files[start..end].to_vec(), + eq_delete_files: eq_delete_files.clone(), }; splits.push(split); } for i in 0..remaining { splits[i] .files - .push(files[split_num * split_size + i].clone()); + .push(data_files[split_num * split_size + i].clone()); } Ok(splits .into_iter() .filter(|split| !split.files.is_empty()) .collect_vec()) } + + async fn get_require_field_names( + table: &Table, + snapshot_id: i64, + rw_schema: Schema, + ) -> ConnectorResult> { + let scan = table + .scan() + .snapshot_id(snapshot_id) + .build() + .map_err(|e| anyhow!(e))?; + let file_scan_stream = scan.plan_files().await.map_err(|e| anyhow!(e))?; + let schema = scan.snapshot().schema(table.metadata())?; + let mut equality_ids = vec![]; + #[for_await] + for task in file_scan_stream { + let task: FileScanTask = task.map_err(|e| anyhow!(e))?; + if task.data_file_content == iceberg::spec::DataContentType::EqualityDeletes { + if equality_ids.is_empty() { + equality_ids = task.equality_ids; + } else if equality_ids != task.equality_ids { + bail!("The schema of iceberg equality delete file must be consistent"); + } + } + } + let delete_columns = equality_ids + .into_iter() + .map(|id| match schema.name_by_field_id(id) { + Some(name) => Ok::(name.to_string()), + None => bail!("Delete field id {} not found in schema", id), + }) + .collect::>>()?; + let mut require_field_names: Vec<_> = rw_schema.names().to_vec(); + // Add the delete columns to the required field names + for names in delete_columns { + if !require_field_names.contains(&names) { + require_field_names.push(names); + } + } + Ok(require_field_names) + } } #[derive(Debug)] diff --git a/src/connector/src/source/kafka/enumerator/client.rs b/src/connector/src/source/kafka/enumerator/client.rs index ff007076c1338..5551c12b433b3 100644 --- a/src/connector/src/source/kafka/enumerator/client.rs +++ b/src/connector/src/source/kafka/enumerator/client.rs @@ -170,6 +170,7 @@ impl KafkaSplitEnumerator { self.report_high_watermark(*partition, high); map.insert(*partition, (low, high)); } + tracing::debug!("fetch kafka watermarks: {map:?}"); Ok(map) } diff --git a/src/connector/src/source/kafka/source/reader.rs b/src/connector/src/source/kafka/source/reader.rs index 5ace1820b4249..72d4c36377c81 100644 --- a/src/connector/src/source/kafka/source/reader.rs +++ b/src/connector/src/source/kafka/source/reader.rs @@ -34,13 +34,14 @@ use crate::source::kafka::{ KafkaContextCommon, KafkaProperties, KafkaSplit, RwConsumerContext, KAFKA_ISOLATION_LEVEL, }; use crate::source::{ - into_chunk_stream, BoxChunkSourceStream, Column, SourceContextRef, SplitId, SplitMetaData, - SplitReader, + into_chunk_stream, BackfillInfo, BoxChunkSourceStream, Column, SourceContextRef, SplitId, + SplitMetaData, SplitReader, }; pub struct KafkaSplitReader { consumer: StreamConsumer, offsets: HashMap, Option)>, + backfill_info: HashMap, bytes_per_second: usize, max_num_messages: usize, parser_config: ParserConfig, @@ -106,7 +107,7 @@ impl SplitReader for KafkaSplitReader { let mut tpl = TopicPartitionList::with_capacity(splits.len()); let mut offsets = HashMap::new(); - + let mut backfill_info 
= HashMap::new(); for split in splits { offsets.insert(split.id(), (split.start_offset, split.stop_offset)); @@ -121,7 +122,29 @@ impl SplitReader for KafkaSplitReader { } else { tpl.add_partition(split.topic.as_str(), split.partition); } + + let (low, high) = consumer + .fetch_watermarks( + split.topic.as_str(), + split.partition, + properties.common.sync_call_timeout, + ) + .await?; + tracing::debug!("fetch kafka watermarks: low: {low}, high: {high}, split: {split:?}"); + // note: low is inclusive, high is exclusive + if low == high { + backfill_info.insert(split.id(), BackfillInfo::NoDataToBackfill); + } else { + debug_assert!(high > 0); + backfill_info.insert( + split.id(), + BackfillInfo::HasDataToBackfill { + latest_offset: (high - 1).to_string(), + }, + ); + } } + tracing::debug!("backfill_info: {:?}", backfill_info); consumer.assign(&tpl)?; @@ -143,6 +166,7 @@ impl SplitReader for KafkaSplitReader { Ok(Self { consumer, offsets, + backfill_info, bytes_per_second, max_num_messages, parser_config, @@ -155,6 +179,10 @@ impl SplitReader for KafkaSplitReader { let source_context = self.source_ctx.clone(); into_chunk_stream(self.into_data_stream(), parser_config, source_context) } + + fn backfill_info(&self) -> HashMap { + self.backfill_info.clone() + } } impl KafkaSplitReader { diff --git a/src/connector/src/source/kafka/stats.rs b/src/connector/src/source/kafka/stats.rs index 679f5c24bd2a1..7a36c4d1fffea 100644 --- a/src/connector/src/source/kafka/stats.rs +++ b/src/connector/src/source/kafka/stats.rs @@ -12,34 +12,37 @@ // See the License for the specific language governing permissions and // limitations under the License. -use prometheus::core::{AtomicU64, GenericGaugeVec}; -use prometheus::{register_int_gauge_vec_with_registry, IntGaugeVec, Registry}; +use prometheus::core::AtomicU64; +use prometheus::Registry; use rdkafka::statistics::{Broker, ConsumerGroup, Partition, Topic, Window}; use rdkafka::Statistics; -use risingwave_common::metrics::register_uint_gauge_vec_with_registry; +use risingwave_common::metrics::{LabelGuardedIntGaugeVec, LabelGuardedUintGaugeVec}; +use risingwave_common::{ + register_guarded_int_gauge_vec_with_registry, register_guarded_uint_gauge_vec_with_registry, +}; #[derive(Debug, Clone)] pub struct RdKafkaStats { pub registry: Registry, - pub ts: IntGaugeVec, - pub time: IntGaugeVec, - pub age: IntGaugeVec, - pub replyq: IntGaugeVec, - pub msg_cnt: GenericGaugeVec, - pub msg_size: GenericGaugeVec, - pub msg_max: GenericGaugeVec, - pub msg_size_max: GenericGaugeVec, - pub tx: IntGaugeVec, - pub tx_bytes: IntGaugeVec, - pub rx: IntGaugeVec, - pub rx_bytes: IntGaugeVec, - pub tx_msgs: IntGaugeVec, - pub tx_msgs_bytes: IntGaugeVec, - pub rx_msgs: IntGaugeVec, - pub rx_msgs_bytes: IntGaugeVec, - pub simple_cnt: IntGaugeVec, - pub metadata_cache_cnt: IntGaugeVec, + pub ts: LabelGuardedIntGaugeVec<2>, + pub time: LabelGuardedIntGaugeVec<2>, + pub age: LabelGuardedIntGaugeVec<2>, + pub replyq: LabelGuardedIntGaugeVec<2>, + pub msg_cnt: LabelGuardedUintGaugeVec<2>, + pub msg_size: LabelGuardedUintGaugeVec<2>, + pub msg_max: LabelGuardedUintGaugeVec<2>, + pub msg_size_max: LabelGuardedUintGaugeVec<2>, + pub tx: LabelGuardedIntGaugeVec<2>, + pub tx_bytes: LabelGuardedIntGaugeVec<2>, + pub rx: LabelGuardedIntGaugeVec<2>, + pub rx_bytes: LabelGuardedIntGaugeVec<2>, + pub tx_msgs: LabelGuardedIntGaugeVec<2>, + pub tx_msgs_bytes: LabelGuardedIntGaugeVec<2>, + pub rx_msgs: LabelGuardedIntGaugeVec<2>, + pub rx_msgs_bytes: LabelGuardedIntGaugeVec<2>, + pub simple_cnt: 
LabelGuardedIntGaugeVec<2>, + pub metadata_cache_cnt: LabelGuardedIntGaugeVec<2>, pub broker_stats: BrokerStats, pub topic_stats: TopicStats, @@ -50,29 +53,29 @@ pub struct RdKafkaStats { pub struct BrokerStats { pub registry: Registry, - pub state_age: IntGaugeVec, - pub outbuf_cnt: IntGaugeVec, - pub outbuf_msg_cnt: IntGaugeVec, - pub waitresp_cnt: IntGaugeVec, - pub waitresp_msg_cnt: IntGaugeVec, - pub tx: GenericGaugeVec, - pub tx_bytes: GenericGaugeVec, - pub tx_errs: GenericGaugeVec, - pub tx_retries: GenericGaugeVec, - pub tx_idle: IntGaugeVec, - pub req_timeouts: GenericGaugeVec, - pub rx: GenericGaugeVec, - pub rx_bytes: GenericGaugeVec, - pub rx_errs: GenericGaugeVec, - pub rx_corriderrs: GenericGaugeVec, - pub rx_partial: GenericGaugeVec, - pub rx_idle: IntGaugeVec, - pub req: IntGaugeVec, - pub zbuf_grow: GenericGaugeVec, - pub buf_grow: GenericGaugeVec, - pub wakeups: GenericGaugeVec, - pub connects: IntGaugeVec, - pub disconnects: IntGaugeVec, + pub state_age: LabelGuardedIntGaugeVec<4>, + pub outbuf_cnt: LabelGuardedIntGaugeVec<4>, + pub outbuf_msg_cnt: LabelGuardedIntGaugeVec<4>, + pub waitresp_cnt: LabelGuardedIntGaugeVec<4>, + pub waitresp_msg_cnt: LabelGuardedIntGaugeVec<4>, + pub tx: LabelGuardedUintGaugeVec<4>, + pub tx_bytes: LabelGuardedUintGaugeVec<4>, + pub tx_errs: LabelGuardedUintGaugeVec<4>, + pub tx_retries: LabelGuardedUintGaugeVec<4>, + pub tx_idle: LabelGuardedIntGaugeVec<4>, + pub req_timeouts: LabelGuardedUintGaugeVec<4>, + pub rx: LabelGuardedUintGaugeVec<4>, + pub rx_bytes: LabelGuardedUintGaugeVec<4>, + pub rx_errs: LabelGuardedUintGaugeVec<4>, + pub rx_corriderrs: LabelGuardedUintGaugeVec<4>, + pub rx_partial: LabelGuardedUintGaugeVec<4>, + pub rx_idle: LabelGuardedIntGaugeVec<4>, + pub req: LabelGuardedIntGaugeVec<5>, + pub zbuf_grow: LabelGuardedUintGaugeVec<4>, + pub buf_grow: LabelGuardedUintGaugeVec<4>, + pub wakeups: LabelGuardedUintGaugeVec<4>, + pub connects: LabelGuardedIntGaugeVec<4>, + pub disconnects: LabelGuardedIntGaugeVec<4>, pub int_latency: StatsWindow, pub outbuf_latency: StatsWindow, pub rtt: StatsWindow, @@ -83,7 +86,7 @@ pub struct BrokerStats { pub struct TopicStats { pub registry: Registry, - pub metadata_age: IntGaugeVec, + pub metadata_age: LabelGuardedIntGaugeVec<3>, pub batch_size: StatsWindow, pub batch_cnt: StatsWindow, pub partitions: PartitionStats, @@ -93,58 +96,58 @@ pub struct TopicStats { pub struct StatsWindow { pub registry: Registry, - pub min: IntGaugeVec, - pub max: IntGaugeVec, - pub avg: IntGaugeVec, - pub sum: IntGaugeVec, - pub cnt: IntGaugeVec, - pub stddev: IntGaugeVec, - pub hdr_size: IntGaugeVec, - pub p50: IntGaugeVec, - pub p75: IntGaugeVec, - pub p90: IntGaugeVec, - pub p95: IntGaugeVec, - pub p99: IntGaugeVec, - pub p99_99: IntGaugeVec, - pub out_of_range: IntGaugeVec, + pub min: LabelGuardedIntGaugeVec<4>, + pub max: LabelGuardedIntGaugeVec<4>, + pub avg: LabelGuardedIntGaugeVec<4>, + pub sum: LabelGuardedIntGaugeVec<4>, + pub cnt: LabelGuardedIntGaugeVec<4>, + pub stddev: LabelGuardedIntGaugeVec<4>, + pub hdr_size: LabelGuardedIntGaugeVec<4>, + pub p50: LabelGuardedIntGaugeVec<4>, + pub p75: LabelGuardedIntGaugeVec<4>, + pub p90: LabelGuardedIntGaugeVec<4>, + pub p95: LabelGuardedIntGaugeVec<4>, + pub p99: LabelGuardedIntGaugeVec<4>, + pub p99_99: LabelGuardedIntGaugeVec<4>, + pub out_of_range: LabelGuardedIntGaugeVec<4>, } #[derive(Debug, Clone)] pub struct ConsumerGroupStats { pub registry: Registry, - pub state_age: IntGaugeVec, + pub state_age: LabelGuardedIntGaugeVec<3>, // todo: (do not know 
value set) join_state: IntGaugeVec, - pub rebalance_age: IntGaugeVec, - pub rebalance_cnt: IntGaugeVec, + pub rebalance_age: LabelGuardedIntGaugeVec<3>, + pub rebalance_cnt: LabelGuardedIntGaugeVec<3>, // todo: (cannot handle string) rebalance_reason, - pub assignment_size: IntGaugeVec, + pub assignment_size: LabelGuardedIntGaugeVec<3>, } impl ConsumerGroupStats { pub fn new(registry: Registry) -> Self { - let state_age = register_int_gauge_vec_with_registry!( + let state_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_state_age", "Age of the consumer group state in seconds", &["id", "client_id", "state"], registry ) .unwrap(); - let rebalance_age = register_int_gauge_vec_with_registry!( + let rebalance_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_rebalance_age", "Age of the last rebalance in seconds", &["id", "client_id", "state"], registry ) .unwrap(); - let rebalance_cnt = register_int_gauge_vec_with_registry!( + let rebalance_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_rebalance_cnt", "Number of rebalances", &["id", "client_id", "state"], registry ) .unwrap(); - let assignment_size = register_int_gauge_vec_with_registry!( + let assignment_size = register_guarded_int_gauge_vec_with_registry!( "rdkafka_consumer_group_assignment_size", "Number of assigned partitions", &["id", "client_id", "state"], @@ -164,16 +167,16 @@ impl ConsumerGroupStats { pub fn report(&self, id: &str, client_id: &str, stats: &ConsumerGroup) { let state = stats.state.as_str(); self.state_age - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.stateage); self.rebalance_age - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.rebalance_age); self.rebalance_cnt - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.rebalance_cnt); self.assignment_size - .with_label_values(&[id, client_id, state]) + .with_guarded_label_values(&[id, client_id, state]) .set(stats.assignment_size as i64); } } @@ -181,98 +184,98 @@ impl ConsumerGroupStats { impl StatsWindow { pub fn new(registry: Registry, path: &str) -> Self { let get_metric_name = |name: &str| format!("rdkafka_{}_{}", path, name); - let min = register_int_gauge_vec_with_registry!( + let min = register_guarded_int_gauge_vec_with_registry!( get_metric_name("min"), "Minimum value", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let max = register_int_gauge_vec_with_registry!( + let max = register_guarded_int_gauge_vec_with_registry!( get_metric_name("max"), "Maximum value", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let avg = register_int_gauge_vec_with_registry!( + let avg = register_guarded_int_gauge_vec_with_registry!( get_metric_name("avg"), "Average value", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let sum = register_int_gauge_vec_with_registry!( + let sum = register_guarded_int_gauge_vec_with_registry!( get_metric_name("sum"), "Sum of values", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let cnt = register_int_gauge_vec_with_registry!( + let cnt = register_guarded_int_gauge_vec_with_registry!( get_metric_name("cnt"), "Count of values", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let stddev = register_int_gauge_vec_with_registry!( + let stddev = register_guarded_int_gauge_vec_with_registry!( 
get_metric_name("stddev"), "Standard deviation", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let hdr_size = register_int_gauge_vec_with_registry!( + let hdr_size = register_guarded_int_gauge_vec_with_registry!( get_metric_name("hdrsize"), "Size of the histogram header", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p50 = register_int_gauge_vec_with_registry!( + let p50 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p50"), "50th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p75 = register_int_gauge_vec_with_registry!( + let p75 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p75"), "75th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p90 = register_int_gauge_vec_with_registry!( + let p90 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p90"), "90th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p95 = register_int_gauge_vec_with_registry!( + let p95 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p95"), "95th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p99 = register_int_gauge_vec_with_registry!( + let p99 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p99"), "99th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let p99_99 = register_int_gauge_vec_with_registry!( + let p99_99 = register_guarded_int_gauge_vec_with_registry!( get_metric_name("p99_99"), "99.99th percentile", &["id", "client_id", "broker", "topic"], registry ) .unwrap(); - let out_of_range = register_int_gauge_vec_with_registry!( + let out_of_range = register_guarded_int_gauge_vec_with_registry!( get_metric_name("out_of_range"), "Out of range values", &["id", "client_id", "broker", "topic"], @@ -302,26 +305,32 @@ impl StatsWindow { pub fn report(&self, id: &str, client_id: &str, broker: &str, topic: &str, stats: &Window) { let labels = [id, client_id, broker, topic]; - self.min.with_label_values(&labels).set(stats.min); - self.max.with_label_values(&labels).set(stats.max); - self.avg.with_label_values(&labels).set(stats.avg); - self.sum.with_label_values(&labels).set(stats.sum); - self.cnt.with_label_values(&labels).set(stats.cnt); - self.stddev.with_label_values(&labels).set(stats.stddev); - self.hdr_size.with_label_values(&labels).set(stats.hdrsize); - self.p50.with_label_values(&labels).set(stats.p50); - self.p75.with_label_values(&labels).set(stats.p75); - self.p90.with_label_values(&labels).set(stats.p90); - self.p99_99.with_label_values(&labels).set(stats.p99_99); + self.min.with_guarded_label_values(&labels).set(stats.min); + self.max.with_guarded_label_values(&labels).set(stats.max); + self.avg.with_guarded_label_values(&labels).set(stats.avg); + self.sum.with_guarded_label_values(&labels).set(stats.sum); + self.cnt.with_guarded_label_values(&labels).set(stats.cnt); + self.stddev + .with_guarded_label_values(&labels) + .set(stats.stddev); + self.hdr_size + .with_guarded_label_values(&labels) + .set(stats.hdrsize); + self.p50.with_guarded_label_values(&labels).set(stats.p50); + self.p75.with_guarded_label_values(&labels).set(stats.p75); + self.p90.with_guarded_label_values(&labels).set(stats.p90); + self.p99_99 + .with_guarded_label_values(&labels) + .set(stats.p99_99); self.out_of_range - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.outofrange); } } impl TopicStats { pub fn 
new(registry: Registry) -> Self { - let metadata_age = register_int_gauge_vec_with_registry!( + let metadata_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_metadata_age", "Age of the topic metadata in milliseconds", &["id", "client_id", "topic"], @@ -348,7 +357,7 @@ impl TopicStats { fn report_inner(&self, id: &str, client_id: &str, topic: &str, stats: &Topic) { self.metadata_age - .with_label_values(&[id, client_id, topic]) + .with_guarded_label_values(&[id, client_id, topic]) .set(stats.metadata_age); self.batch_size .report(id, client_id, "", topic, &stats.batchsize); @@ -362,212 +371,212 @@ impl TopicStats { pub struct PartitionStats { pub registry: Registry, - pub msgq_cnt: IntGaugeVec, - pub msgq_bytes: GenericGaugeVec, - pub xmit_msgq_cnt: IntGaugeVec, - pub xmit_msgq_bytes: GenericGaugeVec, - pub fetchq_cnt: IntGaugeVec, - pub fetchq_size: GenericGaugeVec, - pub query_offset: IntGaugeVec, - pub next_offset: IntGaugeVec, - pub app_offset: IntGaugeVec, - pub stored_offset: IntGaugeVec, - pub committed_offset: IntGaugeVec, - pub eof_offset: IntGaugeVec, - pub lo_offset: IntGaugeVec, - pub hi_offset: IntGaugeVec, - pub consumer_lag: IntGaugeVec, - pub consumer_lag_store: IntGaugeVec, - pub txmsgs: GenericGaugeVec, - pub txbytes: GenericGaugeVec, - pub rxmsgs: GenericGaugeVec, - pub rxbytes: GenericGaugeVec, - pub msgs: GenericGaugeVec, - pub rx_ver_drops: GenericGaugeVec, - pub msgs_inflight: IntGaugeVec, - pub next_ack_seq: IntGaugeVec, - pub next_err_seq: IntGaugeVec, - pub acked_msgid: GenericGaugeVec, + pub msgq_cnt: LabelGuardedIntGaugeVec<4>, + pub msgq_bytes: LabelGuardedUintGaugeVec<4>, + pub xmit_msgq_cnt: LabelGuardedIntGaugeVec<4>, + pub xmit_msgq_bytes: LabelGuardedUintGaugeVec<4>, + pub fetchq_cnt: LabelGuardedIntGaugeVec<4>, + pub fetchq_size: LabelGuardedUintGaugeVec<4>, + pub query_offset: LabelGuardedIntGaugeVec<4>, + pub next_offset: LabelGuardedIntGaugeVec<4>, + pub app_offset: LabelGuardedIntGaugeVec<4>, + pub stored_offset: LabelGuardedIntGaugeVec<4>, + pub committed_offset: LabelGuardedIntGaugeVec<4>, + pub eof_offset: LabelGuardedIntGaugeVec<4>, + pub lo_offset: LabelGuardedIntGaugeVec<4>, + pub hi_offset: LabelGuardedIntGaugeVec<4>, + pub consumer_lag: LabelGuardedIntGaugeVec<4>, + pub consumer_lag_store: LabelGuardedIntGaugeVec<4>, + pub txmsgs: LabelGuardedUintGaugeVec<4>, + pub txbytes: LabelGuardedUintGaugeVec<4>, + pub rxmsgs: LabelGuardedUintGaugeVec<4>, + pub rxbytes: LabelGuardedUintGaugeVec<4>, + pub msgs: LabelGuardedUintGaugeVec<4>, + pub rx_ver_drops: LabelGuardedUintGaugeVec<4>, + pub msgs_inflight: LabelGuardedIntGaugeVec<4>, + pub next_ack_seq: LabelGuardedIntGaugeVec<4>, + pub next_err_seq: LabelGuardedIntGaugeVec<4>, + pub acked_msgid: LabelGuardedUintGaugeVec<4>, } impl PartitionStats { pub fn new(registry: Registry) -> Self { - let msgq_cnt = register_int_gauge_vec_with_registry!( + let msgq_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_msgq_cnt", "Number of messages in the producer queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let msgq_bytes = register_uint_gauge_vec_with_registry!( + let msgq_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_msgq_bytes", "Size of messages in the producer queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let xmit_msgq_cnt = register_int_gauge_vec_with_registry!( + let xmit_msgq_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_xmit_msgq_cnt", 
"Number of messages in the transmit queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let xmit_msgq_bytes = register_uint_gauge_vec_with_registry!( + let xmit_msgq_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_xmit_msgq_bytes", "Size of messages in the transmit queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let fetchq_cnt = register_int_gauge_vec_with_registry!( + let fetchq_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_fetchq_cnt", "Number of messages in the fetch queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let fetchq_size = register_uint_gauge_vec_with_registry!( + let fetchq_size = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_fetchq_size", "Size of messages in the fetch queue", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let query_offset = register_int_gauge_vec_with_registry!( + let query_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_query_offset", "Current query offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let next_offset = register_int_gauge_vec_with_registry!( + let next_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_next_offset", "Next offset to query", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let app_offset = register_int_gauge_vec_with_registry!( + let app_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_app_offset", "Last acknowledged offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let stored_offset = register_int_gauge_vec_with_registry!( + let stored_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_stored_offset", "Last stored offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let committed_offset = register_int_gauge_vec_with_registry!( + let committed_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_committed_offset", "Last committed offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let eof_offset = register_int_gauge_vec_with_registry!( + let eof_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_eof_offset", "Last offset in broker log", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let lo_offset = register_int_gauge_vec_with_registry!( + let lo_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_lo_offset", "Low offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let hi_offset = register_int_gauge_vec_with_registry!( + let hi_offset = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_hi_offset", "High offset", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let consumer_lag = register_int_gauge_vec_with_registry!( + let consumer_lag = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_consumer_lag", "Consumer lag", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let consumer_lag_store = register_int_gauge_vec_with_registry!( + let consumer_lag_store = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_consumer_lag_store", "Consumer lag stored", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let txmsgs = register_uint_gauge_vec_with_registry!( + 
let txmsgs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_txmsgs", "Number of transmitted messages", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let txbytes = register_uint_gauge_vec_with_registry!( + let txbytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_txbytes", "Number of transmitted bytes", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let rxmsgs = register_uint_gauge_vec_with_registry!( + let rxmsgs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_rxmsgs", "Number of received messages", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let rxbytes = register_uint_gauge_vec_with_registry!( + let rxbytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_rxbytes", "Number of received bytes", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let msgs = register_uint_gauge_vec_with_registry!( + let msgs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_msgs", "Number of messages in partition", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let rx_ver_drops = register_uint_gauge_vec_with_registry!( + let rx_ver_drops = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_rx_ver_drops", "Number of received messages dropped due to version mismatch", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let msgs_inflight = register_int_gauge_vec_with_registry!( + let msgs_inflight = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_msgs_inflight", "Number of messages in-flight", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let next_ack_seq = register_int_gauge_vec_with_registry!( + let next_ack_seq = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_next_ack_seq", "Next ack sequence number", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let next_err_seq = register_int_gauge_vec_with_registry!( + let next_err_seq = register_guarded_int_gauge_vec_with_registry!( "rdkafka_topic_partition_next_err_seq", "Next error sequence number", &["id", "client_id", "topic", "partition"], registry ) .unwrap(); - let acked_msgid = register_uint_gauge_vec_with_registry!( + let acked_msgid = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_topic_partition_acked_msgid", "Acknowledged message ID", &["id", "client_id", "topic", "partition"], @@ -615,78 +624,88 @@ impl PartitionStats { fn report_inner(&self, id: &str, client_id: &str, topic: &str, stats: &Partition) { let labels = [id, client_id, topic, &stats.partition.to_string()]; - self.msgq_cnt.with_label_values(&labels).set(stats.msgq_cnt); + self.msgq_cnt + .with_guarded_label_values(&labels) + .set(stats.msgq_cnt); self.msgq_bytes - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.msgq_bytes); self.xmit_msgq_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.xmit_msgq_cnt); self.xmit_msgq_bytes - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.xmit_msgq_bytes); self.fetchq_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.fetchq_cnt); self.fetchq_size - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.fetchq_size); self.query_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.query_offset); self.next_offset - 
.with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.next_offset); self.app_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.app_offset); self.stored_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.stored_offset); self.committed_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.committed_offset); self.eof_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.eof_offset); self.lo_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.lo_offset); self.hi_offset - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.hi_offset); self.consumer_lag - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.consumer_lag); self.consumer_lag_store - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.consumer_lag_stored); - self.txmsgs.with_label_values(&labels).set(stats.txmsgs); - self.txbytes.with_label_values(&labels).set(stats.txbytes); - self.rxmsgs.with_label_values(&labels).set(stats.rxmsgs); - self.rxbytes.with_label_values(&labels).set(stats.rxbytes); - self.msgs.with_label_values(&labels).set(stats.msgs); + self.txmsgs + .with_guarded_label_values(&labels) + .set(stats.txmsgs); + self.txbytes + .with_guarded_label_values(&labels) + .set(stats.txbytes); + self.rxmsgs + .with_guarded_label_values(&labels) + .set(stats.rxmsgs); + self.rxbytes + .with_guarded_label_values(&labels) + .set(stats.rxbytes); + self.msgs.with_guarded_label_values(&labels).set(stats.msgs); self.rx_ver_drops - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.rx_ver_drops); self.msgs_inflight - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.msgs_inflight); self.next_ack_seq - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.next_ack_seq); self.next_err_seq - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.next_err_seq); self.acked_msgid - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.acked_msgid); } } impl RdKafkaStats { pub fn new(registry: Registry) -> Self { - let ts = register_int_gauge_vec_with_registry!( + let ts = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_ts", "librdkafka's internal monotonic clock (microseconds)", // we cannot tell whether it is for consumer or producer, @@ -695,119 +714,119 @@ impl RdKafkaStats { registry ) .unwrap(); - let time = register_int_gauge_vec_with_registry!( + let time = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_time", "Wall clock time in seconds since the epoch", &["id", "client_id"], registry ) .unwrap(); - let age = register_int_gauge_vec_with_registry!( + let age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_age", "Age of the topic metadata in milliseconds", &["id", "client_id"], registry ) .unwrap(); - let replyq = register_int_gauge_vec_with_registry!( + let replyq = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_replyq", "Number of replies waiting to be served", &["id", "client_id"], registry ) .unwrap(); - let msg_cnt = register_uint_gauge_vec_with_registry!( + let msg_cnt = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_cnt", "Number of messages in all topics", &["id", "client_id"], registry ) .unwrap(); - let msg_size = register_uint_gauge_vec_with_registry!( + let msg_size = 
register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_size", "Size of messages in all topics", &["id", "client_id"], registry ) .unwrap(); - let msg_max = register_uint_gauge_vec_with_registry!( + let msg_max = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_max", "Maximum message size in all topics", &["id", "client_id"], registry ) .unwrap(); - let msg_size_max = register_uint_gauge_vec_with_registry!( + let msg_size_max = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_top_msg_size_max", "Maximum message size in all topics", &["id", "client_id"], registry ) .unwrap(); - let tx = register_int_gauge_vec_with_registry!( + let tx = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx", "Number of transmitted messages", &["id", "client_id"], registry ) .unwrap(); - let tx_bytes = register_int_gauge_vec_with_registry!( + let tx_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx_bytes", "Number of transmitted bytes", &["id", "client_id"], registry ) .unwrap(); - let rx = register_int_gauge_vec_with_registry!( + let rx = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx", "Number of received messages", &["id", "client_id"], registry ) .unwrap(); - let rx_bytes = register_int_gauge_vec_with_registry!( + let rx_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx_bytes", "Number of received bytes", &["id", "client_id"], registry ) .unwrap(); - let tx_msgs = register_int_gauge_vec_with_registry!( + let tx_msgs = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx_msgs", "Number of transmitted messages", &["id", "client_id"], registry ) .unwrap(); - let tx_msgs_bytes = register_int_gauge_vec_with_registry!( + let tx_msgs_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_tx_msgs_bytes", "Number of transmitted bytes", &["id", "client_id"], registry ) .unwrap(); - let rx_msgs = register_int_gauge_vec_with_registry!( + let rx_msgs = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx_msgs", "Number of received messages", &["id", "client_id"], registry ) .unwrap(); - let rx_msgs_bytes = register_int_gauge_vec_with_registry!( + let rx_msgs_bytes = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_rx_msgs_bytes", "Number of received bytes", &["id", "client_id"], registry ) .unwrap(); - let simple_cnt = register_int_gauge_vec_with_registry!( + let simple_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_simple_cnt", "Number of simple consumer queues", &["id", "client_id"], registry ) .unwrap(); - let metadata_cache_cnt = register_int_gauge_vec_with_registry!( + let metadata_cache_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_top_metadata_cache_cnt", "Number of entries in the metadata cache", &["id", "client_id"], @@ -846,51 +865,59 @@ impl RdKafkaStats { pub fn report(&self, id: &str, stats: &Statistics) { let client_id = stats.name.as_str(); - self.ts.with_label_values(&[id, client_id]).set(stats.ts); + self.ts + .with_guarded_label_values(&[id, client_id]) + .set(stats.ts); self.time - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.time); - self.age.with_label_values(&[id, client_id]).set(stats.age); + self.age + .with_guarded_label_values(&[id, client_id]) + .set(stats.age); self.replyq - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.replyq); self.msg_cnt - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, 
client_id]) .set(stats.msg_cnt); self.msg_size - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_size); self.msg_max - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_max); self.msg_size_max - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.msg_size_max); - self.tx.with_label_values(&[id, client_id]).set(stats.tx); + self.tx + .with_guarded_label_values(&[id, client_id]) + .set(stats.tx); self.tx_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.tx_bytes); - self.rx.with_label_values(&[id, client_id]).set(stats.rx); + self.rx + .with_guarded_label_values(&[id, client_id]) + .set(stats.rx); self.rx_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.rx_bytes); self.tx_msgs - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.txmsgs); self.tx_msgs_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.txmsg_bytes); self.rx_msgs - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.rxmsgs); self.rx_msgs_bytes - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.rxmsg_bytes); self.simple_cnt - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.simple_cnt); self.metadata_cache_cnt - .with_label_values(&[id, client_id]) + .with_guarded_label_values(&[id, client_id]) .set(stats.metadata_cache_cnt); self.broker_stats.report(id, client_id, stats); @@ -903,161 +930,161 @@ impl RdKafkaStats { impl BrokerStats { pub fn new(registry: Registry) -> Self { - let state_age = register_int_gauge_vec_with_registry!( + let state_age = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_state_age", "Age of the broker state in seconds", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let outbuf_cnt = register_int_gauge_vec_with_registry!( + let outbuf_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_outbuf_cnt", "Number of messages waiting to be sent to broker", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let outbuf_msg_cnt = register_int_gauge_vec_with_registry!( + let outbuf_msg_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_outbuf_msg_cnt", "Number of messages waiting to be sent to broker", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let waitresp_cnt = register_int_gauge_vec_with_registry!( + let waitresp_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_waitresp_cnt", "Number of requests waiting for response", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let waitresp_msg_cnt = register_int_gauge_vec_with_registry!( + let waitresp_msg_cnt = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_waitresp_msg_cnt", "Number of messages waiting for response", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx = register_uint_gauge_vec_with_registry!( + let tx = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx", "Number of transmitted messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_bytes = register_uint_gauge_vec_with_registry!( + let tx_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx_bytes", 
"Number of transmitted bytes", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_errs = register_uint_gauge_vec_with_registry!( + let tx_errs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx_errs", "Number of failed transmitted messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_retries = register_uint_gauge_vec_with_registry!( + let tx_retries = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_tx_retries", "Number of message retries", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let tx_idle = register_int_gauge_vec_with_registry!( + let tx_idle = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_tx_idle", "Number of idle transmit connections", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let req_timeouts = register_uint_gauge_vec_with_registry!( + let req_timeouts = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_req_timeouts", "Number of request timeouts", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx = register_uint_gauge_vec_with_registry!( + let rx = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx", "Number of received messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_bytes = register_uint_gauge_vec_with_registry!( + let rx_bytes = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_bytes", "Number of received bytes", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_errs = register_uint_gauge_vec_with_registry!( + let rx_errs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_errs", "Number of failed received messages", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_corriderrs = register_uint_gauge_vec_with_registry!( + let rx_corriderrs = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_corriderrs", "Number of received messages with invalid correlation id", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_partial = register_uint_gauge_vec_with_registry!( + let rx_partial = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_rx_partial", "Number of partial messages received", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let rx_idle = register_int_gauge_vec_with_registry!( + let rx_idle = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_rx_idle", "Number of idle receive connections", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let req = register_int_gauge_vec_with_registry!( + let req = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_req", "Number of requests in flight", &["id", "client_id", "broker", "state", "type"], registry ) .unwrap(); - let zbuf_grow = register_uint_gauge_vec_with_registry!( + let zbuf_grow = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_zbuf_grow", "Number of times the broker's output buffer has been reallocated", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let buf_grow = register_uint_gauge_vec_with_registry!( + let buf_grow = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_buf_grow", "Number of times the broker's input buffer has been reallocated", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let wakeups = register_uint_gauge_vec_with_registry!( + let wakeups = register_guarded_uint_gauge_vec_with_registry!( "rdkafka_broker_wakeups", "Number of wakeups", 
&["id", "client_id", "broker", "state"], registry ) .unwrap(); - let connects = register_int_gauge_vec_with_registry!( + let connects = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_connects", "Number of connection attempts", &["id", "client_id", "broker", "state"], registry ) .unwrap(); - let disconnects = register_int_gauge_vec_with_registry!( + let disconnects = register_guarded_int_gauge_vec_with_registry!( "rdkafka_broker_disconnects", "Number of disconnects", &["id", "client_id", "broker", "state"], @@ -1113,57 +1140,75 @@ impl BrokerStats { let labels = [id, client_id, broker, state]; self.state_age - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.stateage); self.outbuf_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.outbuf_cnt); self.outbuf_msg_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.outbuf_msg_cnt); self.waitresp_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.waitresp_cnt); self.waitresp_msg_cnt - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.waitresp_msg_cnt); - self.tx.with_label_values(&labels).set(stats.tx); - self.tx_bytes.with_label_values(&labels).set(stats.txbytes); - self.tx_errs.with_label_values(&labels).set(stats.txerrs); + self.tx.with_guarded_label_values(&labels).set(stats.tx); + self.tx_bytes + .with_guarded_label_values(&labels) + .set(stats.txbytes); + self.tx_errs + .with_guarded_label_values(&labels) + .set(stats.txerrs); self.tx_retries - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.txretries); - self.tx_idle.with_label_values(&labels).set(stats.txidle); + self.tx_idle + .with_guarded_label_values(&labels) + .set(stats.txidle); self.req_timeouts - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.req_timeouts); - self.rx.with_label_values(&labels).set(stats.rx); - self.rx_bytes.with_label_values(&labels).set(stats.rxbytes); - self.rx_errs.with_label_values(&labels).set(stats.rxerrs); + self.rx.with_guarded_label_values(&labels).set(stats.rx); + self.rx_bytes + .with_guarded_label_values(&labels) + .set(stats.rxbytes); + self.rx_errs + .with_guarded_label_values(&labels) + .set(stats.rxerrs); self.rx_corriderrs - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.rxcorriderrs); self.rx_partial - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.rxpartial); - self.rx_idle.with_label_values(&labels).set(stats.rxidle); + self.rx_idle + .with_guarded_label_values(&labels) + .set(stats.rxidle); for (req_type, req_cnt) in &stats.req { self.req - .with_label_values(&[id, client_id, broker, state, req_type]) + .with_guarded_label_values(&[id, client_id, broker, state, req_type]) .set(*req_cnt); } self.zbuf_grow - .with_label_values(&labels) + .with_guarded_label_values(&labels) .set(stats.zbuf_grow); - self.buf_grow.with_label_values(&labels).set(stats.buf_grow); + self.buf_grow + .with_guarded_label_values(&labels) + .set(stats.buf_grow); if let Some(wakeups) = stats.wakeups { - self.wakeups.with_label_values(&labels).set(wakeups); + self.wakeups.with_guarded_label_values(&labels).set(wakeups); } if let Some(connects) = stats.connects { - self.connects.with_label_values(&labels).set(connects); + self.connects + .with_guarded_label_values(&labels) + .set(connects); } if let Some(disconnects) = stats.disconnects { - 
self.disconnects.with_label_values(&labels).set(disconnects); + self.disconnects + .with_guarded_label_values(&labels) + .set(disconnects); } if let Some(int_latency) = &stats.int_latency { self.int_latency diff --git a/src/connector/src/source/pulsar/mod.rs b/src/connector/src/source/pulsar/mod.rs index 5d6d111b13bff..ffbc3be495bf9 100644 --- a/src/connector/src/source/pulsar/mod.rs +++ b/src/connector/src/source/pulsar/mod.rs @@ -74,6 +74,16 @@ pub struct PulsarProperties { #[serde(rename = "iceberg.bucket", default)] pub iceberg_bucket: Option, + /// Specify a custom consumer group id prefix for the source. + /// Defaults to `rw-consumer`. + /// + /// Notes: + /// - Each job (materialized view) will have multiple subscriptions and + /// contains a generated suffix in the subscription name. + /// The subscription name will be `{subscription_name_prefix}-{fragment_id}-{actor_id}`. + #[serde(rename = "subscription.name.prefix")] + pub subscription_name_prefix: Option, + #[serde(flatten)] pub unknown_fields: HashMap, } diff --git a/src/connector/src/source/pulsar/source/reader.rs b/src/connector/src/source/pulsar/source/reader.rs index 212c459388b25..20f6872474e88 100644 --- a/src/connector/src/source/pulsar/source/reader.rs +++ b/src/connector/src/source/pulsar/source/reader.rs @@ -42,6 +42,8 @@ use crate::source::{ SplitMetaData, SplitReader, }; +const PULSAR_DEFAULT_SUBSCRIPTION_PREFIX: &str = "rw-consumer"; + pub enum PulsarSplitReader { Broker(PulsarBrokerReader), Iceberg(PulsarIcebergReader), @@ -174,8 +176,12 @@ impl SplitReader for PulsarBrokerReader { .with_topic(&topic) .with_subscription_type(SubType::Exclusive) .with_subscription(format!( - "rw-consumer-{}-{}", - source_ctx.fragment_id, source_ctx.actor_id + "{}-{}-{}", + props + .subscription_name_prefix + .unwrap_or(PULSAR_DEFAULT_SUBSCRIPTION_PREFIX.to_string()), + source_ctx.fragment_id, + source_ctx.actor_id )); let builder = match split.start_offset.clone() { diff --git a/src/connector/src/source/reader/reader.rs b/src/connector/src/source/reader/reader.rs index 61468bd72a4b6..9a7cb1e440e9f 100644 --- a/src/connector/src/source/reader/reader.rs +++ b/src/connector/src/source/reader/reader.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
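The hunks above uniformly migrate the rdkafka metrics from the plain `register_*_gauge_vec_with_registry!` macros and `with_label_values` calls to their guarded counterparts. A minimal sketch of that pattern, using the macro and method names exactly as they appear in the diff; the `use` path for the guarded macro is an assumption, since the imports are not shown here:

```rust
// Minimal sketch of the guarded-metrics pattern applied throughout the hunks
// above. The macro and method names are taken from the diff; the import path
// for the guarded macro is an assumption.
use prometheus::Registry;
use risingwave_common::register_guarded_int_gauge_vec_with_registry;

fn report_consumer_lag(registry: Registry, id: &str, client_id: &str, lag: i64) {
    // Registration looks just like the plain prometheus macro it replaces.
    let consumer_lag = register_guarded_int_gauge_vec_with_registry!(
        "example_consumer_lag",
        "Consumer lag (example metric)",
        &["id", "client_id"],
        registry
    )
    .unwrap();

    // `with_guarded_label_values` hands back a guard-wrapped child: the series
    // is reported while a guard is alive, and once every guard for a label set
    // is dropped the series can be cleaned up, which plain `with_label_values`
    // children never allow.
    consumer_lag
        .with_guarded_label_values(&[id, client_id])
        .set(lag);
}
```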
+use std::collections::HashMap; use std::sync::Arc; use anyhow::Context; @@ -34,8 +35,9 @@ use crate::source::filesystem::opendal_source::{ }; use crate::source::filesystem::{FsPageItem, OpendalFsSplit}; use crate::source::{ - create_split_reader, BoxChunkSourceStream, BoxTryStream, Column, ConnectorProperties, - ConnectorState, SourceColumnDesc, SourceContext, SplitReader, WaitCheckpointTask, + create_split_reader, BackfillInfo, BoxChunkSourceStream, BoxTryStream, Column, + ConnectorProperties, ConnectorState, SourceColumnDesc, SourceContext, SplitId, SplitReader, + WaitCheckpointTask, }; use crate::{dispatch_source_prop, WithOptionsSecResolved}; @@ -91,27 +93,47 @@ impl SourceReader { match config { ConnectorProperties::Gcs(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_gcs_source(*prop)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } ConnectorProperties::OpendalS3(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_s3_source(prop.s3_properties, prop.assume_role)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } ConnectorProperties::Azblob(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_azblob_source(*prop)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } ConnectorProperties::PosixFs(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_posix_fs_source(*prop)?; - Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) + Ok(build_opendal_fs_list_stream( + lister, + list_interval_sec, + recursive_scan, + )) } other => bail!("Unsupported source: {:?}", other), } @@ -129,6 +151,72 @@ impl SourceReader { }) } + pub async fn build_stream_for_backfill( + &self, + state: ConnectorState, + column_ids: Vec, + source_ctx: Arc, + ) -> ConnectorResult<(BoxChunkSourceStream, HashMap)> { + let Some(splits) = state else { + return Ok((pending().boxed(), HashMap::new())); + }; + let config = self.config.clone(); + let columns = self.get_target_columns(column_ids)?; + + let data_gen_columns = Some( + columns + .iter() + .map(|col| Column { + name: col.name.clone(), + data_type: col.data_type.clone(), + is_visible: col.is_visible(), + }) + .collect_vec(), + ); + + let parser_config = ParserConfig { + specific: self.parser_config.clone(), + common: CommonParserConfig { + rw_columns: columns, + }, + }; + + let support_multiple_splits = config.support_multiple_splits(); + dispatch_source_prop!(config, prop, { + let readers = if support_multiple_splits { + tracing::debug!( + "spawning connector split reader for multiple splits {:?}", + splits + ); + let reader = + create_split_reader(*prop, splits, parser_config, source_ctx, 
data_gen_columns) + .await?; + + vec![reader] + } else { + let to_reader_splits = splits.into_iter().map(|split| vec![split]); + try_join_all(to_reader_splits.into_iter().map(|splits| { + tracing::debug!(?splits, "spawning connector split reader"); + let props = prop.clone(); + let data_gen_columns = data_gen_columns.clone(); + let parser_config = parser_config.clone(); + // TODO: is this reader split across multiple threads...? Realistically, we want + // source_ctx to live in a single actor. + let source_ctx = source_ctx.clone(); + create_split_reader(*props, splits, parser_config, source_ctx, data_gen_columns) + })) + .await? + }; + + let backfill_info = readers.iter().flat_map(|r| r.backfill_info()).collect(); + + Ok(( + select_all(readers.into_iter().map(|r| r.into_stream())).boxed(), + backfill_info, + )) + }) + } + /// Build `SplitReader`s and then `BoxChunkSourceStream` from the given `ConnectorState` (`SplitImpl`s). pub async fn build_stream( &self, @@ -196,10 +284,11 @@ impl SourceReader { async fn build_opendal_fs_list_stream( lister: OpendalEnumerator, list_interval_sec: u64, + recursive_scan: bool, ) { loop { let matcher = lister.get_matcher(); - let mut object_metadata_iter = lister.list().await?; + let mut object_metadata_iter = lister.list(recursive_scan).await?; while let Some(list_res) = object_metadata_iter.next().await { match list_res { @@ -226,9 +315,12 @@ async fn build_opendal_fs_list_stream( } #[try_stream(boxed, ok = OpendalFsSplit, error = crate::error::ConnectorError)] -pub async fn build_opendal_fs_list_for_batch(lister: OpendalEnumerator) { +pub async fn build_opendal_fs_list_for_batch( + lister: OpendalEnumerator, + recursive_scan: bool, +) { let matcher = lister.get_matcher(); - let mut object_metadata_iter = lister.list().await?; + let mut object_metadata_iter = lister.list(recursive_scan).await?; while let Some(list_res) = object_metadata_iter.next().await { match list_res { diff --git a/src/connector/src/with_options.rs b/src/connector/src/with_options.rs index ae2d432fdfd74..065c9394b8a49 100644 --- a/src/connector/src/with_options.rs +++ b/src/connector/src/with_options.rs @@ -126,6 +126,10 @@ pub trait WithPropertiesExt: Get + Sized { CdcTableType::from_properties(self).enable_transaction_metadata() } + fn is_shareable_non_cdc_connector(&self) -> bool { + self.is_kafka_connector() + } + #[inline(always)] fn is_iceberg_connector(&self) -> bool { let Some(connector) = self.get_connector() else { diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index d028ef5e30198..1af3435eaea24 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -41,14 +41,6 @@ BigQueryConfig: - name: bigquery.table field_type: String required: true - - name: bigquery.max_batch_rows - field_type: usize - required: false - default: '1024' - - name: bigquery.retry_times - field_type: usize - required: false - default: '5' - name: auto_create field_type: bool required: false @@ -123,7 +115,7 @@ ClickHouseConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: r#type field_type: String required: true @@ -151,7 +143,7 @@ DeltaLakeConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. 
required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: r#type field_type: String required: true @@ -347,7 +339,7 @@ IcebergConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: create_table_if_not_exists field_type: bool required: false @@ -381,14 +373,26 @@ KafkaConfig: field_type: String comments: Path to CA certificate file for verifying the broker's key. required: false + - name: properties.ssl.ca.pem + field_type: String + comments: CA certificate string (PEM format) for verifying the broker's key. + required: false - name: properties.ssl.certificate.location field_type: String comments: Path to client's certificate file (PEM). required: false + - name: properties.ssl.certificate.pem + field_type: String + comments: Client's public key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.location field_type: String comments: Path to client's private key file (PEM). required: false + - name: properties.ssl.key.pem + field_type: String + comments: Client's private key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.password field_type: String comments: Passphrase of client's private key. @@ -1017,7 +1021,7 @@ StarrocksConfig: also, in this time, the `sink_decouple` option should be enabled as well. Defaults to 10 if commit_checkpoint_interval <= 0 required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: starrocks.partial_update field_type: String comments: Enable partial update diff --git a/src/connector/with_options_source.yaml b/src/connector/with_options_source.yaml index 4eaf1e0d3db4b..c54dce97ad1cd 100644 --- a/src/connector/with_options_source.yaml +++ b/src/connector/with_options_source.yaml @@ -24,6 +24,10 @@ AzblobProperties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -75,6 +79,10 @@ GcsProperties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -199,14 +207,26 @@ KafkaProperties: field_type: String comments: Path to CA certificate file for verifying the broker's key. required: false + - name: properties.ssl.ca.pem + field_type: String + comments: CA certificate string (PEM format) for verifying the broker's key. + required: false - name: properties.ssl.certificate.location field_type: String comments: Path to client's certificate file (PEM). required: false + - name: properties.ssl.certificate.pem + field_type: String + comments: Client's public key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.location field_type: String comments: Path to client's private key file (PEM). required: false + - name: properties.ssl.key.pem + field_type: String + comments: Client's private key string (PEM format) used for authentication. + required: false - name: properties.ssl.key.password field_type: String comments: Passphrase of client's private key. 
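In `build_stream_for_backfill` above, each split reader contributes its own `backfill_info()` map and the per-reader maps are merged via `readers.iter().flat_map(|r| r.backfill_info()).collect()`. A small self-contained sketch of that merge step, with plain `std` stand-ins for `SplitId` and `BackfillInfo`; on a duplicate key the last entry wins, which should be harmless here since every split is handed to exactly one reader:

```rust
use std::collections::HashMap;

// Stand-ins so the sketch compiles on its own; the real types are
// `SplitId` and `BackfillInfo` from the connector crate.
type SplitId = String;
type BackfillInfo = u64;

/// Equivalent of `readers.iter().flat_map(|r| r.backfill_info()).collect()`:
/// every reader's (split, info) pairs are flattened into one map.
fn merge_backfill_info(
    per_reader: Vec<HashMap<SplitId, BackfillInfo>>,
) -> HashMap<SplitId, BackfillInfo> {
    per_reader.into_iter().flatten().collect()
}

fn main() {
    let reader_a = HashMap::from([("split-0".to_owned(), 42)]);
    let reader_b = HashMap::from([("split-1".to_owned(), 7)]);
    let merged = merge_backfill_info(vec![reader_a, reader_b]);
    assert_eq!(merged.len(), 2);
}
```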
@@ -828,6 +848,10 @@ OpendalS3Properties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default PosixFsProperties: fields: - name: posix_fs.root @@ -842,6 +866,10 @@ PosixFsProperties: - name: refresh.interval.sec field_type: u64 required: false + - name: recursive_scan + field_type: bool + required: false + default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -988,6 +1016,17 @@ PulsarProperties: field_type: String required: false default: Default::default + - name: subscription.name.prefix + field_type: String + comments: |- + Specify a custom consumer group id prefix for the source. + Defaults to `rw-consumer`. + + Notes: + - Each job (materialized view) will have multiple subscriptions and + contains a generated suffix in the subscription name. + The subscription name will be `{subscription_name_prefix}-{fragment_id}-{actor_id}`. + required: false S3Properties: fields: - name: s3.region_name diff --git a/src/ctl/src/cmd_impl/hummock/compaction_group.rs b/src/ctl/src/cmd_impl/hummock/compaction_group.rs index a0395d236d504..c41b4c6e25b9e 100644 --- a/src/ctl/src/cmd_impl/hummock/compaction_group.rs +++ b/src/ctl/src/cmd_impl/hummock/compaction_group.rs @@ -131,10 +131,11 @@ pub async fn split_compaction_group( context: &CtlContext, group_id: CompactionGroupId, table_ids_to_new_group: &[StateTableId], + partition_vnode_count: u32, ) -> anyhow::Result<()> { let meta_client = context.meta_client().await?; let new_group_id = meta_client - .split_compaction_group(group_id, table_ids_to_new_group) + .split_compaction_group(group_id, table_ids_to_new_group, partition_vnode_count) .await?; println!( "Succeed: split compaction group {}. 
tables {:#?} are moved to new group {}.", @@ -284,3 +285,15 @@ pub async fn cancel_compact_task(context: &CtlContext, task_id: u64) -> anyhow:: Ok(()) } + +pub async fn merge_compaction_group( + context: &CtlContext, + left_group_id: CompactionGroupId, + right_group_id: CompactionGroupId, +) -> anyhow::Result<()> { + let meta_client = context.meta_client().await?; + meta_client + .merge_compaction_group(left_group_id, right_group_id) + .await?; + Ok(()) +} diff --git a/src/ctl/src/cmd_impl/meta/cluster_info.rs b/src/ctl/src/cmd_impl/meta/cluster_info.rs index cbc21ca6ec610..76b91d37fbd3c 100644 --- a/src/ctl/src/cmd_impl/meta/cluster_info.rs +++ b/src/ctl/src/cmd_impl/meta/cluster_info.rs @@ -31,7 +31,7 @@ pub async fn get_cluster_info(context: &CtlContext) -> anyhow::Result anyhow::Result<()> { +pub async fn source_split_info(context: &CtlContext, ignore_id: bool) -> anyhow::Result<()> { let GetClusterInfoResponse { worker_nodes: _, source_infos: _, @@ -40,37 +40,113 @@ pub async fn source_split_info(context: &CtlContext) -> anyhow::Result<()> { revision: _, } = get_cluster_info(context).await?; + let mut actor_splits_map: BTreeMap = BTreeMap::new(); + + // build actor_splits_map for table_fragment in &table_fragments { if table_fragment.actor_splits.is_empty() { continue; } - println!("Table #{}", table_fragment.table_id); - for fragment in table_fragment.fragments.values() { let fragment_type_mask = fragment.fragment_type_mask; if fragment_type_mask & FragmentTypeFlag::Source as u32 == 0 - || fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 + && fragment_type_mask & FragmentTypeFlag::SourceScan as u32 == 0 { + // no source or source backfill + continue; + } + if fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 { // skip dummy source for dml fragment continue; } - println!("\tFragment #{}", fragment.fragment_id); for actor in &fragment.actors { if let Some(ConnectorSplits { splits }) = actor_splits.remove(&actor.actor_id) { let splits = splits .iter() .map(|split| SplitImpl::try_from(split).unwrap()) .map(|split| split.id()) - .collect_vec(); + .collect_vec() + .join(","); + actor_splits_map.insert(actor.actor_id, (splits.len(), splits)); + } + } + } + } + // print in the second iteration. 
Otherwise we don't have upstream splits info + for table_fragment in &table_fragments { + if table_fragment.actor_splits.is_empty() { + continue; + } + if ignore_id { + println!("Table"); + } else { + println!("Table #{}", table_fragment.table_id); + } + for fragment in table_fragment.fragments.values() { + let fragment_type_mask = fragment.fragment_type_mask; + if fragment_type_mask & FragmentTypeFlag::Source as u32 == 0 + && fragment_type_mask & FragmentTypeFlag::SourceScan as u32 == 0 + { + // no source or source backfill + continue; + } + if fragment_type_mask & FragmentTypeFlag::Dml as u32 != 0 { + // skip dummy source for dml fragment + continue; + } + + println!( + "\tFragment{} ({})", + if ignore_id { + "".to_string() + } else { + format!(" #{}", fragment.fragment_id) + }, + if fragment_type_mask == FragmentTypeFlag::Source as u32 { + "Source" + } else { + "SourceScan" + } + ); + for actor in &fragment.actors { + if let Some((split_count, splits)) = actor_splits_map.get(&actor.actor_id) { println!( - "\t\tActor #{:<3} ({}): [{}]", - actor.actor_id, - splits.len(), - splits.join(",") + "\t\tActor{} ({} splits): [{}]{}", + if ignore_id { + "".to_string() + } else { + format!(" #{:<3}", actor.actor_id,) + }, + split_count, + splits, + if !actor.upstream_actor_id.is_empty() { + assert!( + actor.upstream_actor_id.len() == 1, + "should have only one upstream actor, got {actor:?}" + ); + let upstream_splits = + actor_splits_map.get(&actor.upstream_actor_id[0]).unwrap(); + format!( + " <- Upstream Actor{}: [{}]", + if ignore_id { + "".to_string() + } else { + format!(" #{}", actor.upstream_actor_id[0]) + }, + upstream_splits.1 + ) + } else { + "".to_string() + } ); + } else { + println!( + "\t\tError: Actor #{:<3} (not found in actor_splits)", + actor.actor_id, + ) } } } diff --git a/src/ctl/src/cmd_impl/table/scan.rs b/src/ctl/src/cmd_impl/table/scan.rs index e5bba170bf97a..f5cee710a40fc 100644 --- a/src/ctl/src/cmd_impl/table/scan.rs +++ b/src/ctl/src/cmd_impl/table/scan.rs @@ -14,6 +14,8 @@ use anyhow::{anyhow, Result}; use futures::{pin_mut, StreamExt}; +use risingwave_common::bitmap::Bitmap; +use risingwave_common::hash::VirtualNode; use risingwave_frontend::TableCatalog; use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_rpc_client::MetaClient; @@ -63,7 +65,8 @@ pub async fn make_state_table(hummock: S, table: &TableCatalog) - .collect(), table.pk().iter().map(|x| x.order_type).collect(), table.pk().iter().map(|x| x.column_index).collect(), - TableDistribution::all(table.distribution_key().to_vec()), // scan all vnodes + // TODO(var-vnode): use vnode count from table desc + TableDistribution::all(table.distribution_key().to_vec(), VirtualNode::COUNT), // scan all vnodes Some(table.value_indices.clone()), ) .await @@ -81,7 +84,8 @@ pub fn make_storage_table( Ok(StorageTable::new_partial( hummock, output_columns_ids, - Some(TableDistribution::all_vnodes()), + // TODO(var-vnode): use vnode count from table desc + Some(Bitmap::ones(VirtualNode::COUNT).into()), &table.table_desc().try_to_protobuf()?, )) } diff --git a/src/ctl/src/lib.rs b/src/ctl/src/lib.rs index d1deba4f99140..b35b8d1e42cb2 100644 --- a/src/ctl/src/lib.rs +++ b/src/ctl/src/lib.rs @@ -276,6 +276,8 @@ enum HummockCommands { compaction_group_id: u64, #[clap(long, value_delimiter = ',')] table_ids: Vec, + #[clap(long, default_value_t = 0)] + partition_vnode_count: u32, }, /// Pause version checkpoint, which subsequently pauses GC of delta log and SST object. 
PauseVersionCheckpoint, @@ -340,6 +342,12 @@ enum HummockCommands { #[clap(long)] record_hybrid_fetch_threshold_ms: Option, }, + MergeCompactionGroup { + #[clap(long)] + left_group_id: u64, + #[clap(long)] + right_group_id: u64, + }, } #[derive(Subcommand)] @@ -404,7 +412,10 @@ enum MetaCommands { /// get cluster info ClusterInfo, /// get source split info - SourceSplitInfo, + SourceSplitInfo { + #[clap(long)] + ignore_id: bool, + }, /// Reschedule the actors in the stream graph /// /// The format is `fragment_id-[worker_id:count]+[worker_id:count]` @@ -708,9 +719,15 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { Commands::Hummock(HummockCommands::SplitCompactionGroup { compaction_group_id, table_ids, + partition_vnode_count, }) => { - cmd_impl::hummock::split_compaction_group(context, compaction_group_id, &table_ids) - .await?; + cmd_impl::hummock::split_compaction_group( + context, + compaction_group_id, + &table_ids, + partition_vnode_count, + ) + .await?; } Commands::Hummock(HummockCommands::PauseVersionCheckpoint) => { cmd_impl::hummock::pause_version_checkpoint(context).await?; @@ -787,6 +804,13 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { ) .await? } + Commands::Hummock(HummockCommands::MergeCompactionGroup { + left_group_id, + right_group_id, + }) => { + cmd_impl::hummock::merge_compaction_group(context, left_group_id, right_group_id) + .await? + } Commands::Table(TableCommands::Scan { mv_name, data_dir, @@ -808,8 +832,8 @@ async fn start_impl(opts: CliOpts, context: &CtlContext) -> Result<()> { Commands::Meta(MetaCommands::Pause) => cmd_impl::meta::pause(context).await?, Commands::Meta(MetaCommands::Resume) => cmd_impl::meta::resume(context).await?, Commands::Meta(MetaCommands::ClusterInfo) => cmd_impl::meta::cluster_info(context).await?, - Commands::Meta(MetaCommands::SourceSplitInfo) => { - cmd_impl::meta::source_split_info(context).await? + Commands::Meta(MetaCommands::SourceSplitInfo { ignore_id }) => { + cmd_impl::meta::source_split_info(context, ignore_id).await? } Commands::Meta(MetaCommands::Reschedule { from, diff --git a/src/dml/src/lib.rs b/src/dml/src/lib.rs index a15a4dfb3fba9..f0034a630a823 100644 --- a/src/dml/src/lib.rs +++ b/src/dml/src/lib.rs @@ -14,7 +14,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(coroutines)] #![feature(hash_extract_if)] #![feature(type_alias_impl_trait)] diff --git a/src/error/src/lib.rs b/src/error/src/lib.rs index 4dde816be458b..010308bf95cc8 100644 --- a/src/error/src/lib.rs +++ b/src/error/src/lib.rs @@ -21,7 +21,6 @@ //! access if `risingwave_common` is already a dependency. #![feature(error_generic_member_access)] -#![feature(lint_reasons)] #![feature(register_tool)] #![register_tool(rw)] #![feature(trait_alias)] diff --git a/src/expr/core/src/lib.rs b/src/expr/core/src/lib.rs index d45d4ca11f80a..73e3b6a6ed2e3 100644 --- a/src/expr/core/src/lib.rs +++ b/src/expr/core/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. 
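The new ctl variants above rely on clap's derive to shape the command line: a variant name becomes a kebab-case subcommand and each `#[clap(long)]` field a `--long-flag`, so `MergeCompactionGroup` surfaces under the `hummock` group as `merge-compaction-group --left-group-id .. --right-group-id ..`. A standalone sketch of just that derivation (illustrative only, not the actual risectl definition):

```rust
// Standalone sketch: how clap's derive maps the enum variant in the diff onto
// the CLI surface. Not the actual risectl code.
use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[clap(subcommand)]
    command: Command,
}

#[derive(Subcommand)]
enum Command {
    /// Becomes the `merge-compaction-group` subcommand.
    MergeCompactionGroup {
        /// Parsed from `--left-group-id <u64>`.
        #[clap(long)]
        left_group_id: u64,
        /// Parsed from `--right-group-id <u64>`.
        #[clap(long)]
        right_group_id: u64,
    },
}

fn main() {
    // e.g. `<binary> merge-compaction-group --left-group-id 2 --right-group-id 3`
    let Cli { command } = Cli::parse();
    match command {
        Command::MergeCompactionGroup {
            left_group_id,
            right_group_id,
        } => println!("merge groups {left_group_id} and {right_group_id}"),
    }
}
```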
#![feature(let_chains)] -#![feature(lint_reasons)] #![feature(iterator_try_collect)] #![feature(coroutines)] #![feature(never_type)] diff --git a/src/expr/impl/Cargo.toml b/src/expr/impl/Cargo.toml index e493037c200b7..c0e506889ef77 100644 --- a/src/expr/impl/Cargo.toml +++ b/src/expr/impl/Cargo.toml @@ -51,7 +51,7 @@ itertools = { workspace = true } jsonbb = { workspace = true } linkme = { version = "0.3", features = ["used_linker"] } md5 = "0.7" -moka = { version = "0.12", features = ["sync"] } +moka = { version = "0.12.0", features = ["sync"] } num-traits = "0.2" openssl = "0.10" regex = "1" diff --git a/src/expr/impl/src/lib.rs b/src/expr/impl/src/lib.rs index e5c69c2660eeb..e710749a122d6 100644 --- a/src/expr/impl/src/lib.rs +++ b/src/expr/impl/src/lib.rs @@ -23,7 +23,6 @@ #![allow(non_snake_case)] // for `ctor` generated code #![feature(let_chains)] #![feature(assert_matches)] -#![feature(lint_reasons)] #![feature(iterator_try_collect)] #![feature(coroutines)] #![feature(test)] diff --git a/src/expr/impl/src/scalar/array.rs b/src/expr/impl/src/scalar/array.rs index d5f53213bf277..7b7d272000597 100644 --- a/src/expr/impl/src/scalar/array.rs +++ b/src/expr/impl/src/scalar/array.rs @@ -15,7 +15,7 @@ use risingwave_common::array::{ListValue, StructValue}; use risingwave_common::row::Row; use risingwave_common::types::{ - DataType, ListRef, MapRef, MapType, MapValue, ScalarRefImpl, ToOwnedDatum, + DataType, ListRef, MapRef, MapType, MapValue, ScalarRef, ScalarRefImpl, ToOwnedDatum, }; use risingwave_expr::expr::Context; use risingwave_expr::{function, ExprError}; @@ -241,6 +241,60 @@ fn map_delete(map: MapRef<'_>, key: Option>) -> MapValue { MapValue::delete(map, key) } +/// # Example +/// +/// ```slt +/// query T +/// select map_keys(map{'a':1, 'b':2, 'c':3}); +/// ---- +/// {a,b,c} +/// ``` +#[function( + "map_keys(anymap) -> anyarray", + type_infer = "|args|{ + Ok(DataType::List(Box::new(args[0].as_map().key().clone()))) + }" +)] +fn map_keys(map: MapRef<'_>) -> ListValue { + map.into_kv().0.to_owned_scalar() +} + +/// # Example +/// +/// ```slt +/// query T +/// select map_values(map{'a':1, 'b':2, 'c':3}); +/// ---- +/// {1,2,3} +/// ``` +#[function( + "map_values(anymap) -> anyarray", + type_infer = "|args|{ + Ok(DataType::List(Box::new(args[0].as_map().value().clone()))) + }" +)] +fn map_values(map: MapRef<'_>) -> ListValue { + map.into_kv().1.to_owned_scalar() +} + +/// # Example +/// +/// ```slt +/// query T +/// select map_entries(map{'a':1, 'b':2, 'c':3}); +/// ---- +/// {"(a,1)","(b,2)","(c,3)"} +/// ``` +#[function( + "map_entries(anymap) -> anyarray", + type_infer = "|args|{ + Ok(args[0].as_map().clone().into_list()) + }" +)] +fn map_entries(map: MapRef<'_>) -> ListValue { + map.into_inner().to_owned() +} + #[cfg(test)] mod tests { use risingwave_common::array::DataChunk; diff --git a/src/expr/impl/src/scalar/jsonb_record.rs b/src/expr/impl/src/scalar/jsonb_record.rs index b85feb9190d2a..a6def7cb25643 100644 --- a/src/expr/impl/src/scalar/jsonb_record.rs +++ b/src/expr/impl/src/scalar/jsonb_record.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use risingwave_common::types::{JsonbRef, StructRef, StructValue}; +use risingwave_common::types::{JsonbRef, MapRef, MapValue, Scalar, StructRef, StructValue}; use risingwave_expr::expr::Context; use risingwave_expr::{function, ExprError, Result}; @@ -60,6 +60,22 @@ fn jsonb_populate_record( jsonb.populate_struct(output_type, base).map_err(parse_err) } +#[function("jsonb_populate_map(anymap, jsonb) -> anymap")] +pub fn jsonb_populate_map( + base: Option>, + v: JsonbRef<'_>, + ctx: &Context, +) -> Result { + let output_type = ctx.return_type.as_map(); + let jsonb_map = v + .to_map(output_type) + .map_err(|e| ExprError::Parse(e.into()))?; + match base { + Some(base) => Ok(MapValue::concat(base, jsonb_map.as_scalar_ref())), + None => Ok(jsonb_map), + } +} + /// Expands the top-level JSON array of objects to a set of rows having the composite type of the /// base argument. Each element of the JSON array is processed as described above for /// `jsonb_populate_record`. diff --git a/src/expr/impl/src/scalar/vnode.rs b/src/expr/impl/src/scalar/vnode.rs index e544c39f62499..edd4caa39970e 100644 --- a/src/expr/impl/src/scalar/vnode.rs +++ b/src/expr/impl/src/scalar/vnode.rs @@ -43,7 +43,8 @@ impl Expression for VnodeExpression { } async fn eval(&self, input: &DataChunk) -> Result { - let vnodes = VirtualNode::compute_chunk(input, &self.dist_key_indices); + // TODO(var-vnode): get vnode count from context + let vnodes = VirtualNode::compute_chunk(input, &self.dist_key_indices, VirtualNode::COUNT); let mut builder = I16ArrayBuilder::new(input.capacity()); vnodes .into_iter() @@ -52,8 +53,9 @@ impl Expression for VnodeExpression { } async fn eval_row(&self, input: &OwnedRow) -> Result { + // TODO(var-vnode): get vnode count from context Ok(Some( - VirtualNode::compute_row(input, &self.dist_key_indices) + VirtualNode::compute_row(input, &self.dist_key_indices, VirtualNode::COUNT) .to_scalar() .into(), )) diff --git a/src/expr/impl/src/udf/external.rs b/src/expr/impl/src/udf/external.rs index 5c400df26c179..0d6ba0e409386 100644 --- a/src/expr/impl/src/udf/external.rs +++ b/src/expr/impl/src/udf/external.rs @@ -25,6 +25,7 @@ use ginepro::{LoadBalancedChannel, ResolutionStrategy}; use risingwave_common::array::arrow::{ToArrow, UdfArrowConvert}; use risingwave_common::util::addr::HostAddr; use thiserror_ext::AsReport; +use tokio::runtime::Runtime; use super::*; @@ -174,9 +175,16 @@ fn get_or_create_flight_client(link: &str) -> Result> { // reuse existing client Ok(client) } else { + static RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("rw-udf") + .enable_all() + .build() + .expect("failed to build udf runtime") + }); // create new client let client = Arc::new(tokio::task::block_in_place(|| { - tokio::runtime::Handle::current().block_on(async { + RUNTIME.block_on(async { let channel = connect_tonic(link).await?; Ok(Client::new(channel).await?) as Result<_> }) diff --git a/src/expr/macro/src/lib.rs b/src/expr/macro/src/lib.rs index 8fd03e344db89..630c82a87701b 100644 --- a/src/expr/macro/src/lib.rs +++ b/src/expr/macro/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
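The `udf/external.rs` hunk above replaces `Handle::current().block_on` with a lazily built, dedicated multi-thread runtime, so UDF client setup no longer blocks whichever runtime happens to be current. A minimal sketch of the same pattern with plain tokio; the connect step is a placeholder, not the real `connect_tonic`:

```rust
// Minimal sketch of the dedicated-runtime pattern; the connect step is a
// placeholder for the real `connect_tonic(link)` call.
use std::sync::LazyLock;

use tokio::runtime::Runtime;

// Built on first use and shared by every subsequent call, so client setup
// never has to borrow (and potentially stall) the caller's runtime.
static RUNTIME: LazyLock<Runtime> = LazyLock::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("rw-udf")
        .enable_all()
        .build()
        .expect("failed to build udf runtime")
});

fn connect_blocking(link: &str) -> String {
    // `block_in_place` tells the caller's multi-thread runtime that this
    // worker is about to block; the future itself runs on RUNTIME.
    tokio::task::block_in_place(|| {
        RUNTIME.block_on(async {
            // placeholder for `connect_tonic(link).await?`
            format!("connected to {link}")
        })
    })
}

#[tokio::main]
async fn main() {
    println!("{}", connect_blocking("http://localhost:8815"));
}
```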
-#![feature(lint_reasons)] #![feature(let_chains)] use std::vec; diff --git a/src/frontend/planner_test/tests/testdata/output/agg.yaml b/src/frontend/planner_test/tests/testdata/output/agg.yaml index e44426caa3a49..da2a391a8c603 100644 --- a/src/frontend/planner_test/tests/testdata/output/agg.yaml +++ b/src/frontend/planner_test/tests/testdata/output/agg.yaml @@ -71,7 +71,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2, t.v3], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [agg], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [(min(min(t.v1)) + (max(max(t.v2)) * sum0(count(t.v3)))) as $expr1], noop_update_hint: true } + └─StreamProject { exprs: [(min(min(t.v1)) + (max(max(t.v2)) * sum0(count(t.v3)))) as $expr1] } └─StreamSimpleAgg { aggs: [min(min(t.v1)), max(max(t.v2)), sum0(count(t.v3)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [min(t.v1), max(t.v2), count(t.v3), count] } @@ -273,7 +273,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [cnt, sum], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count($expr1)), sum(sum($expr1))], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count($expr1)), sum(sum($expr1))] } └─StreamSimpleAgg { aggs: [sum0(count($expr1)), sum(sum($expr1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count($expr1), sum($expr1)] } @@ -571,7 +571,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2, t.v3], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [agg], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [(min(min(t.v1)) + (max(max(t.v3)) * sum0(count(t.v2)))) as $expr1], noop_update_hint: true } + └─StreamProject { exprs: [(min(min(t.v1)) + (max(max(t.v3)) * sum0(count(t.v2)))) as $expr1] } └─StreamSimpleAgg { aggs: [min(min(t.v1)), max(max(t.v3)), sum0(count(t.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [min(t.v1), max(t.v3), count(t.v2), count] } @@ -628,7 +628,7 @@ └─LogicalScan { table: t, columns: [t.v1] } stream_plan: |- StreamMaterialize { columns: [s1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } @@ -647,7 +647,7 @@ └─LogicalScan { table: t, columns: [t.v1] } stream_plan: |- StreamMaterialize { columns: [s1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } @@ -666,7 +666,7 @@ └─LogicalScan { table: t, columns: [t.v1] } stream_plan: |- StreamMaterialize { columns: [s1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } @@ -685,7 +685,7 @@ └─LogicalScan { table: t, columns: [t.v1] } stream_plan: |- StreamMaterialize { columns: [sa], 
stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1) filter((t.v1 > 0:Int32)))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1) filter((t.v1 > 0:Int32)))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v1) filter((t.v1 > 0:Int32))), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1) filter((t.v1 > 0:Int32))] } @@ -720,7 +720,7 @@ └─LogicalScan { table: t, columns: [t.a, t.b] } stream_plan: |- StreamMaterialize { columns: [sab], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(max($expr1) filter((t.a < t.b) AND ((t.a + t.b) < 100:Int32) AND ((t.a * t.b) <> ((t.a + t.b) - 1:Int32))))], noop_update_hint: true } + └─StreamProject { exprs: [max(max($expr1) filter((t.a < t.b) AND ((t.a + t.b) < 100:Int32) AND ((t.a * t.b) <> ((t.a + t.b) - 1:Int32))))] } └─StreamSimpleAgg { aggs: [max(max($expr1) filter((t.a < t.b) AND ((t.a + t.b) < 100:Int32) AND ((t.a * t.b) <> ((t.a + t.b) - 1:Int32)))), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [$expr2], aggs: [max($expr1) filter((t.a < t.b) AND ((t.a + t.b) < 100:Int32) AND ((t.a * t.b) <> ((t.a + t.b) - 1:Int32))), count] } @@ -759,7 +759,7 @@ └─LogicalScan { table: t, columns: [t.a, t.b] } stream_plan: |- StreamMaterialize { columns: [cnt_agb], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count filter((t.a > t.b)))], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count filter((t.a > t.b)))] } └─StreamSimpleAgg { aggs: [sum0(count filter((t.a > t.b))), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count filter((t.a > t.b))] } @@ -813,7 +813,7 @@ └─BatchScan { table: t, columns: [t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [b], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v2) filter((t.v2 < 5:Int32)))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v2) filter((t.v2 < 5:Int32)))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v2) filter((t.v2 < 5:Int32))), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v2) filter((t.v2 < 5:Int32))] } @@ -896,7 +896,7 @@ └─BatchScan { table: t, columns: [t.x, t.y], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [string_agg, count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [string_agg(t.y, ',':Varchar), count(distinct t.x)], noop_update_hint: true } + └─StreamProject { exprs: [string_agg(t.y, ',':Varchar), count(distinct t.x)] } └─StreamSimpleAgg { aggs: [string_agg(t.y, ',':Varchar), count(distinct t.x), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t.y, ',':Varchar, t.x, t._row_id] } @@ -917,7 +917,7 @@ └─BatchScan { table: t, columns: [t.x, t.y], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [string_agg, count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [string_agg(t.y, ',':Varchar order_by(t.y ASC)), count(distinct t.x)], noop_update_hint: true } + └─StreamProject { exprs: [string_agg(t.y, ',':Varchar order_by(t.y ASC)), count(distinct t.x)] } └─StreamSimpleAgg { aggs: [string_agg(t.y, ',':Varchar order_by(t.y ASC)), count(distinct t.x), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t.y, ',':Varchar, t.x, t._row_id] } @@ -938,7 +938,7 @@ └─BatchScan { table: t, columns: [t.x, 
t.y], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [string_agg, count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [string_agg(distinct t.y, ',':Varchar order_by(t.y ASC)), count(distinct t.x)], noop_update_hint: true } + └─StreamProject { exprs: [string_agg(distinct t.y, ',':Varchar order_by(t.y ASC)), count(distinct t.x)] } └─StreamSimpleAgg { aggs: [string_agg(distinct t.y, ',':Varchar order_by(t.y ASC)), count(distinct t.x), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t.y, ',':Varchar, t.x, t._row_id] } @@ -1006,7 +1006,7 @@ └─LogicalScan { table: t, columns: [t.a, t.b] } stream_plan: |- StreamMaterialize { columns: [s1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum($expr1) filter((t.b < 100:Int32) AND ((t.b * 2:Int32) > 10:Int32)))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum($expr1) filter((t.b < 100:Int32) AND ((t.b * 2:Int32) > 10:Int32)))] } └─StreamSimpleAgg { aggs: [sum(sum($expr1) filter((t.b < 100:Int32) AND ((t.b * 2:Int32) > 10:Int32))), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1) filter((t.b < 100:Int32) AND ((t.b * 2:Int32) > 10:Int32))] } @@ -1313,7 +1313,7 @@ stream_plan: |- StreamMaterialize { columns: [stddev_samp, stddev_pop], stream_key: [], pk_columns: [], pk_conflict: NoCheck } └─StreamProject { exprs: [Case((sum0(count(t.v1)) <= 1:Int64), null:Decimal, Sqrt(((sum(sum($expr1))::Decimal - (($expr2 * $expr2) / $expr3)) / (sum0(count(t.v1)) - 1:Int64)::Decimal))) as $expr4, Sqrt(((sum(sum($expr1))::Decimal - (($expr2 * $expr2) / $expr3)) / $expr3)) as $expr5] } - └─StreamProject { exprs: [sum(sum($expr1)), sum(sum(t.v1)), sum0(count(t.v1)), sum(sum(t.v1))::Decimal as $expr2, sum0(count(t.v1))::Decimal as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum($expr1)), sum(sum(t.v1)), sum0(count(t.v1)), sum(sum(t.v1))::Decimal as $expr2, sum0(count(t.v1))::Decimal as $expr3] } └─StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum(t.v1)), sum0(count(t.v1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1), sum(t.v1), count(t.v1)] } @@ -1370,7 +1370,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [min, sum], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [min(min(t.v1)), sum(sum(t.v2))], noop_update_hint: true } + └─StreamProject { exprs: [min(min(t.v1)), sum(sum(t.v2))] } └─StreamSimpleAgg { aggs: [min(min(t.v1)), sum(sum(t.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [min(t.v1), sum(t.v2), count] } @@ -1388,7 +1388,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [min, sum], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [min(t.v1), sum(t.v2)], noop_update_hint: true } + └─StreamProject { exprs: [min(t.v1), sum(t.v2)] } └─StreamSimpleAgg { aggs: [min(t.v1), sum(t.v2), count] } └─StreamExchange { dist: Single } └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } @@ -1677,7 +1677,7 @@ └─BatchScan { table: t, columns: [t.x, t.y], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [first_value], stream_key: [], 
pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [first_value(t.x order_by(t.y ASC))], noop_update_hint: true } + └─StreamProject { exprs: [first_value(t.x order_by(t.y ASC))] } └─StreamSimpleAgg { aggs: [first_value(t.x order_by(t.y ASC)), count] } └─StreamExchange { dist: Single } └─StreamTableScan { table: t, columns: [t.x, t.y, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } @@ -1685,7 +1685,7 @@ Fragment 0 StreamMaterialize { columns: [first_value], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [first_value(t.x order_by(t.y ASC))], noop_update_hint: true } + └── StreamProject { exprs: [first_value(t.x order_by(t.y ASC))] } └── StreamSimpleAgg { aggs: [first_value(t.x order_by(t.y ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 @@ -1717,7 +1717,7 @@ Fragment 0 StreamMaterialize { columns: [first_value], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [first_value(distinct t.x order_by(t.x ASC))], noop_update_hint: true } + └── StreamProject { exprs: [first_value(distinct t.x order_by(t.x ASC))] } └── StreamSimpleAgg { aggs: [first_value(distinct t.x order_by(t.x ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0, SimpleAggDedupForCol0: 2 ] └── StreamExchange Single from 1 @@ -1753,7 +1753,7 @@ └─BatchScan { table: t, columns: [t.x, t.y], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [last_value], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [last_value(t.x order_by(t.y DESC NULLS LAST))], noop_update_hint: true } + └─StreamProject { exprs: [last_value(t.x order_by(t.y DESC NULLS LAST))] } └─StreamSimpleAgg { aggs: [last_value(t.x order_by(t.y DESC NULLS LAST)), count] } └─StreamExchange { dist: Single } └─StreamTableScan { table: t, columns: [t.x, t.y, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } @@ -1874,7 +1874,7 @@ └─LogicalScan { table: t, columns: [t.v1, t._row_id] } stream_plan: |- StreamMaterialize { columns: [x, y, z, w], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1)), sum0(count(t.v1)), sum(sum(t.v1)), sum0(count(t.v1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1)), sum0(count(t.v1)), sum(sum(t.v1)), sum0(count(t.v1))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count(t.v1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1), count(t.v1)] } @@ -1895,12 +1895,11 @@ └─BatchScan { table: t, columns: [t.v1], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [approx_percentile], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [approx_percentile], noop_update_hint: true } - └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } - └─StreamExchange { dist: Single } - └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - └─StreamProject { exprs: [t.v1::Float64 as $expr1, t._row_id] } - └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + 
└─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + └─StreamExchange { dist: Single } + └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + └─StreamProject { exprs: [t.v1::Float64 as $expr1, t._row_id] } + └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - name: test simple approx_percentile with other simple aggs sql: | CREATE TABLE t (v1 int); @@ -1917,20 +1916,19 @@ └─BatchScan { table: t, columns: [t.v1], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [approx_percentile, sum], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [approx_percentile, sum(sum(t.v1))], noop_update_hint: true } - └─StreamRowMerge { output: [approx_percentile:Float64, sum(sum(t.v1)):Int64] } - ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ └─StreamExchange { dist: Single } - │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ └─StreamShare { id: 2 } - │ └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v1, t._row_id] } - │ └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count] } - └─StreamExchange { dist: Single } - └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } - └─StreamShare { id: 2 } - └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v1, t._row_id] } - └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamRowMerge { output: [approx_percentile:Float64, sum(sum(t.v1)):Int64] } + ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ └─StreamExchange { dist: Single } + │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ └─StreamShare { id: 2 } + │ └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v1, t._row_id] } + │ └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count], must_output_per_barrier: true } + └─StreamExchange { dist: Single } + └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } + └─StreamShare { id: 2 } + └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v1, t._row_id] } + └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - name: test simple approx_percentile with other simple aggs (sum, count) sql: | CREATE TABLE t (v1 int); @@ -1948,7 +1946,7 @@ └─BatchScan { table: t, columns: [t.v1], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [s1, approx_percentile, s2, count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum(sum(t.v1)), sum0(count(t.v1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum(sum(t.v1)), sum0(count(t.v1))] } └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, 
approx_percentile:Float64, sum0(count(t.v1)):Int64] } ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } │ └─StreamExchange { dist: Single } @@ -1956,7 +1954,7 @@ │ └─StreamShare { id: 2 } │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t._row_id] } │ └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count(t.v1)), count] } + └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count(t.v1)), count], must_output_per_barrier: true } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1), count(t.v1)] } └─StreamShare { id: 2 } @@ -1973,7 +1971,7 @@ └─LogicalScan { table: t, columns: [t.v1, t._row_id] } stream_plan: |- StreamMaterialize { columns: [x, y], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [approx_percentile, approx_percentile], noop_update_hint: true } + └─StreamProject { exprs: [approx_percentile, approx_percentile] } └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } └─StreamExchange { dist: Single } └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } @@ -1995,20 +1993,19 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [x, y], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [approx_percentile, approx_percentile], noop_update_hint: true } - └─StreamRowMerge { output: [approx_percentile:Float64, approx_percentile:Float64] } - ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ └─StreamExchange { dist: Single } - │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ └─StreamShare { id: 2 } - │ └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v2::Float64 as $expr2, t._row_id] } - │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } - └─StreamExchange { dist: Single } - └─StreamLocalApproxPercentile { percentile_col: $expr2, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - └─StreamShare { id: 2 } - └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v2::Float64 as $expr2, t._row_id] } - └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamRowMerge { output: [approx_percentile:Float64, approx_percentile:Float64] } + ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ └─StreamExchange { dist: Single } + │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ └─StreamShare { id: 2 } + │ └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v2::Float64 as $expr2, t._row_id] } + │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + └─StreamExchange { 
dist: Single } + └─StreamLocalApproxPercentile { percentile_col: $expr2, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + └─StreamShare { id: 2 } + └─StreamProject { exprs: [t.v1::Float64 as $expr1, t.v2::Float64 as $expr2, t._row_id] } + └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - name: test simple approx_percentile with different approx_percentile interleaved with stateless simple aggs sql: | CREATE TABLE t (v1 int, v2 int); @@ -2026,7 +2023,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [s1, x, count, y], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum0(count), (sum(sum(t.v2))::Float64 + approx_percentile) as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum0(count), (sum(sum(t.v2))::Float64 + approx_percentile) as $expr3] } └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, approx_percentile:Float64, sum0(count):Int64, sum(sum(t.v2)):Int64, approx_percentile:Float64] } ├─StreamRowMerge { output: [approx_percentile:Float64, approx_percentile:Float64] } │ ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } @@ -2041,7 +2038,7 @@ │ └─StreamShare { id: 2 } │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t.v2, t.v2::Float64 as $expr2, t._row_id] } │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count), sum(sum(t.v2)), count] } + └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count), sum(sum(t.v2)), count], must_output_per_barrier: true } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1), count, sum(t.v2)] } └─StreamShare { id: 2 } @@ -2064,7 +2061,7 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [s1, x, count, y], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum0(count), (sum(sum(t.v2))::Float64 + approx_percentile) as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum0(count), (sum(sum(t.v2))::Float64 + approx_percentile) as $expr3] } └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, approx_percentile:Float64, sum0(count):Int64, sum(sum(t.v2)):Int64, approx_percentile:Float64] } ├─StreamRowMerge { output: [approx_percentile:Float64, approx_percentile:Float64] } │ ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } @@ -2079,7 +2076,7 @@ │ └─StreamShare { id: 2 } │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t.v2, t.v2::Float64 as $expr2, t._row_id] } │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count), sum(sum(t.v2)), count] } + └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count), sum(sum(t.v2)), count], must_output_per_barrier: true } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v1), count, sum(t.v2)] } └─StreamShare { id: 2 } @@ -2101,20 
+2098,19 @@ └─BatchScan { table: t, columns: [t.v1], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [s1, approx_percentile], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile], noop_update_hint: true } - └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, approx_percentile:Float64] } - ├─StreamGlobalApproxPercentile { quantile: 0.8:Float64, relative_error: 0.01:Float64 } - │ └─StreamExchange { dist: Single } - │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.8:Float64, relative_error: 0.01:Float64 } - │ └─StreamShare { id: 2 } - │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t._row_id] } - │ └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count] } - └─StreamExchange { dist: Single } - └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } - └─StreamShare { id: 2 } - └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t._row_id] } - └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, approx_percentile:Float64] } + ├─StreamGlobalApproxPercentile { quantile: 0.8:Float64, relative_error: 0.01:Float64 } + │ └─StreamExchange { dist: Single } + │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.8:Float64, relative_error: 0.01:Float64 } + │ └─StreamShare { id: 2 } + │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t._row_id] } + │ └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), count], must_output_per_barrier: true } + └─StreamExchange { dist: Single } + └─StreamStatelessSimpleAgg { aggs: [sum(t.v1)] } + └─StreamShare { id: 2 } + └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t._row_id] } + └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - name: test simple approx_percentile with different approx_percentile interleaved with stateless + stateful simple aggs sql: | CREATE TABLE t (v1 int, v2 int); @@ -2131,26 +2127,25 @@ └─BatchScan { table: t, columns: [t.v1, t.v2], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [s1, x, count, m2, y], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v1)), approx_percentile, sum0(count), max(max(t.v2)), approx_percentile], noop_update_hint: true } - └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, approx_percentile:Float64, sum0(count):Int64, max(max(t.v2)):Int32, approx_percentile:Float64] } - ├─StreamRowMerge { output: [approx_percentile:Float64, approx_percentile:Float64] } - │ ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ │ └─StreamExchange { dist: Single } - │ │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ │ └─StreamShare { id: 2 } - │ │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t.v2, t.v2::Float64 as $expr2, t._row_id] } - │ │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, 
t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - │ └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ └─StreamExchange { dist: Single } - │ └─StreamLocalApproxPercentile { percentile_col: $expr2, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - │ └─StreamShare { id: 2 } - │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t.v2, t.v2::Float64 as $expr2, t._row_id] } - │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count), max(max(t.v2)), count] } - └─StreamExchange { dist: Single } - └─StreamHashAgg { group_key: [$expr5], aggs: [sum(t.v1), count, max(t.v2)] } - └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr3, t.v2, t.v2::Float64 as $expr4, t._row_id, Vnode(t._row_id) as $expr5] } - └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamRowMerge { output: [sum(sum(t.v1)):Int64, approx_percentile:Float64, sum0(count):Int64, max(max(t.v2)):Int32, approx_percentile:Float64] } + ├─StreamRowMerge { output: [approx_percentile:Float64, approx_percentile:Float64] } + │ ├─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ │ └─StreamExchange { dist: Single } + │ │ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ │ └─StreamShare { id: 2 } + │ │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t.v2, t.v2::Float64 as $expr2, t._row_id] } + │ │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + │ └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ └─StreamExchange { dist: Single } + │ └─StreamLocalApproxPercentile { percentile_col: $expr2, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + │ └─StreamShare { id: 2 } + │ └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr1, t.v2, t.v2::Float64 as $expr2, t._row_id] } + │ └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamSimpleAgg { aggs: [sum(sum(t.v1)), sum0(count), max(max(t.v2)), count], must_output_per_barrier: true } + └─StreamExchange { dist: Single } + └─StreamHashAgg { group_key: [$expr5], aggs: [sum(t.v1), count, max(t.v2)] } + └─StreamProject { exprs: [t.v1, t.v1::Float64 as $expr3, t.v2, t.v2::Float64 as $expr4, t._row_id, Vnode(t._row_id) as $expr5] } + └─StreamTableScan { table: t, columns: [t.v1, t.v2, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - name: test hash approx_percentile sql: | CREATE TABLE t (v1 int, v2 int); @@ -2198,9 +2193,8 @@ └─BatchScan { table: t, columns: [t.v1], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [approx_percentile], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [approx_percentile], noop_update_hint: true } - └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } 
- └─StreamExchange { dist: Single } - └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } - └─StreamProject { exprs: [t.v1::Float64 as $expr1, t._row_id] } - └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamGlobalApproxPercentile { quantile: 0.5:Float64, relative_error: 0.01:Float64 } + └─StreamExchange { dist: Single } + └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } + └─StreamProject { exprs: [t.v1::Float64 as $expr1, t._row_id] } + └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } diff --git a/src/frontend/planner_test/tests/testdata/output/append_only.yaml b/src/frontend/planner_test/tests/testdata/output/append_only.yaml index e76813e05f759..d0701675c3617 100644 --- a/src/frontend/planner_test/tests/testdata/output/append_only.yaml +++ b/src/frontend/planner_test/tests/testdata/output/append_only.yaml @@ -33,7 +33,7 @@ select max(v1) as max_v1 from t1; stream_plan: |- StreamMaterialize { columns: [max_v1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(max(t1.v1))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t1.v1))] } └─StreamSimpleAgg [append_only] { aggs: [max(max(t1.v1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [max(t1.v1)] } diff --git a/src/frontend/planner_test/tests/testdata/output/bushy_join.yaml b/src/frontend/planner_test/tests/testdata/output/bushy_join.yaml index a785ac443901a..9d042f1e60c8b 100644 --- a/src/frontend/planner_test/tests/testdata/output/bushy_join.yaml +++ b/src/frontend/planner_test/tests/testdata/output/bushy_join.yaml @@ -8,7 +8,7 @@ sql: select count(*) from t t1 join t t2 on t1.id = t2.id join t t3 on t1.id = t3.id join t t4 on t1.id = t4.id join t t5 on t1.id = t5.id join t t6 on t1.id = t6.id join t t7 on t1.id = t7.id join t t8 on t1.id = t8.id; stream_plan: |- StreamMaterialize { columns: [count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } diff --git a/src/frontend/planner_test/tests/testdata/output/ch_benchmark.yaml b/src/frontend/planner_test/tests/testdata/output/ch_benchmark.yaml index 3b96806aabc7a..ce98b8bea75c9 100644 --- a/src/frontend/planner_test/tests/testdata/output/ch_benchmark.yaml +++ b/src/frontend/planner_test/tests/testdata/output/ch_benchmark.yaml @@ -869,7 +869,7 @@ └─BatchScan { table: order_line, columns: [order_line.ol_amount, order_line.ol_delivery_d, order_line.ol_quantity], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(order_line.ol_amount))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(order_line.ol_amount))] } └─StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(order_line.ol_amount)] } @@ -880,7 +880,7 @@ Fragment 0 StreamMaterialize { columns: [revenue], stream_key: [], 
pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum(sum(order_line.ol_amount))], noop_update_hint: true } + └── StreamProject { exprs: [sum(sum(order_line.ol_amount))] } └── StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -1940,7 +1940,7 @@ │ └─StreamProject { exprs: [stock.s_i_id, stock.s_order_cnt, ((stock.s_w_id * stock.s_i_id) % 10000:Int32)::Int64 as $expr1, stock.s_w_id] } │ └─StreamTableScan { table: stock, columns: [stock.s_i_id, stock.s_w_id, stock.s_order_cnt], stream_scan_type: ArrangementBackfill, stream_key: [stock.s_w_id, stock.s_i_id], pk: [s_w_id, s_i_id], dist: UpstreamHashShard(stock.s_w_id, stock.s_i_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum(sum(stock.s_order_cnt))::Decimal * 0.005:Decimal) as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [(sum(sum(stock.s_order_cnt))::Decimal * 0.005:Decimal) as $expr3] } └─StreamSimpleAgg { aggs: [sum(sum(stock.s_order_cnt)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(stock.s_order_cnt)] } @@ -2008,7 +2008,7 @@ └── BatchPlanNode Fragment 7 - StreamProject { exprs: [(sum(sum(stock.s_order_cnt))::Decimal * 0.005:Decimal) as $expr3], noop_update_hint: true } + StreamProject { exprs: [(sum(sum(stock.s_order_cnt))::Decimal * 0.005:Decimal) as $expr3] } └── StreamSimpleAgg { aggs: [sum(sum(stock.s_order_cnt)), count] } { tables: [ SimpleAggState: 14 ] } └── StreamExchange Single from 8 @@ -2265,7 +2265,7 @@ └─BatchScan { table: order_line, columns: [order_line.ol_i_id, order_line.ol_amount, order_line.ol_delivery_d], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [promo_revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / (1:Decimal + sum(sum(order_line.ol_amount)))) as $expr2], noop_update_hint: true } + └─StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / (1:Decimal + sum(sum(order_line.ol_amount)))) as $expr2] } └─StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum(order_line.ol_amount)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1), sum(order_line.ol_amount)] } @@ -2279,9 +2279,11 @@ └─StreamTableScan { table: item, columns: [item.i_id, item.i_data], stream_scan_type: ArrangementBackfill, stream_key: [item.i_id], pk: [i_id], dist: UpstreamHashShard(item.i_id) } stream_dist_plan: |+ Fragment 0 - StreamMaterialize { columns: [promo_revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } { tables: [ Materialize: 4294967294 ] } - └── StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / (1:Decimal + sum(sum(order_line.ol_amount)))) as $expr2], noop_update_hint: true } - └── StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum(order_line.ol_amount)), count] } { tables: [ SimpleAggState: 0 ] } + StreamMaterialize { columns: [promo_revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } + ├── tables: [ Materialize: 4294967294 ] + └── StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / (1:Decimal + sum(sum(order_line.ol_amount)))) as $expr2] } + └── StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum(order_line.ol_amount)), count] } + ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 Fragment 1 @@ -2356,7 +2358,7 @@ │ └─StreamProject { exprs: [revenue1.total_revenue, revenue1.supplier_no::Int64 as 
$expr1, revenue1.supplier_no] } │ └─StreamTableScan { table: revenue1, columns: [revenue1.supplier_no, revenue1.total_revenue], stream_scan_type: ArrangementBackfill, stream_key: [revenue1.supplier_no], pk: [supplier_no], dist: UpstreamHashShard(revenue1.supplier_no) } └─StreamExchange { dist: HashShard(max(max(revenue1.total_revenue))) } - └─StreamProject { exprs: [max(max(revenue1.total_revenue))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(revenue1.total_revenue))] } └─StreamSimpleAgg { aggs: [max(max(revenue1.total_revenue)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(revenue1.total_revenue), count] } @@ -2394,7 +2396,7 @@ └── BatchPlanNode Fragment 5 - StreamProject { exprs: [max(max(revenue1.total_revenue))], noop_update_hint: true } + StreamProject { exprs: [max(max(revenue1.total_revenue))] } └── StreamSimpleAgg { aggs: [max(max(revenue1.total_revenue)), count] } { tables: [ SimpleAggState: 11, SimpleAggCall0: 10 ] } └── StreamExchange Single from 6 @@ -2626,7 +2628,7 @@ └─BatchScan { table: order_line, columns: [order_line.ol_i_id, order_line.ol_quantity], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [avg_yearly], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [(sum(sum(order_line.ol_amount)) / 2.0:Decimal) as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [(sum(sum(order_line.ol_amount)) / 2.0:Decimal) as $expr3] } └─StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(order_line.ol_amount)] } @@ -2649,8 +2651,9 @@ Fragment 0 StreamMaterialize { columns: [avg_yearly], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [(sum(sum(order_line.ol_amount)) / 2.0:Decimal) as $expr3], noop_update_hint: true } - └── StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } { tables: [ SimpleAggState: 0 ] } + └── StreamProject { exprs: [(sum(sum(order_line.ol_amount)) / 2.0:Decimal) as $expr3] } + └── StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } + ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 Fragment 1 @@ -2858,7 +2861,7 @@ └─BatchScan { table: order_line, columns: [order_line.ol_w_id, order_line.ol_i_id, order_line.ol_amount, order_line.ol_quantity], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(order_line.ol_amount))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(order_line.ol_amount))] } └─StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(order_line.ol_amount)] } @@ -2877,7 +2880,7 @@ Fragment 0 StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum(sum(order_line.ol_amount))], noop_update_hint: true } + └── StreamProject { exprs: [sum(sum(order_line.ol_amount))] } └── StreamSimpleAgg { aggs: [sum(sum(order_line.ol_amount)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -3407,7 +3410,7 @@ │ └─StreamProject { exprs: [orders.o_c_id, orders.o_w_id, orders.o_d_id, orders.o_id] } │ └─StreamTableScan { table: orders, columns: [orders.o_d_id, orders.o_w_id, orders.o_c_id, 
orders.o_id], stream_scan_type: ArrangementBackfill, stream_key: [orders.o_w_id, orders.o_d_id, orders.o_id], pk: [o_w_id, o_d_id, o_id], dist: UpstreamHashShard(orders.o_w_id, orders.o_d_id, orders.o_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum(sum(customer.c_balance)) / sum0(count(customer.c_balance))::Decimal) as $expr1], noop_update_hint: true } + └─StreamProject { exprs: [(sum(sum(customer.c_balance)) / sum0(count(customer.c_balance))::Decimal) as $expr1] } └─StreamSimpleAgg { aggs: [sum(sum(customer.c_balance)), sum0(count(customer.c_balance)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(customer.c_balance), count(customer.c_balance)] } diff --git a/src/frontend/planner_test/tests/testdata/output/cse_expr.yaml b/src/frontend/planner_test/tests/testdata/output/cse_expr.yaml index ce97db3c0d33e..abbc0aae184e0 100644 --- a/src/frontend/planner_test/tests/testdata/output/cse_expr.yaml +++ b/src/frontend/planner_test/tests/testdata/output/cse_expr.yaml @@ -74,7 +74,7 @@ └─StreamProject { exprs: [Sqrt($expr5) as $expr6, Case((sum0(count(t.v)) <= 1:Int64), null:Decimal, Sqrt(($expr4 / (sum0(count(t.v)) - 1:Int64)::Decimal))) as $expr7, $expr5, Case((sum0(count(t.v)) <= 1:Int64), null:Decimal, ($expr4 / (sum0(count(t.v)) - 1:Int64)::Decimal)) as $expr8] } └─StreamProject { exprs: [sum(sum($expr1)), sum(sum(t.v)), sum0(count(t.v)), ($expr4 / $expr3) as $expr5, $expr4] } └─StreamProject { exprs: [sum(sum($expr1)), sum(sum(t.v)), sum0(count(t.v)), (sum(sum($expr1))::Decimal - (($expr2 * $expr2) / $expr3)) as $expr4, $expr3] } - └─StreamProject { exprs: [sum(sum($expr1)), sum(sum(t.v)), sum0(count(t.v)), sum(sum(t.v))::Decimal as $expr2, sum0(count(t.v))::Decimal as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum($expr1)), sum(sum(t.v)), sum0(count(t.v)), sum(sum(t.v))::Decimal as $expr2, sum0(count(t.v))::Decimal as $expr3] } └─StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum(t.v)), sum0(count(t.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1), sum(t.v), count(t.v)] } diff --git a/src/frontend/planner_test/tests/testdata/output/dynamic_filter.yaml b/src/frontend/planner_test/tests/testdata/output/dynamic_filter.yaml index 922723851944b..89aea24f2bd80 100644 --- a/src/frontend/planner_test/tests/testdata/output/dynamic_filter.yaml +++ b/src/frontend/planner_test/tests/testdata/output/dynamic_filter.yaml @@ -18,7 +18,7 @@ └─StreamDynamicFilter { predicate: (t1.v1 > max(max(t2.v2))), output: [t1.v1, t1._row_id] } ├─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t2.v2))] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -77,7 +77,7 @@ ├─StreamProject { exprs: [t1.v1, (t1.v1 + t1.v1) as $expr1, t1._row_id] } │ └─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t2.v2))] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), 
count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -129,7 +129,7 @@ ├─StreamExchange { dist: HashShard(t1.v1) } │ └─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } └─StreamExchange { dist: HashShard(max(max(t2.v2))) } - └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t2.v2))] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -153,7 +153,7 @@ ├─StreamProject { exprs: [t1.v1, t1.v1::Int64 as $expr1, t1._row_id] } │ └─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t2.v2))] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -169,7 +169,7 @@ └─StreamDynamicFilter { predicate: (t1.v1 > max(max(t2.v2))), output: [t1.v1, t1._row_id] } ├─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t2.v2))] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -191,7 +191,7 @@ └─StreamDynamicFilter { predicate: (t1.v1 > $expr1), output: [t1.v1, t1._row_id] } ├─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(2:Int32 * max(max(t2.v2))) as $expr1], noop_update_hint: true } + └─StreamProject { exprs: [(2:Int32 * max(max(t2.v2))) as $expr1] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -220,7 +220,7 @@ │ ├─StreamTableScan { table: t1, columns: [t1.v1, t1._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t1._row_id], pk: [_row_id], dist: UpstreamHashShard(t1._row_id) } │ └─StreamExchange { dist: Broadcast } │ └─StreamShare { id: 6 } - │ └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + │ └─StreamProject { exprs: [max(max(t2.v2))] } │ └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } │ └─StreamExchange { dist: Single } │ └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } @@ -229,7 +229,7 @@ └─StreamExchange { dist: Broadcast } └─StreamProject { exprs: [(max(max(t2.v2)) + 5:Int32) as $expr1] } └─StreamShare { id: 6 } - └─StreamProject { exprs: [max(max(t2.v2))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t2.v2))] } └─StreamSimpleAgg { aggs: [max(max(t2.v2)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t2.v2), count] } diff --git a/src/frontend/planner_test/tests/testdata/output/lateral_subquery.yaml 
b/src/frontend/planner_test/tests/testdata/output/lateral_subquery.yaml index e7a1951ffde54..815890d6a73b8 100644 --- a/src/frontend/planner_test/tests/testdata/output/lateral_subquery.yaml +++ b/src/frontend/planner_test/tests/testdata/output/lateral_subquery.yaml @@ -180,7 +180,7 @@ where path_val = t1.id; stream_plan: |- StreamMaterialize { columns: [array_agg], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [array_agg(t1.n order_by($expr1 ASC))], noop_update_hint: true } + └─StreamProject { exprs: [array_agg(t1.n order_by($expr1 ASC))] } └─StreamSimpleAgg { aggs: [array_agg(t1.n order_by($expr1 ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t1.n, (projected_row_id + 1:Int64) as $expr1, t1._row_id, t2.p, t2.p, t2.d, t2.d, projected_row_id, t1.id, t2._row_id] } diff --git a/src/frontend/planner_test/tests/testdata/output/limit.yaml b/src/frontend/planner_test/tests/testdata/output/limit.yaml index 500dbe1dd5824..22fb2add9d30c 100644 --- a/src/frontend/planner_test/tests/testdata/output/limit.yaml +++ b/src/frontend/planner_test/tests/testdata/output/limit.yaml @@ -131,7 +131,7 @@ stream_plan: |- StreamMaterialize { columns: [count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } └─StreamTopN { order: [sum0(count) ASC], limit: 1, offset: 0 } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } @@ -154,7 +154,7 @@ stream_plan: |- StreamMaterialize { columns: [count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } └─StreamTopN { order: [sum0(count) ASC], limit: 1, offset: 0 } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } diff --git a/src/frontend/planner_test/tests/testdata/output/mv_column_name.yaml b/src/frontend/planner_test/tests/testdata/output/mv_column_name.yaml index 91352992bb17a..3db4034336315 100644 --- a/src/frontend/planner_test/tests/testdata/output/mv_column_name.yaml +++ b/src/frontend/planner_test/tests/testdata/output/mv_column_name.yaml @@ -63,7 +63,7 @@ select count(*), max(a) from t; stream_plan: |- StreamMaterialize { columns: [count, max], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count), max(max(t.a))], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count), max(max(t.a))] } └─StreamSimpleAgg { aggs: [sum0(count), max(max(t.a)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [count, max(t.a)] } diff --git a/src/frontend/planner_test/tests/testdata/output/nexmark.yaml b/src/frontend/planner_test/tests/testdata/output/nexmark.yaml index 1ea7349b24769..d6b90da0a8c1a 100644 --- a/src/frontend/planner_test/tests/testdata/output/nexmark.yaml +++ b/src/frontend/planner_test/tests/testdata/output/nexmark.yaml @@ -1891,7 +1891,7 @@ │ └─StreamExchange { dist: HashShard(bid.auction) } │ └─StreamTableScan { table: bid, columns: [bid.auction, bid._row_id], stream_scan_type: ArrangementBackfill, stream_key: [bid._row_id], pk: [_row_id], dist: UpstreamHashShard(bid._row_id) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(bid.auction))) as $expr1], noop_update_hint: true } + 
└─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(bid.auction))) as $expr1] } └─StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count(bid.auction)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum0(count), count(bid.auction)] } @@ -1926,7 +1926,7 @@ └── BatchPlanNode Fragment 3 - StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(bid.auction))) as $expr1], noop_update_hint: true } + StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(bid.auction))) as $expr1] } └── StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count(bid.auction)), count] } { tables: [ SimpleAggState: 9 ] } └── StreamExchange Single from 4 @@ -2331,7 +2331,7 @@ └─BatchScan { table: bid, columns: [bid.auction, bid.price, bid.date_time], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [min(min(max(bid.price)))], noop_update_hint: true } + └─StreamProject { exprs: [min(min(max(bid.price)))] } └─StreamSimpleAgg { aggs: [min(min(max(bid.price))), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [$expr1], aggs: [min(max(bid.price)), count] } @@ -2348,7 +2348,7 @@ Fragment 0 StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [min(min(max(bid.price)))], noop_update_hint: true } + └── StreamProject { exprs: [min(min(max(bid.price)))] } └── StreamSimpleAgg { aggs: [min(min(max(bid.price))), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 diff --git a/src/frontend/planner_test/tests/testdata/output/nexmark_source.yaml b/src/frontend/planner_test/tests/testdata/output/nexmark_source.yaml index 15e1647721d53..35713c9682a35 100644 --- a/src/frontend/planner_test/tests/testdata/output/nexmark_source.yaml +++ b/src/frontend/planner_test/tests/testdata/output/nexmark_source.yaml @@ -1878,7 +1878,7 @@ │ └─StreamRowIdGen { row_id_index: 7 } │ └─StreamSource { source: bid, columns: [auction, bidder, price, channel, url, date_time, extra, _row_id] } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(auction))) as $expr1], noop_update_hint: true } + └─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(auction))) as $expr1] } └─StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count(auction)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum0(count), count(auction)] } @@ -1915,7 +1915,7 @@ └── StreamSource { source: bid, columns: [auction, bidder, price, channel, url, date_time, extra, _row_id] } { tables: [ Source: 8 ] } Fragment 4 - StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(auction))) as $expr1], noop_update_hint: true } + StreamProject { exprs: [(sum0(sum0(count)) / sum0(count(auction))) as $expr1] } └── StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count(auction)), count] } { tables: [ SimpleAggState: 9 ] } └── StreamExchange Single from 5 @@ -2277,7 +2277,7 @@ └─BatchSource { source: bid, columns: [auction, bidder, price, channel, url, date_time, extra, _row_id] } stream_plan: |- StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [min(min(max(price)))], noop_update_hint: true } + └─StreamProject { exprs: [min(min(max(price)))] } └─StreamSimpleAgg { aggs: [min(min(max(price))), count] } └─StreamExchange { dist: Single } 
└─StreamHashAgg { group_key: [$expr1], aggs: [min(max(price)), count] } @@ -2296,7 +2296,7 @@ Fragment 0 StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [min(min(max(price)))], noop_update_hint: true } + └── StreamProject { exprs: [min(min(max(price)))] } └── StreamSimpleAgg { aggs: [min(min(max(price))), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 diff --git a/src/frontend/planner_test/tests/testdata/output/nexmark_temporal_filter.yaml b/src/frontend/planner_test/tests/testdata/output/nexmark_temporal_filter.yaml index 0658030573dd1..d5d948e5b507c 100644 --- a/src/frontend/planner_test/tests/testdata/output/nexmark_temporal_filter.yaml +++ b/src/frontend/planner_test/tests/testdata/output/nexmark_temporal_filter.yaml @@ -1517,7 +1517,7 @@ │ └─StreamProject { exprs: [SubtractWithTimeZone(now, '00:05:00':Interval, 'UTC':Varchar) as $expr4], output_watermarks: [$expr4] } │ └─StreamNow { output: [now] } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr5))) as $expr6], noop_update_hint: true } + └─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr5))) as $expr6] } └─StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count($expr5)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum0(count), count($expr5)] } @@ -1578,7 +1578,7 @@ └── StreamNow { output: [now] } { tables: [ Now: 10 ] } Fragment 6 - StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr5))) as $expr6], noop_update_hint: true } + StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr5))) as $expr6] } └── StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count($expr5)), count] } { tables: [ SimpleAggState: 11 ] } └── StreamExchange Single from 7 @@ -2000,7 +2000,7 @@ ) stream_plan: |- StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [min(min(max($expr7)))], noop_update_hint: true } + └─StreamProject { exprs: [min(min(max($expr7)))] } └─StreamSimpleAgg { aggs: [min(min(max($expr7))), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [$expr9], aggs: [min(max($expr7)), count] } @@ -2035,7 +2035,7 @@ Fragment 0 StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [min(min(max($expr7)))], noop_update_hint: true } + └── StreamProject { exprs: [min(min(max($expr7)))] } └── StreamSimpleAgg { aggs: [min(min(max($expr7))), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 diff --git a/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml b/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml index c577b72eaafd6..f065ba33c252d 100644 --- a/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml +++ b/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml @@ -2059,7 +2059,7 @@ │ └─StreamProject { exprs: [event_type, person, auction, bid, Case((event_type = 0:Int32), Field(person, 6:Int32), (event_type = 1:Int32), Field(auction, 5:Int32), Field(bid, 5:Int32)) as $expr1, _row_id] } │ └─StreamSource { source: nexmark, columns: [event_type, person, auction, bid, _row_id] } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum0(sum0(count)) / 
sum0(count($expr4))) as $expr5], noop_update_hint: true } + └─StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr4))) as $expr5] } └─StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count($expr4)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum0(count), count($expr4)] } @@ -2111,7 +2111,7 @@ └── StreamExchange NoShuffle from 2 Fragment 5 - StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr4))) as $expr5], noop_update_hint: true } + StreamProject { exprs: [(sum0(sum0(count)) / sum0(count($expr4))) as $expr5] } └── StreamSimpleAgg { aggs: [sum0(sum0(count)), sum0(count($expr4)), count] } { tables: [ SimpleAggState: 9 ] } └── StreamExchange Single from 6 @@ -2533,7 +2533,7 @@ └─BatchSource { source: nexmark, columns: [event_type, person, auction, bid, _row_id] } stream_plan: |- StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [min(min(max($expr5)))], noop_update_hint: true } + └─StreamProject { exprs: [min(min(max($expr5)))] } └─StreamSimpleAgg { aggs: [min(min(max($expr5))), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [$expr6], aggs: [min(max($expr5)), count] } @@ -2564,7 +2564,7 @@ Fragment 0 StreamMaterialize { columns: [min_final], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [min(min(max($expr5)))], noop_update_hint: true } + └── StreamProject { exprs: [min(min(max($expr5)))] } └── StreamSimpleAgg { aggs: [min(min(max($expr5))), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 diff --git a/src/frontend/planner_test/tests/testdata/output/share.yaml b/src/frontend/planner_test/tests/testdata/output/share.yaml index 7962e4724f347..2cf3aee9fe043 100644 --- a/src/frontend/planner_test/tests/testdata/output/share.yaml +++ b/src/frontend/planner_test/tests/testdata/output/share.yaml @@ -33,7 +33,7 @@ └─BatchSource { source: auction, columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, extra, _row_id] } stream_plan: |- StreamMaterialize { columns: [cnt], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg [append_only] { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } @@ -155,7 +155,7 @@ ├─StreamExchange { dist: HashShard(0:Int32) } │ └─StreamProject { exprs: [sum0(count), 0:Int32] } │ └─StreamShare { id: 5 } - │ └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + │ └─StreamProject { exprs: [sum0(count)] } │ └─StreamSimpleAgg { aggs: [sum0(count), count] } │ └─StreamExchange { dist: Single } │ └─StreamStatelessSimpleAgg { aggs: [count] } @@ -163,7 +163,7 @@ └─StreamExchange { dist: HashShard(1:Int32) } └─StreamProject { exprs: [sum0(count), 1:Int32] } └─StreamShare { id: 5 } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } @@ -176,13 +176,13 @@ StreamMaterialize { columns: [count, $src(hidden)], stream_key: [$src], pk_columns: [$src], pk_conflict: NoCheck } └─StreamUnion { all: true } ├─StreamExchange { dist: HashShard(0:Int32) } - │ └─StreamProject { exprs: [sum0(count), 0:Int32], 
noop_update_hint: true } + │ └─StreamProject { exprs: [sum0(count), 0:Int32] } │ └─StreamSimpleAgg { aggs: [sum0(count), count] } │ └─StreamExchange { dist: Single } │ └─StreamStatelessSimpleAgg { aggs: [count] } │ └─StreamTableScan { table: t, columns: [t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } └─StreamExchange { dist: HashShard(1:Int32) } - └─StreamProject { exprs: [sum0(count), 1:Int32], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count), 1:Int32] } └─StreamSimpleAgg { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } @@ -195,7 +195,7 @@ select count(*) cnt from auction A join auction B on A.id = B.id; stream_plan: |- StreamMaterialize { columns: [cnt], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg [append_only] { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } @@ -216,7 +216,7 @@ with cte as (select a, sum(b) sum from t group by a) select count(*) from cte c1 join cte c2 on c1.a = c2.a; stream_plan: |- StreamMaterialize { columns: [count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } @@ -235,7 +235,7 @@ Fragment 0 StreamMaterialize { columns: [count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └── StreamProject { exprs: [sum0(count)] } └── StreamSimpleAgg { aggs: [sum0(count), count] } { tables: [ SimpleAggState: 0 ] } └── StreamExchange Single from 1 diff --git a/src/frontend/planner_test/tests/testdata/output/stream_dist_agg.yaml b/src/frontend/planner_test/tests/testdata/output/stream_dist_agg.yaml index 0b7d7d7f2f2bf..48caec86bd940 100644 --- a/src/frontend/planner_test/tests/testdata/output/stream_dist_agg.yaml +++ b/src/frontend/planner_test/tests/testdata/output/stream_dist_agg.yaml @@ -17,13 +17,13 @@ └─BatchScan { table: s, columns: [s.v], distribution: Single } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(s.v)], noop_update_hint: true } + └─StreamProject { exprs: [max(s.v)] } └─StreamSimpleAgg { aggs: [max(s.v), count] } └─StreamTableScan { table: s, columns: [s.v, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } stream_dist_plan: |+ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } { tables: [ Materialize: 4294967294 ] } - └── StreamProject { exprs: [max(s.v)], noop_update_hint: true } + └── StreamProject { exprs: [max(s.v)] } └── StreamSimpleAgg { aggs: [max(s.v), count] } { tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] } └── StreamTableScan { table: s, columns: [s.v, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } ├── tables: [ StreamScan: 2 ] @@ -55,13 +55,13 @@ └─BatchScan { table: s, columns: [s.v], distribution: Single } stream_plan: |- StreamMaterialize { columns: [a1], 
stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(s.v)], noop_update_hint: true } + └─StreamProject { exprs: [sum(s.v)] } └─StreamSimpleAgg { aggs: [sum(s.v), count] } └─StreamTableScan { table: s, columns: [s.v, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } stream_dist_plan: |+ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } { tables: [ Materialize: 4294967294 ] } - └── StreamProject { exprs: [sum(s.v)], noop_update_hint: true } + └── StreamProject { exprs: [sum(s.v)] } └── StreamSimpleAgg { aggs: [sum(s.v), count] } { tables: [ SimpleAggState: 0 ] } └── StreamTableScan { table: s, columns: [s.v, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } ├── tables: [ StreamScan: 1 ] @@ -91,13 +91,13 @@ └─BatchScan { table: s, columns: [s.v], distribution: Single } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [count(s.v)], noop_update_hint: true } + └─StreamProject { exprs: [count(s.v)] } └─StreamSimpleAgg { aggs: [count(s.v), count] } └─StreamTableScan { table: s, columns: [s.v, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } stream_dist_plan: |+ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } { tables: [ Materialize: 4294967294 ] } - └── StreamProject { exprs: [count(s.v)], noop_update_hint: true } + └── StreamProject { exprs: [count(s.v)] } └── StreamSimpleAgg { aggs: [count(s.v), count] } { tables: [ SimpleAggState: 0 ] } └── StreamTableScan { table: s, columns: [s.v, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } ├── tables: [ StreamScan: 1 ] @@ -128,14 +128,14 @@ └─BatchScan { table: s, columns: [s.v, s.s], distribution: Single } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [string_agg(s.s, ',':Varchar order_by(s.v ASC))], noop_update_hint: true } + └─StreamProject { exprs: [string_agg(s.s, ',':Varchar order_by(s.v ASC))] } └─StreamSimpleAgg { aggs: [string_agg(s.s, ',':Varchar order_by(s.v ASC)), count] } └─StreamProject { exprs: [s.s, ',':Varchar, s.v, s.t._row_id] } └─StreamTableScan { table: s, columns: [s.v, s.s, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } stream_dist_plan: |+ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } { tables: [ Materialize: 4294967294 ] } - └── StreamProject { exprs: [string_agg(s.s, ',':Varchar order_by(s.v ASC))], noop_update_hint: true } + └── StreamProject { exprs: [string_agg(s.s, ',':Varchar order_by(s.v ASC))] } └── StreamSimpleAgg { aggs: [string_agg(s.s, ',':Varchar order_by(s.v ASC)), count] } { tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] } └── StreamProject { exprs: [s.s, ',':Varchar, s.v, s.t._row_id] } └── StreamTableScan { table: s, columns: [s.v, s.s, s.o, s.t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [s.t._row_id], pk: [o, t._row_id], dist: Single } @@ -169,7 +169,7 @@ └─BatchScan { table: t, columns: [t.v], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], 
pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(max(t.v))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t.v))] } └─StreamSimpleAgg { aggs: [max(max(t.v)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t.v), count] } @@ -179,7 +179,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [max(max(t.v))], noop_update_hint: true } + └── StreamProject { exprs: [max(max(t.v))] } └── StreamSimpleAgg { aggs: [max(max(t.v)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 @@ -223,7 +223,7 @@ select max(v) as a1 from AO; stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(max(ao.v))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(ao.v))] } └─StreamSimpleAgg [append_only] { aggs: [max(max(ao.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [max(ao.v)] } @@ -232,7 +232,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [max(max(ao.v))], noop_update_hint: true } + └── StreamProject { exprs: [max(max(ao.v))] } └── StreamSimpleAgg [append_only] { aggs: [max(max(ao.v)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -268,7 +268,7 @@ └─BatchScan { table: t, columns: [t.v], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(t.v))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(t.v))] } └─StreamSimpleAgg { aggs: [sum(sum(t.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(t.v)] } @@ -277,7 +277,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum(sum(t.v))], noop_update_hint: true } + └── StreamProject { exprs: [sum(sum(t.v))] } └── StreamSimpleAgg { aggs: [sum(sum(t.v)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -308,7 +308,7 @@ select sum(v) as a1 from AO; stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum(ao.v))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum(ao.v))] } └─StreamSimpleAgg [append_only] { aggs: [sum(sum(ao.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(ao.v)] } @@ -317,7 +317,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum(sum(ao.v))], noop_update_hint: true } + └── StreamProject { exprs: [sum(sum(ao.v))] } └── StreamSimpleAgg [append_only] { aggs: [sum(sum(ao.v)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -353,7 +353,7 @@ └─BatchScan { table: t, columns: [t.v], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count(t.v))], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count(t.v))] } 
└─StreamSimpleAgg { aggs: [sum0(count(t.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count(t.v)] } @@ -362,7 +362,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum0(count(t.v))], noop_update_hint: true } + └── StreamProject { exprs: [sum0(count(t.v))] } └── StreamSimpleAgg { aggs: [sum0(count(t.v)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -393,7 +393,7 @@ select count(v) as a1 from AO; stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count(ao.v))], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count(ao.v))] } └─StreamSimpleAgg [append_only] { aggs: [sum0(count(ao.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count(ao.v)] } @@ -402,7 +402,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum0(count(ao.v))], noop_update_hint: true } + └── StreamProject { exprs: [sum0(count(ao.v))] } └── StreamSimpleAgg [append_only] { aggs: [sum0(count(ao.v)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -438,7 +438,7 @@ └─BatchScan { table: t, columns: [t.o, t.s], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [string_agg(t.s, ',':Varchar order_by(t.o ASC))], noop_update_hint: true } + └─StreamProject { exprs: [string_agg(t.s, ',':Varchar order_by(t.o ASC))] } └─StreamSimpleAgg { aggs: [string_agg(t.s, ',':Varchar order_by(t.o ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t.s, ',':Varchar, t.o, t._row_id] } @@ -447,7 +447,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [string_agg(t.s, ',':Varchar order_by(t.o ASC))], noop_update_hint: true } + └── StreamProject { exprs: [string_agg(t.s, ',':Varchar order_by(t.o ASC))] } └── StreamSimpleAgg { aggs: [string_agg(t.s, ',':Varchar order_by(t.o ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 @@ -480,7 +480,7 @@ select string_agg(s, ',' order by o) as a1 from AO; stream_plan: |- StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [string_agg(ao.s, ',':Varchar order_by(ao.o ASC))], noop_update_hint: true } + └─StreamProject { exprs: [string_agg(ao.s, ',':Varchar order_by(ao.o ASC))] } └─StreamSimpleAgg [append_only] { aggs: [string_agg(ao.s, ',':Varchar order_by(ao.o ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [ao.s, ',':Varchar, ao.o, ao._row_id] } @@ -489,7 +489,7 @@ Fragment 0 StreamMaterialize { columns: [a1], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [string_agg(ao.s, ',':Varchar order_by(ao.o ASC))], noop_update_hint: true } + └── StreamProject { exprs: [string_agg(ao.s, ',':Varchar order_by(ao.o ASC))] } └── StreamSimpleAgg [append_only] { aggs: [string_agg(ao.s, ',':Varchar order_by(ao.o ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── 
StreamExchange Single from 1 @@ -527,7 +527,7 @@ └─BatchScan { table: t, columns: [t.v], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(max(t.v)), sum0(count(t.v))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(t.v)), sum0(count(t.v))] } └─StreamSimpleAgg { aggs: [max(max(t.v)), sum0(count(t.v)), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(t.v), count(t.v), count] } @@ -537,7 +537,7 @@ Fragment 0 StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [max(max(t.v)), sum0(count(t.v))], noop_update_hint: true } + └── StreamProject { exprs: [max(max(t.v)), sum0(count(t.v))] } └── StreamSimpleAgg { aggs: [max(max(t.v)), sum0(count(t.v)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall0: 0 ] └── StreamExchange Single from 1 @@ -587,7 +587,7 @@ select max(v) as a1, count(v) as a2 from AO; stream_plan: |- StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(max(ao.v)), sum0(count(ao.v))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(ao.v)), sum0(count(ao.v))] } └─StreamSimpleAgg [append_only] { aggs: [max(max(ao.v)), sum0(count(ao.v)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [max(ao.v), count(ao.v)] } @@ -596,7 +596,7 @@ Fragment 0 StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [max(max(ao.v)), sum0(count(ao.v))], noop_update_hint: true } + └── StreamProject { exprs: [max(max(ao.v)), sum0(count(ao.v))] } └── StreamSimpleAgg [append_only] { aggs: [max(max(ao.v)), sum0(count(ao.v)), count] } ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 @@ -632,7 +632,7 @@ └─BatchScan { table: t, columns: [t.v, t.o, t.s], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [count(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))], noop_update_hint: true } + └─StreamProject { exprs: [count(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))] } └─StreamSimpleAgg { aggs: [count(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t.v, t.s, ',':Varchar, t.o, t._row_id] } @@ -641,7 +641,7 @@ Fragment 0 StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [count(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))], noop_update_hint: true } + └── StreamProject { exprs: [count(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))] } └── StreamSimpleAgg { aggs: [count(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall1: 0 ] └── StreamExchange Single from 1 @@ -679,7 +679,7 @@ select count(v) as a1, string_agg(s, ',' order by o) as a2 from AO; stream_plan: |- StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [count(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))], noop_update_hint: true } + └─StreamProject { exprs: [count(ao.v), string_agg(ao.s, 
',':Varchar order_by(ao.o ASC))] } └─StreamSimpleAgg [append_only] { aggs: [count(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [ao.v, ao.s, ',':Varchar, ao.o, ao._row_id] } @@ -688,7 +688,7 @@ Fragment 0 StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [count(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))], noop_update_hint: true } + └── StreamProject { exprs: [count(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))] } └── StreamSimpleAgg [append_only] { aggs: [count(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall1: 0 ] └── StreamExchange Single from 1 @@ -726,7 +726,7 @@ └─BatchScan { table: t, columns: [t.v, t.o, t.s], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))], noop_update_hint: true } + └─StreamProject { exprs: [max(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))] } └─StreamSimpleAgg { aggs: [max(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [t.v, t.s, ',':Varchar, t.o, t._row_id] } @@ -735,7 +735,7 @@ Fragment 0 StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [max(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))], noop_update_hint: true } + └── StreamProject { exprs: [max(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC))] } └── StreamSimpleAgg { aggs: [max(t.v), string_agg(t.s, ',':Varchar order_by(t.o ASC)), count] } ├── tables: [ SimpleAggState: 2, SimpleAggCall0: 0, SimpleAggCall1: 1 ] └── StreamExchange Single from 1 @@ -770,7 +770,7 @@ select max(v) as a1, string_agg(s, ',' order by o) as a2 from AO; stream_plan: |- StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [max(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))], noop_update_hint: true } + └─StreamProject { exprs: [max(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))] } └─StreamSimpleAgg [append_only] { aggs: [max(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC)), count] } └─StreamExchange { dist: Single } └─StreamProject { exprs: [ao.v, ao.s, ',':Varchar, ao.o, ao._row_id] } @@ -779,7 +779,7 @@ Fragment 0 StreamMaterialize { columns: [a1, a2], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [max(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))], noop_update_hint: true } + └── StreamProject { exprs: [max(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC))] } └── StreamSimpleAgg [append_only] { aggs: [max(ao.v), string_agg(ao.s, ',':Varchar order_by(ao.o ASC)), count] } ├── tables: [ SimpleAggState: 1, SimpleAggCall1: 0 ] └── StreamExchange Single from 1 diff --git a/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml b/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml index 49d14526af640..5cdfdf6cf45ea 100644 --- a/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml +++ b/src/frontend/planner_test/tests/testdata/output/temporal_join.yaml @@ -61,7 +61,7 @@ 
select count(*) from stream left join version FOR SYSTEM_TIME AS OF PROCTIME() on id1 = id2 where a2 < 10; stream_plan: |- StreamMaterialize { columns: [count], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum0(count)], noop_update_hint: true } + └─StreamProject { exprs: [sum0(count)] } └─StreamSimpleAgg [append_only] { aggs: [sum0(count), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [count] } diff --git a/src/frontend/planner_test/tests/testdata/output/tpch.yaml b/src/frontend/planner_test/tests/testdata/output/tpch.yaml index dbb7a5c08a62a..3c43faa8d2494 100644 --- a/src/frontend/planner_test/tests/testdata/output/tpch.yaml +++ b/src/frontend/planner_test/tests/testdata/output/tpch.yaml @@ -1160,7 +1160,7 @@ └─BatchScan { table: lineitem, columns: [lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_quantity, lineitem.l_shipdate], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum($expr1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum($expr1))] } └─StreamSimpleAgg { aggs: [sum(sum($expr1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1)] } @@ -1171,7 +1171,7 @@ Fragment 0 StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum(sum($expr1))], noop_update_hint: true } + └── StreamProject { exprs: [sum(sum($expr1))] } └── StreamSimpleAgg { aggs: [sum(sum($expr1)), count] } { tables: [ SimpleAggState: 0 ] } └── StreamExchange Single from 1 @@ -2389,7 +2389,7 @@ │ └─StreamFilter { predicate: (nation.n_name = 'ARGENTINA':Varchar) } │ └─StreamTableScan { table: nation, columns: [nation.n_nationkey, nation.n_name], stream_scan_type: ArrangementBackfill, stream_key: [nation.n_nationkey], pk: [n_nationkey], dist: UpstreamHashShard(nation.n_nationkey) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum(sum($expr2)) * 0.0001000000:Decimal) as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [(sum(sum($expr2)) * 0.0001000000:Decimal) as $expr3] } └─StreamSimpleAgg { aggs: [sum(sum($expr2)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr2)] } @@ -2461,7 +2461,7 @@ └── BatchPlanNode Fragment 8 - StreamProject { exprs: [(sum(sum($expr2)) * 0.0001000000:Decimal) as $expr3], noop_update_hint: true } + StreamProject { exprs: [(sum(sum($expr2)) * 0.0001000000:Decimal) as $expr3] } └── StreamSimpleAgg { aggs: [sum(sum($expr2)), count] } { tables: [ SimpleAggState: 16 ] } └── StreamExchange Single from 9 @@ -2818,7 +2818,7 @@ └─BatchScan { table: lineitem, columns: [lineitem.l_partkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [promo_revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / sum(sum($expr2))) as $expr3], noop_update_hint: true } + └─StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / sum(sum($expr2))) as $expr3] } └─StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum($expr2)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1), sum($expr2)] } @@ -2834,8 +2834,9 @@ Fragment 0 StreamMaterialize { columns: [promo_revenue], stream_key: [], 
pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / sum(sum($expr2))) as $expr3], noop_update_hint: true } - └── StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum($expr2)), count] } { tables: [ SimpleAggState: 0 ] } + └── StreamProject { exprs: [((100.00:Decimal * sum(sum($expr1))) / sum(sum($expr2))) as $expr3] } + └── StreamSimpleAgg { aggs: [sum(sum($expr1)), sum(sum($expr2)), count] } + ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 Fragment 1 @@ -2965,7 +2966,7 @@ │ └─StreamFilter { predicate: (lineitem.l_shipdate >= '1993-01-01':Date) AND (lineitem.l_shipdate < '1993-04-01 00:00:00':Timestamp) } │ └─StreamTableScan { table: lineitem, columns: [lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_orderkey, lineitem.l_linenumber, lineitem.l_shipdate], stream_scan_type: ArrangementBackfill, stream_key: [lineitem.l_orderkey, lineitem.l_linenumber], pk: [l_orderkey, l_linenumber], dist: UpstreamHashShard(lineitem.l_orderkey, lineitem.l_linenumber) } └─StreamExchange { dist: HashShard(max(max(sum($expr1)))) } - └─StreamProject { exprs: [max(max(sum($expr1)))], noop_update_hint: true } + └─StreamProject { exprs: [max(max(sum($expr1)))] } └─StreamSimpleAgg { aggs: [max(max(sum($expr1))), count] } └─StreamExchange { dist: Single } └─StreamHashAgg { group_key: [_vnode], aggs: [max(sum($expr1)), count] } @@ -3019,7 +3020,7 @@ └── BatchPlanNode Fragment 6 - StreamProject { exprs: [max(max(sum($expr1)))], noop_update_hint: true } + StreamProject { exprs: [max(max(sum($expr1)))] } └── StreamSimpleAgg { aggs: [max(max(sum($expr1))), count] } { tables: [ SimpleAggState: 14, SimpleAggCall0: 13 ] } └── StreamExchange Single from 7 @@ -3295,7 +3296,7 @@ └─BatchScan { table: lineitem, columns: [lineitem.l_partkey, lineitem.l_quantity], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [avg_yearly], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [(sum(sum(lineitem.l_extendedprice)) / 7.0:Decimal) as $expr2], noop_update_hint: true } + └─StreamProject { exprs: [(sum(sum(lineitem.l_extendedprice)) / 7.0:Decimal) as $expr2] } └─StreamSimpleAgg { aggs: [sum(sum(lineitem.l_extendedprice)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(lineitem.l_extendedprice)] } @@ -3318,8 +3319,9 @@ Fragment 0 StreamMaterialize { columns: [avg_yearly], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [(sum(sum(lineitem.l_extendedprice)) / 7.0:Decimal) as $expr2], noop_update_hint: true } - └── StreamSimpleAgg { aggs: [sum(sum(lineitem.l_extendedprice)), count] } { tables: [ SimpleAggState: 0 ] } + └── StreamProject { exprs: [(sum(sum(lineitem.l_extendedprice)) / 7.0:Decimal) as $expr2] } + └── StreamSimpleAgg { aggs: [sum(sum(lineitem.l_extendedprice)), count] } + ├── tables: [ SimpleAggState: 0 ] └── StreamExchange Single from 1 Fragment 1 @@ -3670,7 +3672,7 @@ └─BatchScan { table: lineitem, columns: [lineitem.l_partkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipinstruct, lineitem.l_shipmode], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } - └─StreamProject { exprs: [sum(sum($expr1))], noop_update_hint: true } + └─StreamProject { exprs: [sum(sum($expr1))] } └─StreamSimpleAgg { aggs: 
[sum(sum($expr1)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum($expr1)] } @@ -3688,7 +3690,7 @@ Fragment 0 StreamMaterialize { columns: [revenue], stream_key: [], pk_columns: [], pk_conflict: NoCheck } ├── tables: [ Materialize: 4294967294 ] - └── StreamProject { exprs: [sum(sum($expr1))], noop_update_hint: true } + └── StreamProject { exprs: [sum(sum($expr1))] } └── StreamSimpleAgg { aggs: [sum(sum($expr1)), count] } { tables: [ SimpleAggState: 0 ] } └── StreamExchange Single from 1 @@ -4340,7 +4342,7 @@ │ └─StreamExchange { dist: HashShard(orders.o_custkey) } │ └─StreamTableScan { table: orders, columns: [orders.o_custkey, orders.o_orderkey], stream_scan_type: ArrangementBackfill, stream_key: [orders.o_orderkey], pk: [o_orderkey], dist: UpstreamHashShard(orders.o_orderkey) } └─StreamExchange { dist: Broadcast } - └─StreamProject { exprs: [(sum(sum(customer.c_acctbal)) / sum0(count(customer.c_acctbal))::Decimal) as $expr1], noop_update_hint: true } + └─StreamProject { exprs: [(sum(sum(customer.c_acctbal)) / sum0(count(customer.c_acctbal))::Decimal) as $expr1] } └─StreamSimpleAgg { aggs: [sum(sum(customer.c_acctbal)), sum0(count(customer.c_acctbal)), count] } └─StreamExchange { dist: Single } └─StreamStatelessSimpleAgg { aggs: [sum(customer.c_acctbal), count(customer.c_acctbal)] } diff --git a/src/frontend/src/binder/expr/function/builtin_scalar.rs b/src/frontend/src/binder/expr/function/builtin_scalar.rs index 73eb722b26011..d46681c51ab3e 100644 --- a/src/frontend/src/binder/expr/function/builtin_scalar.rs +++ b/src/frontend/src/binder/expr/function/builtin_scalar.rs @@ -399,6 +399,7 @@ impl Binder { ("jsonb_path_query_array", raw_call(ExprType::JsonbPathQueryArray)), ("jsonb_path_query_first", raw_call(ExprType::JsonbPathQueryFirst)), ("jsonb_set", raw_call(ExprType::JsonbSet)), + ("jsonb_populate_map", raw_call(ExprType::JsonbPopulateMap)), // map ("map_from_entries", raw_call(ExprType::MapFromEntries)), ("map_access",raw_call(ExprType::MapAccess)), diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs index 879e375e2b762..5e3261c06d186 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs @@ -59,3 +59,4 @@ mod rw_worker_nodes; mod rw_actor_id_to_ddl; mod rw_fragment_id_to_ddl; +mod rw_worker_actor_count; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs index 9f592d4e4f6b3..032b0f82907ef 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_ddl_progress.rs @@ -31,7 +31,7 @@ struct RwDdlProgress { #[system_catalog(table, "rw_catalog.rw_ddl_progress")] async fn read(reader: &SysCatalogReaderImpl) -> Result> { - let ddl_progresses = reader.meta_client.list_ddl_progress().await?; + let ddl_progresses = reader.meta_client.get_ddl_progress().await?; let table_ids = ddl_progresses .iter() diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs index e2bbcb486b926..3c60236f96e66 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs @@ -17,6 +17,7 @@ use std::ops::Deref; use 
iceberg::table::Table; use jsonbb::{Value, ValueRef}; use risingwave_common::types::{Fields, JsonbVal, Timestamptz}; +use risingwave_connector::error::ConnectorResult; use risingwave_connector::sink::iceberg::IcebergConfig; use risingwave_connector::source::ConnectorProperties; use risingwave_connector::WithPropertiesExt; @@ -62,25 +63,32 @@ async fn read(reader: &SysCatalogReaderImpl) -> Result> let iceberg_config: IcebergConfig = iceberg_properties.to_iceberg_config(); let table: Table = iceberg_config.load_table_v2().await?; - result.extend(table.metadata().snapshots().map(|snapshot| { - RwIcebergSnapshots { - source_id: source.id as i32, - schema_name: schema_name.clone(), - source_name: source.name.clone(), - sequence_number: snapshot.sequence_number(), - snapshot_id: snapshot.snapshot_id(), - timestamp_ms: Timestamptz::from_millis(snapshot.timestamp().timestamp_millis()), - manifest_list: snapshot.manifest_list().to_string(), - summary: Value::object( - snapshot - .summary() - .other - .iter() - .map(|(k, v)| (k.as_str(), ValueRef::String(v))), - ) - .into(), - } - })); + let snapshots: ConnectorResult> = table + .metadata() + .snapshots() + .map(|snapshot| { + Ok(RwIcebergSnapshots { + source_id: source.id as i32, + schema_name: schema_name.clone(), + source_name: source.name.clone(), + sequence_number: snapshot.sequence_number(), + snapshot_id: snapshot.snapshot_id(), + timestamp_ms: Timestamptz::from_millis( + snapshot.timestamp()?.timestamp_millis(), + ), + manifest_list: snapshot.manifest_list().to_string(), + summary: Value::object( + snapshot + .summary() + .other + .iter() + .map(|(k, v)| (k.as_str(), ValueRef::String(v))), + ) + .into(), + }) + }) + .collect(); + result.extend(snapshots?); } } Ok(result) diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs new file mode 100644 index 0000000000000..a336f69b2029f --- /dev/null +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_worker_actor_count.rs @@ -0,0 +1,31 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use risingwave_common::types::Fields; +use risingwave_frontend_macro::system_catalog; + +#[system_catalog( + view, + "rw_catalog.rw_worker_actor_count", + "SELECT t2.id as worker_id, parallelism, count(*) as actor_count + FROM rw_actors t1, rw_worker_nodes t2 + where t1.worker_id = t2.id + GROUP BY t2.id, t2.parallelism;" +)] +#[derive(Fields)] +struct RwWorkerActorCount { + worker_id: i32, + parallelism: i32, + actor_count: i64, +} diff --git a/src/frontend/src/expr/mod.rs b/src/frontend/src/expr/mod.rs index f650fa3cb521b..c7acdfa5c4a3c 100644 --- a/src/frontend/src/expr/mod.rs +++ b/src/frontend/src/expr/mod.rs @@ -988,10 +988,9 @@ impl ExprImpl { _ => return None, }; let list: Vec<_> = inputs - .map(|expr| { + .inspect(|expr| { // Non constant IN will be bound to OR assert!(expr.is_const()); - expr }) .collect(); diff --git a/src/frontend/src/expr/pure.rs b/src/frontend/src/expr/pure.rs index 3e6c83d8330fb..d47cc3851f641 100644 --- a/src/frontend/src/expr/pure.rs +++ b/src/frontend/src/expr/pure.rs @@ -211,6 +211,7 @@ impl ExprVisitor for ImpureAnalyzer { | Type::JsonbPathQueryArray | Type::JsonbPathQueryFirst | Type::JsonbSet + | Type::JsonbPopulateMap | Type::IsJson | Type::ToJsonb | Type::Sind diff --git a/src/frontend/src/handler/alter_parallelism.rs b/src/frontend/src/handler/alter_parallelism.rs index 3c6ab52f51e39..ee3c26708908c 100644 --- a/src/frontend/src/handler/alter_parallelism.rs +++ b/src/frontend/src/handler/alter_parallelism.rs @@ -103,21 +103,23 @@ pub async fn handle_alter_parallelism( .filter(|w| w.is_streaming_schedulable()) .map(|w| w.parallelism) .sum::(); + // TODO(var-vnode): use vnode count from config + let max_parallelism = VirtualNode::COUNT; let mut builder = RwPgResponse::builder(stmt_type); match &target_parallelism.parallelism { Some(Parallelism::Adaptive(_)) | Some(Parallelism::Auto(_)) => { - if available_parallelism > VirtualNode::COUNT as u32 { - builder = builder.notice(format!("Available parallelism exceeds the maximum parallelism limit, the actual parallelism will be limited to {}", VirtualNode::COUNT)); + if available_parallelism > max_parallelism as u32 { + builder = builder.notice(format!("Available parallelism exceeds the maximum parallelism limit, the actual parallelism will be limited to {max_parallelism}")); } } Some(Parallelism::Fixed(FixedParallelism { parallelism })) => { - if *parallelism > VirtualNode::COUNT as u32 { - builder = builder.notice(format!("Provided parallelism exceeds the maximum parallelism limit, resetting to FIXED({})", VirtualNode::COUNT)); + if *parallelism > max_parallelism as u32 { + builder = builder.notice(format!("Provided parallelism exceeds the maximum parallelism limit, resetting to FIXED({max_parallelism})")); target_parallelism = PbTableParallelism { parallelism: Some(PbParallelism::Fixed(FixedParallelism { - parallelism: VirtualNode::COUNT as u32, + parallelism: max_parallelism as u32, })), }; } diff --git a/src/frontend/src/handler/create_index.rs b/src/frontend/src/handler/create_index.rs index ee6429a85e32e..a6cc1e20548f7 100644 --- a/src/frontend/src/handler/create_index.rs +++ b/src/frontend/src/handler/create_index.rs @@ -25,7 +25,6 @@ use risingwave_common::acl::AclMode; use risingwave_common::catalog::{IndexId, TableDesc, TableId}; use risingwave_common::util::sort_util::{ColumnOrder, OrderType}; use risingwave_pb::catalog::{PbIndex, PbIndexColumnProperties, PbStreamJobStatus, PbTable}; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use 
risingwave_pb::user::grant_privilege::Object; use risingwave_sqlparser::ast; use risingwave_sqlparser::ast::{Ident, ObjectName, OrderByExpr}; @@ -448,14 +447,8 @@ pub async fn handle_create_index( include, distributed_by, )?; - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; + (graph, index_table, index) }; diff --git a/src/frontend/src/handler/create_mv.rs b/src/frontend/src/handler/create_mv.rs index 4399d80811c19..1c8a866db3e06 100644 --- a/src/frontend/src/handler/create_mv.rs +++ b/src/frontend/src/handler/create_mv.rs @@ -20,7 +20,6 @@ use pgwire::pg_response::{PgResponse, StatementType}; use risingwave_common::acl::AclMode; use risingwave_common::catalog::TableId; use risingwave_pb::catalog::PbTable; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_sqlparser::ast::{EmitMode, Ident, ObjectName, Query}; use super::privilege::resolve_relation_privileges; @@ -205,6 +204,9 @@ pub async fn handle_create_mv_bound( ) -> Result { let session = handler_args.session.clone(); + // Check cluster limits + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( name.clone(), StatementType::CREATE_MATERIALIZED_VIEW, @@ -240,18 +242,7 @@ It only indicates the physical clustering of the data, which may improve the per emit_mode, )?; - let context = plan.plan_base().ctx().clone(); - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); - // Set the timezone for the stream context - let ctx = graph.ctx.as_mut().unwrap(); - ctx.timezone = context.get_session_timezone(); + let graph = build_graph(plan)?; (table, graph) }; diff --git a/src/frontend/src/handler/create_sink.rs b/src/frontend/src/handler/create_sink.rs index d0bd1d0cc8f2f..9f4f2f63975f1 100644 --- a/src/frontend/src/handler/create_sink.rs +++ b/src/frontend/src/handler/create_sink.rs @@ -35,7 +35,6 @@ use risingwave_connector::sink::{ }; use risingwave_pb::catalog::{PbSink, PbSource, Table}; use risingwave_pb::ddl_service::{ReplaceTablePlan, TableJobType}; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::stream_plan::stream_node::{NodeBody, PbNodeBody}; use risingwave_pb::stream_plan::{MergeNode, StreamFragmentGraph, StreamNode}; use risingwave_sqlparser::ast::{ @@ -419,6 +418,8 @@ pub async fn handle_create_sink( ) -> Result { let session = handle_args.session.clone(); + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( stmt.sink_name.clone(), StatementType::CREATE_SINK, @@ -443,15 +444,7 @@ pub async fn handle_create_sink( ); } - let mut graph = build_graph(plan)?; - - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; (sink, graph, target_table_catalog) }; diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index f006ca929f54c..f1535fa769b28 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -62,12 +62,11 @@ use risingwave_connector::WithPropertiesExt; use risingwave_pb::catalog::{PbSchemaRegistryNameStrategy, StreamSourceInfo, 
WatermarkDesc}; use risingwave_pb::plan_common::additional_column::ColumnType as AdditionalColumnType; use risingwave_pb::plan_common::{EncodeType, FormatType}; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_sqlparser::ast::{ get_delimiter, AstString, ColumnDef, ConnectorSchema, CreateSourceStatement, Encode, Format, ObjectName, ProtobufSchema, SourceWatermark, TableConstraint, }; -use risingwave_sqlparser::parser::IncludeOption; +use risingwave_sqlparser::parser::{IncludeOption, IncludeOptionItem}; use thiserror_ext::AsReport; use super::RwPgResponse; @@ -595,8 +594,43 @@ fn bind_columns_from_source_for_cdc( Ok((Some(columns), stream_source_info)) } +// check the additional column compatibility with the format and encode +fn check_additional_column_compatibility( + column_def: &IncludeOptionItem, + source_schema: Option<&ConnectorSchema>, +) -> Result<()> { + // only allow header column have inner field + if column_def.inner_field.is_some() + && !column_def + .column_type + .real_value() + .eq_ignore_ascii_case("header") + { + return Err(RwError::from(ProtocolError(format!( + "Only header column can have inner field, but got {:?}", + column_def.column_type.real_value(), + )))); + } + + // Payload column only allowed when encode is JSON + if let Some(schema) = source_schema + && column_def + .column_type + .real_value() + .eq_ignore_ascii_case("payload") + && !matches!(schema.row_encode, Encode::Json) + { + return Err(RwError::from(ProtocolError(format!( + "INCLUDE payload is only allowed when using ENCODE JSON, but got ENCODE {:?}", + schema.row_encode + )))); + } + Ok(()) +} + /// add connector-spec columns to the end of column catalog pub fn handle_addition_columns( + source_schema: Option<&ConnectorSchema>, with_properties: &BTreeMap, mut additional_columns: IncludeOption, columns: &mut Vec, @@ -620,17 +654,7 @@ pub fn handle_addition_columns( .unwrap(); // there must be at least one column in the column catalog while let Some(item) = additional_columns.pop() { - { - // only allow header column have inner field - if item.inner_field.is_some() - && !item.column_type.real_value().eq_ignore_ascii_case("header") - { - return Err(RwError::from(ProtocolError(format!( - "Only header column can have inner field, but got {:?}", - item.column_type.real_value(), - )))); - } - } + check_additional_column_compatibility(&item, source_schema)?; let data_type_name: Option = item .header_inner_expect_type @@ -1513,6 +1537,7 @@ pub async fn bind_create_source_or_table_with_connector( // add additional columns before bind pk, because `format upsert` requires the key column handle_addition_columns( + Some(&source_schema), &with_properties, include_column_options, &mut columns, @@ -1640,7 +1665,8 @@ pub async fn handle_create_source( let create_cdc_source_job = with_properties.is_shareable_cdc_connector(); let is_shared = create_cdc_source_job - || (with_properties.is_kafka_connector() && session.config().rw_enable_shared_source()); + || (with_properties.is_shareable_non_cdc_connector() + && session.config().rw_enable_shared_source()); let (columns_from_resolve_source, mut source_info) = if create_cdc_source_job { bind_columns_from_source_for_cdc(&session, &source_schema)? 
@@ -1696,15 +1722,7 @@ pub async fn handle_create_source( )?; let stream_plan = source_node.to_stream(&mut ToStreamContext::new(false))?; - let mut graph = build_graph(stream_plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); - graph + build_graph(stream_plan)? }; catalog_writer .create_source_with_graph(source, graph) diff --git a/src/frontend/src/handler/create_table.rs b/src/frontend/src/handler/create_table.rs index a10453a43ea4e..e7b2b44226657 100644 --- a/src/frontend/src/handler/create_table.rs +++ b/src/frontend/src/handler/create_table.rs @@ -41,7 +41,6 @@ use risingwave_pb::plan_common::column_desc::GeneratedOrDefaultColumn; use risingwave_pb::plan_common::{ AdditionalColumn, ColumnDescVersion, DefaultColumnDesc, GeneratedColumnDesc, }; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::stream_plan::StreamFragmentGraph; use risingwave_sqlparser::ast::{ CdcTableInfo, ColumnDef, ColumnOption, ConnectorSchema, DataType as AstDataType, @@ -772,6 +771,7 @@ pub(crate) fn gen_create_table_plan_for_cdc_table( // append additional columns to the end handle_addition_columns( + None, &connect_properties, include_column_options, &mut columns, @@ -1235,6 +1235,8 @@ pub async fn handle_create_table( session.notice_to_user("APPEND ONLY TABLE is currently an experimental feature."); } + session.check_cluster_limits().await?; + if let Either::Right(resp) = session.check_relation_name_duplicated( table_name.clone(), StatementType::CREATE_TABLE, @@ -1261,14 +1263,8 @@ pub async fn handle_create_table( ) .await?; - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; + (graph, source, table, job_type) }; @@ -1313,7 +1309,7 @@ pub fn check_create_table_with_source( #[allow(clippy::too_many_arguments)] pub async fn generate_stream_graph_for_table( - session: &Arc, + _session: &Arc, table_name: ObjectName, original_catalog: &Arc, source_schema: Option, @@ -1428,15 +1424,7 @@ pub async fn generate_stream_graph_for_table( ))? } - let graph = StreamFragmentGraph { - parallelism: session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }), - ..build_graph(plan)? - }; + let graph = build_graph(plan)?; // Fill the original table ID. 
let table = Table { diff --git a/src/frontend/src/handler/create_table_as.rs b/src/frontend/src/handler/create_table_as.rs index bb00be2dfa486..27c527969f9b2 100644 --- a/src/frontend/src/handler/create_table_as.rs +++ b/src/frontend/src/handler/create_table_as.rs @@ -16,7 +16,6 @@ use either::Either; use pgwire::pg_response::StatementType; use risingwave_common::catalog::{ColumnCatalog, ColumnDesc}; use risingwave_pb::ddl_service::TableJobType; -use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_sqlparser::ast::{ColumnDef, ObjectName, OnConflict, Query, Statement}; use super::{HandlerArgs, RwPgResponse}; @@ -110,14 +109,8 @@ pub async fn handle_create_as( with_version_column, Some(col_id_gen.into_version()), )?; - let mut graph = build_graph(plan)?; - graph.parallelism = - session - .config() - .streaming_parallelism() - .map(|parallelism| Parallelism { - parallelism: parallelism.get(), - }); + let graph = build_graph(plan)?; + (graph, None, table) }; diff --git a/src/frontend/src/handler/show.rs b/src/frontend/src/handler/show.rs index 6cd8b95f95b49..1821ccc289ebc 100644 --- a/src/frontend/src/handler/show.rs +++ b/src/frontend/src/handler/show.rs @@ -450,7 +450,7 @@ pub async fn handle_show_object( .into()); } ShowObject::Jobs => { - let resp = session.env().meta_client().list_ddl_progress().await?; + let resp = session.env().meta_client().get_ddl_progress().await?; let rows = resp.into_iter().map(|job| ShowJobRow { id: job.id as i64, statement: job.statement, diff --git a/src/frontend/src/lib.rs b/src/frontend/src/lib.rs index d8b484e3d6fa2..d3d5d1623bd58 100644 --- a/src/frontend/src/lib.rs +++ b/src/frontend/src/lib.rs @@ -23,7 +23,6 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(assert_matches)] -#![feature(lint_reasons)] #![feature(box_patterns)] #![feature(macro_metavar_expr)] #![feature(min_specialization)] @@ -142,8 +141,9 @@ pub struct FrontendOpts { pub config_path: String, /// Used for control the metrics level, similar to log level. 
- /// 0 = disable metrics - /// >0 = enable metrics + /// + /// level = 0: disable metrics + /// level > 0: enable metrics #[clap(long, hide = true, env = "RW_METRICS_LEVEL")] #[override_opts(path = server.metrics_level)] pub metrics_level: Option, diff --git a/src/frontend/src/meta_client.rs b/src/frontend/src/meta_client.rs index 60fa992bdbe2d..c58dcc365f431 100644 --- a/src/frontend/src/meta_client.rs +++ b/src/frontend/src/meta_client.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use anyhow::Context; use risingwave_common::session_config::SessionConfig; use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::util::cluster_limit::ClusterLimit; use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionDelta}; use risingwave_hummock_sdk::HummockVersionId; use risingwave_pb::backup_service::MetaSnapshotMetadata; @@ -90,7 +91,7 @@ pub trait FrontendMetaClient: Send + Sync { async fn set_session_param(&self, param: String, value: Option) -> Result; - async fn list_ddl_progress(&self) -> Result>; + async fn get_ddl_progress(&self) -> Result>; async fn get_tables(&self, table_ids: &[u32]) -> Result>; @@ -136,6 +137,8 @@ pub trait FrontendMetaClient: Send + Sync { ) -> Result>; async fn get_cluster_recovery_status(&self) -> Result; + + async fn get_cluster_limits(&self) -> Result>; } pub struct FrontendMetaClientImpl(pub MetaClient); @@ -229,7 +232,7 @@ impl FrontendMetaClient for FrontendMetaClientImpl { self.0.set_session_param(param, value).await } - async fn list_ddl_progress(&self) -> Result> { + async fn get_ddl_progress(&self) -> Result> { let ddl_progress = self.0.get_ddl_progress().await?; Ok(ddl_progress) } @@ -345,4 +348,8 @@ impl FrontendMetaClient for FrontendMetaClientImpl { async fn get_cluster_recovery_status(&self) -> Result { self.0.get_cluster_recovery_status().await } + + async fn get_cluster_limits(&self) -> Result> { + self.0.get_cluster_limits().await + } } diff --git a/src/frontend/src/optimizer/delta_join_solver.rs b/src/frontend/src/optimizer/delta_join_solver.rs index 5dc1bb30cc9f9..470fc0426d7d5 100644 --- a/src/frontend/src/optimizer/delta_join_solver.rs +++ b/src/frontend/src/optimizer/delta_join_solver.rs @@ -66,7 +66,8 @@ //! possible that every lookup path produces different distribution. We need to shuffle them //! before feeding data to union. 
-#![expect(dead_code)] +// FIXME: https://github.com/rust-lang/rust-analyzer/issues/17685 +#![allow(dead_code)] use std::collections::{BTreeMap, BTreeSet}; diff --git a/src/frontend/src/optimizer/plan_expr_visitor/strong.rs b/src/frontend/src/optimizer/plan_expr_visitor/strong.rs index 2c14fc730877d..673a5f41746bb 100644 --- a/src/frontend/src/optimizer/plan_expr_visitor/strong.rs +++ b/src/frontend/src/optimizer/plan_expr_visitor/strong.rs @@ -291,6 +291,7 @@ impl Strong { | ExprType::JsonbPopulateRecord | ExprType::JsonbToRecord | ExprType::JsonbSet + | ExprType::JsonbPopulateMap | ExprType::MapFromEntries | ExprType::MapAccess | ExprType::MapKeys diff --git a/src/frontend/src/optimizer/plan_node/logical_agg.rs b/src/frontend/src/optimizer/plan_node/logical_agg.rs index 9e774628fc262..b0ad102ee693c 100644 --- a/src/frontend/src/optimizer/plan_node/logical_agg.rs +++ b/src/frontend/src/optimizer/plan_node/logical_agg.rs @@ -86,6 +86,8 @@ impl LogicalAgg { bail!("expected at least one agg call"); } + let need_row_merge: bool = Self::need_row_merge(&approx_percentile); + // ====== Handle normal aggs let total_agg_calls = core .agg_calls @@ -98,8 +100,12 @@ impl LogicalAgg { let local_agg = StreamStatelessSimpleAgg::new(core); let exchange = RequiredDist::single().enforce_if_not_satisfies(local_agg.into(), &Order::any())?; - let global_agg = - new_stream_simple_agg(Agg::new(total_agg_calls, IndexSet::empty(), exchange)); + + let must_output_per_barrier = need_row_merge; + let global_agg = new_stream_simple_agg( + Agg::new(total_agg_calls, IndexSet::empty(), exchange), + must_output_per_barrier, + ); // ====== Merge approx percentile and normal aggs Self::add_row_merge_if_needed( @@ -129,6 +135,7 @@ impl LogicalAgg { }; bail!("expected at least one agg call"); } + let need_row_merge = Self::need_row_merge(&approx_percentile); // Generate vnode via project // TODO(kwannoel): We should apply Project optimization rules here. @@ -157,19 +164,26 @@ impl LogicalAgg { let global_agg = if self.group_key().is_empty() { let exchange = RequiredDist::single().enforce_if_not_satisfies(local_agg.into(), &Order::any())?; - let global_agg = new_stream_simple_agg(Agg::new( - core.agg_calls - .iter() - .enumerate() - .map(|(partial_output_idx, agg_call)| { - agg_call.partial_to_total_agg_call(n_local_group_key + partial_output_idx) - }) - .collect(), - global_group_key.into_iter().collect(), - exchange, - )); + let must_output_per_barrier = need_row_merge; + let global_agg = new_stream_simple_agg( + Agg::new( + core.agg_calls + .iter() + .enumerate() + .map(|(partial_output_idx, agg_call)| { + agg_call + .partial_to_total_agg_call(n_local_group_key + partial_output_idx) + }) + .collect(), + global_group_key.into_iter().collect(), + exchange, + ), + must_output_per_barrier, + ); global_agg.into() } else { + // the `RowMergeExec` has not supported keyed merge + assert!(!need_row_merge); let exchange = RequiredDist::shard_by_key(input_col_num, &global_group_key) .enforce_if_not_satisfies(local_agg.into(), &Order::any())?; // Local phase should have reordered the group keys into their required order. 
@@ -203,7 +217,7 @@ impl LogicalAgg { let mut core = self.core.clone(); let input = RequiredDist::single().enforce_if_not_satisfies(stream_input, &Order::any())?; core.input = input; - Ok(new_stream_simple_agg(core).into()) + Ok(new_stream_simple_agg(core, false).into()) } fn gen_shuffle_plan(&self, stream_input: PlanRef) -> Result { @@ -339,6 +353,10 @@ impl LogicalAgg { )) } + fn need_row_merge(approx_percentile: &Option) -> bool { + approx_percentile.is_some() + } + /// Add `RowMerge` if needed fn add_row_merge_if_needed( approx_percentile: Option, @@ -346,7 +364,11 @@ impl LogicalAgg { approx_percentile_col_mapping: ColIndexMapping, non_approx_percentile_col_mapping: ColIndexMapping, ) -> Result { + // just for assert + let need_row_merge = Self::need_row_merge(&approx_percentile); + if let Some(approx_percentile) = approx_percentile { + assert!(need_row_merge); let row_merge = StreamRowMerge::new( approx_percentile, global_agg, @@ -355,6 +377,7 @@ impl LogicalAgg { )?; Ok(row_merge.into()) } else { + assert!(!need_row_merge); Ok(global_agg) } } @@ -1305,9 +1328,9 @@ fn find_or_append_row_count(mut logical: Agg) -> (Agg, usize) (logical, row_count_idx) } -fn new_stream_simple_agg(core: Agg) -> StreamSimpleAgg { +fn new_stream_simple_agg(core: Agg, must_output_per_barrier: bool) -> StreamSimpleAgg { let (logical, row_count_idx) = find_or_append_row_count(core); - StreamSimpleAgg::new(logical, row_count_idx) + StreamSimpleAgg::new(logical, row_count_idx, must_output_per_barrier) } fn new_stream_hash_agg(core: Agg, vnode_col_idx: Option) -> StreamHashAgg { @@ -1386,19 +1409,12 @@ impl ToStream for LogicalAgg { panic!("the root PlanNode must be StreamHashAgg, StreamSimpleAgg, StreamGlobalApproxPercentile, or StreamRowMerge"); }; - let is_hash_agg = !self.group_key().is_empty(); - // "Simple Agg" includes normal simple agg, as well as approx percentile simple 2 phase agg. - let is_simple_agg = !is_hash_agg; - if self.agg_calls().len() == n_final_agg_calls && is_hash_agg { + if self.agg_calls().len() == n_final_agg_calls { // an existing `count(*)` is used as row count column in `StreamXxxAgg` Ok(plan) } else { - // For hash agg, a `count(*)` is appended, should project the output. - // For simple agg, we output every epoch, so we will always add a project - // to filter out no-op updates, and we don't need the following assert. - if is_hash_agg { - assert_eq!(self.agg_calls().len() + 1, n_final_agg_calls); - } + // a `count(*)` is appended, should project the output + assert_eq!(self.agg_calls().len() + 1, n_final_agg_calls); Ok(StreamProject::new(generic::Project::with_out_col_idx( plan, 0..self.schema().len(), @@ -1407,9 +1423,7 @@ impl ToStream for LogicalAgg { // Since it'll be pruned immediately in `StreamProject`, the update records are likely to be // no-op. So we set the hint to instruct the executor to eliminate them. // See https://github.com/risingwavelabs/risingwave/issues/17030. - // Further for simple agg, we also have to set the hint to eliminate no-op updates. - // Since we will output every epoch. 
- .with_noop_update_hint(self.agg_calls().is_empty() || is_simple_agg) + .with_noop_update_hint(self.agg_calls().is_empty()) .into()) } } diff --git a/src/frontend/src/optimizer/plan_node/logical_over_window.rs b/src/frontend/src/optimizer/plan_node/logical_over_window.rs index 7a81b164fbafe..bb78380482752 100644 --- a/src/frontend/src/optimizer/plan_node/logical_over_window.rs +++ b/src/frontend/src/optimizer/plan_node/logical_over_window.rs @@ -548,11 +548,10 @@ impl ColPrunable for LogicalOverWindow { let new_window_functions = req_cols_win_func_part .indices() .map(|idx| self.window_functions()[idx - input_len].clone()) - .map(|func| { + .inspect(|func| { tmp.extend(func.args.iter().map(|x| x.index())); tmp.extend(func.partition_by.iter().map(|x| x.index())); tmp.extend(func.order_by.iter().map(|x| x.column_index)); - func }) .collect_vec(); (tmp, new_window_functions) diff --git a/src/frontend/src/optimizer/plan_node/stream_simple_agg.rs b/src/frontend/src/optimizer/plan_node/stream_simple_agg.rs index 6ecaa4c308f5e..f9f125654f402 100644 --- a/src/frontend/src/optimizer/plan_node/stream_simple_agg.rs +++ b/src/frontend/src/optimizer/plan_node/stream_simple_agg.rs @@ -33,10 +33,18 @@ pub struct StreamSimpleAgg { /// The index of `count(*)` in `agg_calls`. row_count_idx: usize, + + // Required by the downstream `RowMerge`, + // currently only used by the `approx_percentile`'s two phase plan + must_output_per_barrier: bool, } impl StreamSimpleAgg { - pub fn new(core: generic::Agg, row_count_idx: usize) -> Self { + pub fn new( + core: generic::Agg, + row_count_idx: usize, + must_output_per_barrier: bool, + ) -> Self { assert_eq!(core.agg_calls[row_count_idx], PlanAggCall::count_star()); let input = core.input.clone(); @@ -62,6 +70,7 @@ impl StreamSimpleAgg { base, core, row_count_idx, + must_output_per_barrier, } } @@ -75,7 +84,11 @@ impl Distill for StreamSimpleAgg { let name = plan_node_name!("StreamSimpleAgg", { "append_only", self.input().append_only() }, ); - childless_record(name, self.core.fields_pretty()) + let mut vec = self.core.fields_pretty(); + if self.must_output_per_barrier { + vec.push(("must_output_per_barrier", "true".into())); + } + childless_record(name, vec) } } @@ -89,7 +102,7 @@ impl PlanTreeNodeUnary for StreamSimpleAgg { input, ..self.core.clone() }; - Self::new(logical, self.row_count_idx) + Self::new(logical, self.row_count_idx, self.must_output_per_barrier) } } impl_plan_tree_node_for_unary! 
{ StreamSimpleAgg } @@ -137,6 +150,7 @@ impl StreamNode for StreamSimpleAgg { .collect(), row_count_index: self.row_count_idx as u32, version: PbAggNodeVersion::Issue13465 as _, + must_output_per_barrier: self.must_output_per_barrier, }) } } @@ -149,7 +163,7 @@ impl ExprRewritable for StreamSimpleAgg { fn rewrite_exprs(&self, r: &mut dyn ExprRewriter) -> PlanRef { let mut core = self.core.clone(); core.rewrite_exprs(r); - Self::new(core, self.row_count_idx).into() + Self::new(core, self.row_count_idx, self.must_output_per_barrier).into() } } diff --git a/src/frontend/src/optimizer/plan_node/stream_sink.rs b/src/frontend/src/optimizer/plan_node/stream_sink.rs index 2717c454e6435..3e34475c8d4bb 100644 --- a/src/frontend/src/optimizer/plan_node/stream_sink.rs +++ b/src/frontend/src/optimizer/plan_node/stream_sink.rs @@ -212,7 +212,7 @@ impl StreamSink { partition_info: Option, ) -> Result { let columns = derive_columns(input.schema(), out_names, &user_cols)?; - let (input, sink) = Self::derive_sink_desc( + let (input, mut sink) = Self::derive_sink_desc( input, user_distributed_by, name, @@ -241,8 +241,11 @@ impl StreamSink { if connector == TABLE_SINK && sink.target_table.is_none() { unsupported_sink(TABLE_SINK) } else { + SinkType::set_default_commit_checkpoint_interval( + &mut sink, + &input.ctx().session_ctx().config().sink_decouple(), + )?; SinkType::is_sink_decouple( - &sink, &input.ctx().session_ctx().config().sink_decouple(), ) } diff --git a/src/frontend/src/optimizer/plan_node/stream_stateless_simple_agg.rs b/src/frontend/src/optimizer/plan_node/stream_stateless_simple_agg.rs index 93c56efad3d5f..edb9121baf595 100644 --- a/src/frontend/src/optimizer/plan_node/stream_stateless_simple_agg.rs +++ b/src/frontend/src/optimizer/plan_node/stream_stateless_simple_agg.rs @@ -102,6 +102,7 @@ impl StreamNode for StreamStatelessSimpleAgg { is_append_only: self.input().append_only(), distinct_dedup_tables: Default::default(), version: AggNodeVersion::Issue13465 as _, + must_output_per_barrier: false, // this is not used }) } } diff --git a/src/frontend/src/optimizer/rule/index_selection_rule.rs b/src/frontend/src/optimizer/rule/index_selection_rule.rs index 548fda7b92af4..a995dd9878620 100644 --- a/src/frontend/src/optimizer/rule/index_selection_rule.rs +++ b/src/frontend/src/optimizer/rule/index_selection_rule.rs @@ -48,7 +48,7 @@ use std::cmp::min; use std::collections::hash_map::Entry::{Occupied, Vacant}; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::rc::Rc; use itertools::Itertools; @@ -962,17 +962,6 @@ impl ExprVisitor for TableScanIoEstimator<'_> { } } -#[derive(Default)] -struct ExprInputRefFinder { - pub input_ref_index_set: HashSet, -} - -impl ExprVisitor for ExprInputRefFinder { - fn visit_input_ref(&mut self, input_ref: &InputRef) { - self.input_ref_index_set.insert(input_ref.index); - } -} - struct ShiftInputRefRewriter { offset: usize, } diff --git a/src/frontend/src/scheduler/distributed/query_manager.rs b/src/frontend/src/scheduler/distributed/query_manager.rs index 86a54cf9c0f98..2d977cfb675e6 100644 --- a/src/frontend/src/scheduler/distributed/query_manager.rs +++ b/src/frontend/src/scheduler/distributed/query_manager.rs @@ -230,14 +230,13 @@ impl QueryManager { self.query_metrics.clone(), ) .await - .map_err(|err| { + .inspect_err(|_| { // Clean up query execution on error. 
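// A small sketch of the `map_err` -> `inspect_err` refactor in this hunk:
// when the closure only performs a side effect (cleanup) and leaves the error
// untouched, `inspect_err` avoids handing the error back explicitly.
// The names below are illustrative stand-ins, not the real scheduler types.
fn delete_query(query_id: &str) {
    println!("cleaning up query {query_id}");
}

fn submit(query_id: &str, fail: bool) -> Result<u32, String> {
    let result: Result<u32, String> = if fail { Err("scheduling failed".into()) } else { Ok(1) };
    // Equivalent to `.map_err(|err| { delete_query(query_id); err })`.
    result.inspect_err(|_| delete_query(query_id))
}

fn main() {
    assert!(submit("q-1", true).is_err());
    assert_eq!(submit("q-2", false), Ok(1));
}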
context .session() .env() .query_manager() .delete_query(&query_id); - err })?; Ok(query_result_fetcher.stream_from_channel()) } diff --git a/src/frontend/src/scheduler/distributed/stage.rs b/src/frontend/src/scheduler/distributed/stage.rs index bb18e2143aa7f..e933d3f271108 100644 --- a/src/frontend/src/scheduler/distributed/stage.rs +++ b/src/frontend/src/scheduler/distributed/stage.rs @@ -1028,7 +1028,7 @@ impl StageRunner { .expect("no partition info for seq scan") .into_table() .expect("PartitionInfo should be TablePartitionInfo"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); scan_node.scan_ranges = partition.scan_ranges; PbPlanNode { children: vec![], @@ -1045,7 +1045,7 @@ impl StageRunner { .expect("no partition info for seq scan") .into_table() .expect("PartitionInfo should be TablePartitionInfo"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); PbPlanNode { children: vec![], identity, diff --git a/src/frontend/src/scheduler/local.rs b/src/frontend/src/scheduler/local.rs index a727ddd9db7dd..fcd15368bb5fc 100644 --- a/src/frontend/src/scheduler/local.rs +++ b/src/frontend/src/scheduler/local.rs @@ -500,7 +500,7 @@ impl LocalQueryExecution { let partition = partition .into_table() .expect("PartitionInfo should be TablePartitionInfo here"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); scan_node.scan_ranges = partition.scan_ranges; } } @@ -522,7 +522,7 @@ impl LocalQueryExecution { let partition = partition .into_table() .expect("PartitionInfo should be TablePartitionInfo here"); - scan_node.vnode_bitmap = Some(partition.vnode_bitmap); + scan_node.vnode_bitmap = Some(partition.vnode_bitmap.to_protobuf()); } } _ => unreachable!(), diff --git a/src/frontend/src/scheduler/plan_fragmenter.rs b/src/frontend/src/scheduler/plan_fragmenter.rs index 09e4cbc0bfa03..63b6eef38da71 100644 --- a/src/frontend/src/scheduler/plan_fragmenter.rs +++ b/src/frontend/src/scheduler/plan_fragmenter.rs @@ -30,7 +30,7 @@ use risingwave_common::bail; use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{Schema, TableDesc}; use risingwave_common::hash::table_distribution::TableDistribution; -use risingwave_common::hash::{VirtualNode, WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{WorkerSlotId, WorkerSlotMapping}; use risingwave_common::util::scan_range::ScanRange; use risingwave_connector::source::filesystem::opendal_source::opendal_enumerator::OpendalEnumerator; use risingwave_connector::source::filesystem::opendal_source::{ @@ -44,7 +44,6 @@ use risingwave_connector::source::{ }; use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::batch_plan::{ExchangeInfo, ScanRange as ScanRangeProto}; -use risingwave_pb::common::Buffer; use risingwave_pb::plan_common::Field as PbField; use risingwave_sqlparser::ast::AsOf; use serde::ser::SerializeStruct; @@ -311,9 +310,11 @@ impl SourceScanInfo { Ok(SourceScanInfo::Complete(split_info)) } ConnectorProperties::OpendalS3(prop) => { + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); + let lister: OpendalEnumerator = OpendalEnumerator::new_s3_source(prop.s3_properties, prop.assume_role)?; - let stream = build_opendal_fs_list_for_batch(lister); + let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); let batch_res: Vec<_> = 
stream.try_collect().await?; let res = batch_res @@ -324,18 +325,22 @@ impl SourceScanInfo { Ok(SourceScanInfo::Complete(res)) } ConnectorProperties::Gcs(prop) => { + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); + let lister: OpendalEnumerator = OpendalEnumerator::new_gcs_source(*prop)?; - let stream = build_opendal_fs_list_for_batch(lister); + let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res.into_iter().map(SplitImpl::Gcs).collect_vec(); Ok(SourceScanInfo::Complete(res)) } ConnectorProperties::Azblob(prop) => { + let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); + let lister: OpendalEnumerator = OpendalEnumerator::new_azblob_source(*prop)?; - let stream = build_opendal_fs_list_for_batch(lister); + let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res.into_iter().map(SplitImpl::Azblob).collect_vec(); @@ -437,7 +442,7 @@ impl TableScanInfo { #[derive(Clone, Debug)] pub struct TablePartitionInfo { - pub vnode_bitmap: Buffer, + pub vnode_bitmap: Bitmap, pub scan_ranges: Vec, } @@ -922,8 +927,7 @@ impl BatchPlanFragmenter { .drain() .take(1) .update(|(_, info)| { - info.vnode_bitmap = - Bitmap::ones(VirtualNode::COUNT).to_protobuf(); + info.vnode_bitmap = Bitmap::ones(info.vnode_bitmap.len()); }) .collect(); } @@ -1230,7 +1234,7 @@ fn derive_partitions( table_desc: &TableDesc, vnode_mapping: &WorkerSlotMapping, ) -> SchedulerResult> { - let num_vnodes = vnode_mapping.len(); + let vnode_count = vnode_mapping.len(); let mut partitions: HashMap)> = HashMap::new(); if scan_ranges.is_empty() { @@ -1241,7 +1245,7 @@ fn derive_partitions( ( k, TablePartitionInfo { - vnode_bitmap: vnode_bitmap.to_protobuf(), + vnode_bitmap, scan_ranges: vec![], }, ) @@ -1250,7 +1254,7 @@ fn derive_partitions( } let table_distribution = TableDistribution::new_from_storage_table_desc( - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(vnode_count).into()), &table_desc.try_to_protobuf()?, ); @@ -1263,7 +1267,7 @@ fn derive_partitions( |(worker_slot_id, vnode_bitmap)| { let (bitmap, scan_ranges) = partitions .entry(worker_slot_id) - .or_insert_with(|| (BitmapBuilder::zeroed(num_vnodes), vec![])); + .or_insert_with(|| (BitmapBuilder::zeroed(vnode_count), vec![])); vnode_bitmap .iter() .enumerate() @@ -1277,7 +1281,7 @@ fn derive_partitions( let worker_slot_id = vnode_mapping[vnode]; let (bitmap, scan_ranges) = partitions .entry(worker_slot_id) - .or_insert_with(|| (BitmapBuilder::zeroed(num_vnodes), vec![])); + .or_insert_with(|| (BitmapBuilder::zeroed(vnode_count), vec![])); bitmap.set(vnode.to_index(), true); scan_ranges.push(scan_range.to_protobuf()); } @@ -1290,7 +1294,7 @@ fn derive_partitions( ( k, TablePartitionInfo { - vnode_bitmap: bitmap.finish().to_protobuf(), + vnode_bitmap: bitmap.finish(), scan_ranges, }, ) diff --git a/src/frontend/src/session.rs b/src/frontend/src/session.rs index 16f0c7226be21..a1150798951cb 100644 --- a/src/frontend/src/session.rs +++ b/src/frontend/src/session.rs @@ -59,9 +59,10 @@ use risingwave_common::telemetry::manager::TelemetryManager; use risingwave_common::telemetry::telemetry_env_enabled; use risingwave_common::types::DataType; use risingwave_common::util::addr::HostAddr; +use risingwave_common::util::cluster_limit::ActorCountPerParallelism; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_common::util::resource_util; use 
risingwave_common::util::runtime::BackgroundShutdownRuntime; +use risingwave_common::util::{cluster_limit, resource_util}; use risingwave_common::{GIT_SHA, RW_VERSION}; use risingwave_common_heap_profiling::HeapProfiler; use risingwave_common_service::{MetricsManager, ObserverManager}; @@ -1194,6 +1195,47 @@ impl SessionImpl { pub fn temporary_source_manager(&self) -> TemporarySourceManager { self.temporary_source_manager.lock().clone() } + + pub async fn check_cluster_limits(&self) -> Result<()> { + if self.config().bypass_cluster_limits() { + return Ok(()); + } + + let gen_message = |violated_limit: &ActorCountPerParallelism, + exceed_hard_limit: bool| + -> String { + let (limit_type, action) = if exceed_hard_limit { + ("critical", "Please scale the cluster before proceeding!") + } else { + ("recommended", "Scaling the cluster is recommended.") + }; + format!( + "\n- {}\n- {}\n- {}\n- {}\n- {}\n{}", + format_args!("Actor count per parallelism exceeds the {} limit.", limit_type), + format_args!("Depending on your workload, this may overload the cluster and cause performance/stability issues. {}", action), + "Contact us via slack or https://risingwave.com/contact-us/ for further enquiry.", + "You can bypass this check via SQL `SET bypass_cluster_limits TO true`.", + "You can check actor count distribution via SQL `SELECT * FROM rw_worker_actor_count`.", + violated_limit, + ) + }; + + let limits = self.env().meta_client().get_cluster_limits().await?; + for limit in limits { + match limit { + cluster_limit::ClusterLimit::ActorCount(l) => { + if l.exceed_hard_limit() { + return Err(RwError::from(ErrorCode::ProtocolError(gen_message( + &l, true, + )))); + } else if l.exceed_soft_limit() { + self.notice_to_user(gen_message(&l, false)); + } + } + } + } + Ok(()) + } } pub static SESSION_MANAGER: std::sync::OnceLock> = diff --git a/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs b/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs index d1251f2295642..9ab491ec3a41f 100644 --- a/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs +++ b/src/frontend/src/stream_fragmenter/graph/fragment_graph.rs @@ -19,8 +19,7 @@ use risingwave_pb::stream_plan::stream_fragment_graph::{ StreamFragment as StreamFragmentProto, StreamFragmentEdge as StreamFragmentEdgeProto, }; use risingwave_pb::stream_plan::{ - DispatchStrategy, FragmentTypeFlag, StreamContext, - StreamFragmentGraph as StreamFragmentGraphProto, StreamNode, + DispatchStrategy, FragmentTypeFlag, StreamFragmentGraph as StreamFragmentGraphProto, StreamNode, }; use thiserror_ext::AsReport; @@ -92,9 +91,6 @@ pub struct StreamFragmentGraph { /// stores edges between fragments: (upstream, downstream) => edge. edges: HashMap<(LocalFragmentId, LocalFragmentId), StreamFragmentEdgeProto>, - - /// Stores the streaming context for the streaming plan - ctx: StreamContext, } impl StreamFragmentGraph { @@ -106,8 +102,9 @@ impl StreamFragmentGraph { .map(|(k, v)| (*k, v.to_protobuf())) .collect(), edges: self.edges.values().cloned().collect(), - ctx: Some(self.ctx.clone()), - // To be filled later + + // Following fields will be filled later in `build_graph` based on session context. 
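// A minimal sketch, with simplified stand-in types, of the two-step
// construction noted in the comment above: `to_protobuf` leaves the
// session-dependent fields unset and `build_graph` patches them in later.
#[derive(Default, Debug)]
struct StreamFragmentGraphProto {
    fragments: Vec<String>,
    ctx_timezone: Option<String>,
}

fn to_protobuf(fragments: Vec<String>) -> StreamFragmentGraphProto {
    // Session-derived context is intentionally left unset here.
    StreamFragmentGraphProto { fragments, ..Default::default() }
}

fn build_graph(fragments: Vec<String>, session_timezone: String) -> StreamFragmentGraphProto {
    let mut graph = to_protobuf(fragments);
    graph.ctx_timezone = Some(session_timezone);
    graph
}

fn main() {
    let graph = build_graph(vec!["scan".into()], "UTC".into());
    assert_eq!(graph.ctx_timezone.as_deref(), Some("UTC"));
}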
+ ctx: None, dependent_table_ids: vec![], table_ids_cnt: 0, parallelism: None, diff --git a/src/frontend/src/stream_fragmenter/mod.rs b/src/frontend/src/stream_fragmenter/mod.rs index 66e9d5aff9e54..790f18d109a75 100644 --- a/src/frontend/src/stream_fragmenter/mod.rs +++ b/src/frontend/src/stream_fragmenter/mod.rs @@ -16,6 +16,7 @@ mod graph; use graph::*; use risingwave_common::util::recursive::{self, Recurse as _}; use risingwave_connector::WithPropertiesExt; +use risingwave_pb::stream_plan::stream_fragment_graph::Parallelism; use risingwave_pb::stream_plan::stream_node::NodeBody; mod rewrite; @@ -26,12 +27,13 @@ use educe::Educe; use risingwave_common::catalog::TableId; use risingwave_pb::plan_common::JoinType; use risingwave_pb::stream_plan::{ - DispatchStrategy, DispatcherType, ExchangeNode, FragmentTypeFlag, NoOpNode, + DispatchStrategy, DispatcherType, ExchangeNode, FragmentTypeFlag, NoOpNode, StreamContext, StreamFragmentGraph as StreamFragmentGraphProto, StreamNode, StreamScanType, }; use self::rewrite::build_delta_join_without_arrange; use crate::error::Result; +use crate::optimizer::plan_node::generic::GenericPlanRef; use crate::optimizer::plan_node::reorganize_elements_id; use crate::optimizer::PlanRef; use crate::scheduler::SchedulerResult; @@ -116,18 +118,38 @@ impl BuildFragmentGraphState { } pub fn build_graph(plan_node: PlanRef) -> SchedulerResult { + let ctx = plan_node.plan_base().ctx(); let plan_node = reorganize_elements_id(plan_node); let mut state = BuildFragmentGraphState::default(); let stream_node = plan_node.to_stream_prost(&mut state)?; generate_fragment_graph(&mut state, stream_node).unwrap(); let mut fragment_graph = state.fragment_graph.to_protobuf(); + + // Set table ids. fragment_graph.dependent_table_ids = state .dependent_table_ids .into_iter() .map(|id| id.table_id) .collect(); fragment_graph.table_ids_cnt = state.next_table_id; + + // Set parallelism. + { + let config = ctx.session_ctx().config(); + fragment_graph.parallelism = + config + .streaming_parallelism() + .map(|parallelism| Parallelism { + parallelism: parallelism.get(), + }); + } + + // Set timezone. 
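// A focused sketch of the `Option` mapping right above: an optional session
// setting (`streaming_parallelism`) becomes an optional proto field.
// `Parallelism` here is a stand-in for the generated message.
use std::num::NonZeroU64;

#[derive(Debug, PartialEq)]
struct Parallelism {
    parallelism: u64,
}

fn to_proto(streaming_parallelism: Option<NonZeroU64>) -> Option<Parallelism> {
    streaming_parallelism.map(|p| Parallelism { parallelism: p.get() })
}

fn main() {
    assert_eq!(to_proto(None), None);
    assert_eq!(to_proto(NonZeroU64::new(8)), Some(Parallelism { parallelism: 8 }));
}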
+ fragment_graph.ctx = Some(StreamContext { + timezone: ctx.get_session_timezone(), + }); + Ok(fragment_graph) } diff --git a/src/frontend/src/test_utils.rs b/src/frontend/src/test_utils.rs index ee6ff589e0cdb..6123889262155 100644 --- a/src/frontend/src/test_utils.rs +++ b/src/frontend/src/test_utils.rs @@ -30,6 +30,7 @@ use risingwave_common::catalog::{ }; use risingwave_common::session_config::SessionConfig; use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::util::cluster_limit::ClusterLimit; use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionDelta}; use risingwave_pb::backup_service::MetaSnapshotMetadata; @@ -1012,7 +1013,7 @@ impl FrontendMetaClient for MockFrontendMetaClient { Ok("".to_string()) } - async fn list_ddl_progress(&self) -> RpcResult> { + async fn get_ddl_progress(&self) -> RpcResult> { Ok(vec![]) } @@ -1065,7 +1066,7 @@ impl FrontendMetaClient for MockFrontendMetaClient { } async fn list_all_nodes(&self) -> RpcResult> { - unimplemented!() + Ok(vec![]) } async fn list_compact_task_progress(&self) -> RpcResult> { @@ -1097,6 +1098,10 @@ impl FrontendMetaClient for MockFrontendMetaClient { ) -> RpcResult> { unimplemented!() } + + async fn get_cluster_limits(&self) -> RpcResult> { + Ok(vec![]) + } } #[cfg(test)] diff --git a/src/jni_core/src/lib.rs b/src/jni_core/src/lib.rs index 419f4ffd21cb5..8b771629df315 100644 --- a/src/jni_core/src/lib.rs +++ b/src/jni_core/src/lib.rs @@ -320,6 +320,7 @@ impl<'a> Deref for JavaBindingIterator<'a> { #[no_mangle] extern "system" fn Java_com_risingwave_java_binding_Binding_vnodeCount(_env: EnvParam<'_>) -> jint { + // TODO(var-vnode): use vnode count from config VirtualNode::COUNT as jint } diff --git a/src/license/Cargo.toml b/src/license/Cargo.toml index 47e00228626b8..b435747467e21 100644 --- a/src/license/Cargo.toml +++ b/src/license/Cargo.toml @@ -15,7 +15,10 @@ ignored = ["workspace-hack"] normal = ["workspace-hack"] [dependencies] +jsonbb = { workspace = true } jsonwebtoken = "9" +risingwave_pb = { workspace = true } +risingwave_telemetry_event = { workspace = true } serde = { version = "1", features = ["derive"] } thiserror = "1" thiserror-ext = { workspace = true } diff --git a/src/license/src/feature.rs b/src/license/src/feature.rs index b7082c01dd7b4..0b888986db5c2 100644 --- a/src/license/src/feature.rs +++ b/src/license/src/feature.rs @@ -14,7 +14,7 @@ use thiserror::Error; -use super::{License, LicenseKeyError, LicenseManager, Tier}; +use super::{report_telemetry, License, LicenseKeyError, LicenseManager, Tier}; /// Define all features that are available based on the tier of the license. /// @@ -57,7 +57,6 @@ macro_rules! for_all_features { { SqlServerCdcSource, Paid, "CDC source connector for Sql Server." }, { CdcAutoSchemaChange, Paid, "Auto replicate upstream DDL to CDC Table." }, { IcebergSinkWithGlue, Paid, "Delivering data to Iceberg with Glue catalog." }, - { FileSink, Paid, "Delivering data to object storage."}, } }; } @@ -84,6 +83,14 @@ macro_rules! def_feature { )* } } + + fn get_feature_name(&self) -> &'static str { + match &self { + $( + Self::$name => stringify!($name), + )* + } + } } }; } @@ -113,7 +120,7 @@ pub enum FeatureNotAvailable { impl Feature { /// Check whether the feature is available based on the current license. 
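// A minimal sketch, under assumed simplified types, of the "check the tier,
// then report the outcome" flow that `check_available` below implements via
// `report_telemetry`; the tiers and the reporting closure are stand-ins.
#[derive(PartialEq, PartialOrd, Clone, Copy, Debug)]
enum Tier {
    Free,
    Paid,
}

#[derive(Debug)]
struct FeatureNotAvailable {
    feature: &'static str,
    required: Tier,
}

fn check_available(
    feature: &'static str,
    required: Tier,
    current: Tier,
    report: impl Fn(&'static str, bool),
) -> Result<(), FeatureNotAvailable> {
    let result = if current >= required {
        Ok(())
    } else {
        Err(FeatureNotAvailable { feature, required })
    };
    // Report the outcome regardless of success or failure, then return it.
    report(feature, result.is_ok());
    result
}

fn main() {
    let res = check_available("TestPaid", Tier::Paid, Tier::Free, |name, ok| {
        println!("feature check: {name}, ok = {ok}")
    });
    assert!(res.is_err());
}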
pub fn check_available(self) -> Result<(), FeatureNotAvailable> { - match LicenseManager::get().license() { + let check_res = match LicenseManager::get().license() { Ok(license) => { if license.tier >= self.min_tier() { Ok(()) @@ -136,6 +143,10 @@ impl Feature { }) } } - } + }; + + report_telemetry(&self, self.get_feature_name(), check_res.is_ok()); + + check_res } } diff --git a/src/license/src/lib.rs b/src/license/src/lib.rs index e2a3275780098..cf62dbab1d491 100644 --- a/src/license/src/lib.rs +++ b/src/license/src/lib.rs @@ -20,3 +20,26 @@ mod manager; pub use feature::*; pub use key::*; pub use manager::*; +use risingwave_pb::telemetry::PbTelemetryEventStage; +use risingwave_telemetry_event::report_event_common; + +pub(crate) fn report_telemetry(feature: &Feature, feature_name: &str, success_flag: bool) { + if matches!(feature, Feature::TestPaid) { + let mut attr_builder = jsonbb::Builder::>::new(); + attr_builder.begin_object(); + attr_builder.add_string("success"); + attr_builder.add_value(jsonbb::ValueRef::Bool(success_flag)); + attr_builder.end_object(); + let attr = attr_builder.finish(); + + report_event_common( + PbTelemetryEventStage::Unspecified, + feature_name, + 0, + None, + None, + Some(attr), + "paywall".to_string(), + ); + } +} diff --git a/src/meta/Cargo.toml b/src/meta/Cargo.toml index 4511e9f61d894..a7f37bf505910 100644 --- a/src/meta/Cargo.toml +++ b/src/meta/Cargo.toml @@ -28,6 +28,7 @@ clap = { workspace = true } comfy-table = "7" crepe = "0.1" easy-ext = "1" +educe = "0.6" either = "1" enum-as-inner = "0.6" etcd-client = { workspace = true } diff --git a/src/meta/model_v2/migration/src/lib.rs b/src/meta/model_v2/migration/src/lib.rs index 08291e5b163d5..0b09f3c4d4e11 100644 --- a/src/meta/model_v2/migration/src/lib.rs +++ b/src/meta/model_v2/migration/src/lib.rs @@ -20,6 +20,7 @@ mod m20240702_080451_system_param_value; mod m20240702_084927_unnecessary_fk; mod m20240726_063833_auto_schema_change; mod m20240806_143329_add_rate_limit_to_source_catalog; +mod m20240820_081248_add_time_travel_per_table_epoch; pub struct Migrator; @@ -45,6 +46,7 @@ impl MigratorTrait for Migrator { Box::new(m20240702_084927_unnecessary_fk::Migration), Box::new(m20240726_063833_auto_schema_change::Migration), Box::new(m20240806_143329_add_rate_limit_to_source_catalog::Migration), + Box::new(m20240820_081248_add_time_travel_per_table_epoch::Migration), ] } } diff --git a/src/meta/model_v2/migration/src/m20240820_081248_add_time_travel_per_table_epoch.rs b/src/meta/model_v2/migration/src/m20240820_081248_add_time_travel_per_table_epoch.rs new file mode 100644 index 0000000000000..85d9475aa8f01 --- /dev/null +++ b/src/meta/model_v2/migration/src/m20240820_081248_add_time_travel_per_table_epoch.rs @@ -0,0 +1,197 @@ +use sea_orm_migration::prelude::*; + +#[derive(DeriveMigrationName)] +pub struct Migration; + +const TABLE_NAME: &str = "hummock_epoch_to_version"; + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + // modify PK + match manager.get_database_backend() { + sea_orm::DatabaseBackend::MySql => { + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .add_column( + ColumnDef::new(HummockEpochToVersion::TableId).big_integer(), + ) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::MySql, + format!("ALTER TABLE {TABLE_NAME} DROP PRIMARY KEY, ADD PRIMARY KEY (epoch, table_id)"), + )) + .await?; 
+ } + sea_orm::DatabaseBackend::Postgres => { + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .add_column( + ColumnDef::new(HummockEpochToVersion::TableId).big_integer(), + ) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} DROP CONSTRAINT {TABLE_NAME}_pkey"), + )) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} ADD PRIMARY KEY (epoch, table_id)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Sqlite => { + // sqlite is not for prod usage, so recreating the table is fine. + manager + .drop_table( + sea_orm_migration::prelude::Table::drop() + .table(HummockEpochToVersion::Table) + .if_exists() + .cascade() + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockEpochToVersion::Table) + .if_not_exists() + .col( + ColumnDef::new(HummockEpochToVersion::Epoch) + .big_integer() + .not_null(), + ) + .col( + ColumnDef::new(HummockEpochToVersion::TableId) + .big_integer() + .not_null(), + ) + .col( + ColumnDef::new(HummockEpochToVersion::VersionId) + .big_integer() + .not_null(), + ) + .primary_key( + Index::create() + .col(HummockEpochToVersion::Epoch) + .col(HummockEpochToVersion::TableId), + ) + .to_owned(), + ) + .await?; + } + } + Ok(()) + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + // The downgrade for MySql and Postgres may not work due to PK confliction. + match manager.get_database_backend() { + sea_orm::DatabaseBackend::MySql => { + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::MySql, + format!("ALTER TABLE {TABLE_NAME} DROP PRIMARY KEY"), + )) + .await?; + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .drop_column(HummockEpochToVersion::TableId) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::MySql, + format!("ALTER TABLE {TABLE_NAME} ADD PRIMARY KEY (epoch)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Postgres => { + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} DROP CONSTRAINT {TABLE_NAME}_pkey"), + )) + .await?; + manager + .alter_table( + Table::alter() + .table(HummockEpochToVersion::Table) + .drop_column(HummockEpochToVersion::TableId) + .to_owned(), + ) + .await?; + manager + .get_connection() + .execute(sea_orm::Statement::from_string( + sea_orm::DatabaseBackend::Postgres, + format!("ALTER TABLE {TABLE_NAME} ADD PRIMARY KEY (epoch)"), + )) + .await?; + } + sea_orm::DatabaseBackend::Sqlite => { + manager + .drop_table( + sea_orm_migration::prelude::Table::drop() + .table(HummockEpochToVersion::Table) + .if_exists() + .cascade() + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockEpochToVersion::Table) + .if_not_exists() + .col( + ColumnDef::new(HummockEpochToVersion::Epoch) + .big_integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(HummockEpochToVersion::VersionId) + .big_integer() + .not_null(), + ) + .to_owned(), + ) + .await?; + } + } + + Ok(()) + } +} + +#[derive(DeriveIden)] +enum HummockEpochToVersion { + Table, + Epoch, + TableId, + VersionId, +} diff --git 
a/src/meta/model_v2/src/hummock_epoch_to_version.rs b/src/meta/model_v2/src/hummock_epoch_to_version.rs index 181b1b320bc54..f54551aa80178 100644 --- a/src/meta/model_v2/src/hummock_epoch_to_version.rs +++ b/src/meta/model_v2/src/hummock_epoch_to_version.rs @@ -22,6 +22,8 @@ use crate::{Epoch, HummockVersionId}; pub struct Model { #[sea_orm(primary_key, auto_increment = false)] pub epoch: Epoch, + #[sea_orm(primary_key, auto_increment = false)] + pub table_id: i64, pub version_id: HummockVersionId, } diff --git a/src/meta/node/src/lib.rs b/src/meta/node/src/lib.rs index 049519372c81e..6fa88fd412e31 100644 --- a/src/meta/node/src/lib.rs +++ b/src/meta/node/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] #![feature(let_chains)] #![cfg_attr(coverage, feature(coverage_attribute))] @@ -457,6 +456,14 @@ pub fn start( table_info_statistic_history_times: config .storage .table_info_statistic_history_times, + actor_cnt_per_worker_parallelism_hard_limit: config + .meta + .developer + .actor_cnt_per_worker_parallelism_hard_limit, + actor_cnt_per_worker_parallelism_soft_limit: config + .meta + .developer + .actor_cnt_per_worker_parallelism_soft_limit, }, config.system.into_init_system_params(), Default::default(), diff --git a/src/meta/node/src/server.rs b/src/meta/node/src/server.rs index 1f0f7f6a3fe8e..11b22014f9f98 100644 --- a/src/meta/node/src/server.rs +++ b/src/meta/node/src/server.rs @@ -27,7 +27,6 @@ use risingwave_common::telemetry::manager::TelemetryManager; use risingwave_common::telemetry::{report_scarf_enabled, report_to_scarf, telemetry_env_enabled}; use risingwave_common::util::tokio_util::sync::CancellationToken; use risingwave_common_service::{MetricsManager, TracingExtractLayer}; -use risingwave_meta::barrier::StreamRpcManager; use risingwave_meta::controller::catalog::CatalogController; use risingwave_meta::controller::cluster::ClusterController; use risingwave_meta::manager::{ @@ -40,6 +39,7 @@ use risingwave_meta::stream::ScaleController; use risingwave_meta::MetaStoreBackend; use risingwave_meta_service::backup_service::BackupServiceImpl; use risingwave_meta_service::cloud_service::CloudServiceImpl; +use risingwave_meta_service::cluster_limit_service::ClusterLimitServiceImpl; use risingwave_meta_service::cluster_service::ClusterServiceImpl; use risingwave_meta_service::ddl_service::DdlServiceImpl; use risingwave_meta_service::event_log_service::EventLogServiceImpl; @@ -63,6 +63,7 @@ use risingwave_pb::connector_service::sink_coordination_service_server::SinkCoor use risingwave_pb::ddl_service::ddl_service_server::DdlServiceServer; use risingwave_pb::health::health_server::HealthServer; use risingwave_pb::hummock::hummock_manager_service_server::HummockManagerServiceServer; +use risingwave_pb::meta::cluster_limit_service_server::ClusterLimitServiceServer; use risingwave_pb::meta::cluster_service_server::ClusterServiceServer; use risingwave_pb::meta::event_log_service_server::EventLogServiceServer; use risingwave_pb::meta::heartbeat_service_server::HeartbeatServiceServer; @@ -550,12 +551,9 @@ pub async fn start_service_as_election_leader( // TODO(shutdown): remove this as there's no need to gracefully shutdown some of these sub-tasks. 
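// An illustrative std-only sketch of what the `hummock_epoch_to_version`
// model change above amounts to: the epoch-to-version mapping is now keyed
// per table, i.e. by (epoch, table_id) instead of epoch alone. Types are
// simplified stand-ins.
use std::collections::BTreeMap;

type Epoch = u64;
type TableId = i64;
type HummockVersionId = u64;

fn main() {
    let mut epoch_to_version: BTreeMap<(Epoch, TableId), HummockVersionId> = BTreeMap::new();
    // The same epoch may now map to different versions for different tables.
    epoch_to_version.insert((100, 1), 7);
    epoch_to_version.insert((100, 2), 8);
    assert_eq!(epoch_to_version.get(&(100, 2)), Some(&8));
}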
let mut sub_tasks = vec![shutdown_handle]; - let stream_rpc_manager = StreamRpcManager::new(env.clone()); - let scale_controller = Arc::new(ScaleController::new( &metadata_manager, source_manager.clone(), - stream_rpc_manager.clone(), env.clone(), )); @@ -567,7 +565,6 @@ pub async fn start_service_as_election_leader( source_manager.clone(), sink_manager.clone(), meta_metrics.clone(), - stream_rpc_manager.clone(), scale_controller.clone(), ) .await; @@ -585,7 +582,6 @@ pub async fn start_service_as_election_leader( metadata_manager.clone(), barrier_scheduler.clone(), source_manager.clone(), - stream_rpc_manager, scale_controller.clone(), ) .unwrap(), @@ -657,6 +653,7 @@ pub async fn start_service_as_election_leader( ServingServiceImpl::new(serving_vnode_mapping.clone(), metadata_manager.clone()); let cloud_srv = CloudServiceImpl::new(metadata_manager.clone(), aws_cli); let event_log_srv = EventLogServiceImpl::new(env.event_log_manager_ref()); + let cluster_limit_srv = ClusterLimitServiceImpl::new(env.clone(), metadata_manager.clone()); if let Some(prometheus_addr) = address_info.prometheus_addr { MetricsManager::boot_metrics_service(prometheus_addr.to_string()) @@ -795,7 +792,8 @@ pub async fn start_service_as_election_leader( .add_service(ServingServiceServer::new(serving_srv)) .add_service(CloudServiceServer::new(cloud_srv)) .add_service(SinkCoordinationServiceServer::new(sink_coordination_srv)) - .add_service(EventLogServiceServer::new(event_log_srv)); + .add_service(EventLogServiceServer::new(event_log_srv)) + .add_service(ClusterLimitServiceServer::new(cluster_limit_srv)); #[cfg(not(madsim))] // `otlp-embedded` does not use madsim-patched tonic let server_builder = server_builder.add_service(TraceServiceServer::new(trace_srv)); diff --git a/src/meta/service/src/cluster_limit_service.rs b/src/meta/service/src/cluster_limit_service.rs new file mode 100644 index 0000000000000..df19b24b234e6 --- /dev/null +++ b/src/meta/service/src/cluster_limit_service.rs @@ -0,0 +1,107 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
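// A hypothetical sketch of the per-worker check that the new
// `ClusterLimitServiceImpl` below builds on: a worker violates an actor-count
// limit when its actor count exceeds `parallelism * limit_per_parallelism`.
// The shapes here are simplified, not the real meta-service types.
use std::collections::HashMap;

struct WorkerActorCount {
    actor_count: usize,
    parallelism: usize,
}

fn violating_workers(
    workers: &HashMap<u32, WorkerActorCount>,
    limit_per_parallelism: usize,
) -> Vec<u32> {
    workers
        .iter()
        .filter(|(_, w)| w.actor_count > w.parallelism * limit_per_parallelism)
        .map(|(id, _)| *id)
        .collect()
}

fn main() {
    let mut workers = HashMap::new();
    workers.insert(1, WorkerActorCount { actor_count: 5_000, parallelism: 8 });
    workers.insert(2, WorkerActorCount { actor_count: 100, parallelism: 8 });
    // With an example limit of 100 actors per parallelism unit, only worker 1 violates it.
    assert_eq!(violating_workers(&workers, 100), vec![1]);
}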
+ +use std::collections::HashMap; + +use risingwave_common::util::cluster_limit::{ + ActorCountPerParallelism, ClusterLimit, WorkerActorCount, +}; +use risingwave_meta::manager::{MetaSrvEnv, MetadataManager, WorkerId}; +use risingwave_meta::MetaResult; +use risingwave_pb::common::worker_node::State; +use risingwave_pb::common::WorkerType; +use risingwave_pb::meta::cluster_limit_service_server::ClusterLimitService; +use risingwave_pb::meta::{GetClusterLimitsRequest, GetClusterLimitsResponse}; +use tonic::{Request, Response, Status}; + +#[derive(Clone)] +pub struct ClusterLimitServiceImpl { + env: MetaSrvEnv, + metadata_manager: MetadataManager, +} + +impl ClusterLimitServiceImpl { + pub fn new(env: MetaSrvEnv, metadata_manager: MetadataManager) -> Self { + ClusterLimitServiceImpl { + env, + metadata_manager, + } + } + + async fn get_active_actor_limit(&self) -> MetaResult> { + let (soft_limit, hard_limit) = ( + self.env.opts.actor_cnt_per_worker_parallelism_soft_limit, + self.env.opts.actor_cnt_per_worker_parallelism_hard_limit, + ); + + let running_worker_parallelism: HashMap = self + .metadata_manager + .list_worker_node(Some(WorkerType::ComputeNode), Some(State::Running)) + .await? + .into_iter() + .map(|e| (e.id, e.parallelism())) + .collect(); + let worker_actor_count: HashMap = self + .metadata_manager + .worker_actor_count() + .await? + .into_iter() + .filter_map(|(worker_id, actor_count)| { + running_worker_parallelism + .get(&worker_id) + .map(|parallelism| { + ( + worker_id, + WorkerActorCount { + actor_count, + parallelism: *parallelism, + }, + ) + }) + }) + .collect(); + + let limit = ActorCountPerParallelism { + worker_id_to_actor_count: worker_actor_count, + hard_limit, + soft_limit, + }; + + if limit.exceed_limit() { + Ok(Some(ClusterLimit::ActorCount(limit))) + } else { + Ok(None) + } + } +} + +#[async_trait::async_trait] +impl ClusterLimitService for ClusterLimitServiceImpl { + #[cfg_attr(coverage, coverage(off))] + async fn get_cluster_limits( + &self, + _request: Request, + ) -> Result, Status> { + // TODO: support more limits + match self.get_active_actor_limit().await { + Ok(Some(limit)) => Ok(Response::new(GetClusterLimitsResponse { + active_limits: vec![limit.into()], + })), + Ok(None) => Ok(Response::new(GetClusterLimitsResponse { + active_limits: vec![], + })), + Err(e) => Err(e.into()), + } + } +} diff --git a/src/meta/service/src/hummock_service.rs b/src/meta/service/src/hummock_service.rs index 21e203d8440bd..c3fc2da229585 100644 --- a/src/meta/service/src/hummock_service.rs +++ b/src/meta/service/src/hummock_service.rs @@ -457,7 +457,7 @@ impl HummockManagerService for HummockServiceImpl { let req = request.into_inner(); let new_group_id = self .hummock_manager - .split_compaction_group(req.group_id, &req.table_ids) + .split_compaction_group(req.group_id, &req.table_ids, req.partition_vnode_count) .await?; Ok(Response::new(SplitCompactionGroupResponse { new_group_id })) } @@ -710,12 +710,26 @@ impl HummockManagerService for HummockServiceImpl { &self, request: Request, ) -> Result, Status> { - let GetVersionByEpochRequest { epoch } = request.into_inner(); - let version = self.hummock_manager.epoch_to_version(epoch).await?; + let GetVersionByEpochRequest { epoch, table_id } = request.into_inner(); + let version = self + .hummock_manager + .epoch_to_version(epoch, table_id) + .await?; Ok(Response::new(GetVersionByEpochResponse { version: Some(version.to_protobuf()), })) } + + async fn merge_compaction_group( + &self, + request: Request, + ) -> Result, Status> { + 
let req = request.into_inner(); + self.hummock_manager + .merge_compaction_group(req.left_group_id, req.right_group_id) + .await?; + Ok(Response::new(MergeCompactionGroupResponse {})) + } } #[cfg(test)] diff --git a/src/meta/service/src/lib.rs b/src/meta/service/src/lib.rs index 9ab248802772e..2e327dc47a59e 100644 --- a/src/meta/service/src/lib.rs +++ b/src/meta/service/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(impl_trait_in_assoc_type)] #![cfg_attr(coverage, feature(coverage_attribute))] @@ -21,6 +20,7 @@ use risingwave_meta::*; pub mod backup_service; pub mod cloud_service; +pub mod cluster_limit_service; pub mod cluster_service; pub mod ddl_service; pub mod event_log_service; diff --git a/src/meta/src/backup_restore/restore_impl/v2.rs b/src/meta/src/backup_restore/restore_impl/v2.rs index a887293e0c8ef..938050ce4d300 100644 --- a/src/meta/src/backup_restore/restore_impl/v2.rs +++ b/src/meta/src/backup_restore/restore_impl/v2.rs @@ -106,8 +106,8 @@ impl Writer for WriterModelV2ToMetaStoreV2 { insert_models(metadata.workers.clone(), db).await?; insert_models(metadata.worker_properties.clone(), db).await?; insert_models(metadata.users.clone(), db).await?; - insert_models(metadata.user_privileges.clone(), db).await?; insert_models(metadata.objects.clone(), db).await?; + insert_models(metadata.user_privileges.clone(), db).await?; insert_models(metadata.object_dependencies.clone(), db).await?; insert_models(metadata.databases.clone(), db).await?; insert_models(metadata.schemas.clone(), db).await?; diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index 0bea5f37940d6..577a0bef25360 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -16,7 +16,6 @@ use std::collections::{HashMap, HashSet}; use std::fmt::Formatter; use futures::future::try_join_all; -use itertools::Itertools; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; use risingwave_common::hash::ActorMapping; @@ -78,6 +77,7 @@ pub struct Reschedule { /// Reassigned splits for source actors. /// It becomes the `actor_splits` in [`UpdateMutation`]. + /// `Source` and `SourceBackfill` are handled together here. pub actor_splits: HashMap>, /// Whether this fragment is injectable. The injectable means whether the fragment contains @@ -146,8 +146,10 @@ impl ReplaceTablePlan { } } -#[derive(Debug, Clone)] +#[derive(educe::Educe, Clone)] +#[educe(Debug)] pub struct CreateStreamingJobCommandInfo { + #[educe(Debug(ignore))] pub table_fragments: TableFragments, /// Refer to the doc on [`MetadataManager::get_upstream_root_fragments`] for the meaning of "root". pub upstream_root_actors: HashMap>, @@ -496,16 +498,16 @@ impl CommandContext { } } -impl CommandContext { +impl Command { /// Generate a mutation for the given command. - pub fn to_mutation(&self) -> Option { + pub fn to_mutation(&self, current_paused_reason: Option<&PausedReason>) -> Option { let mutation = - match &self.command { + match self { Command::Plain(mutation) => mutation.clone(), Command::Pause(_) => { // Only pause when the cluster is not already paused. - if self.current_paused_reason.is_none() { + if current_paused_reason.is_none() { Some(Mutation::Pause(PauseMutation {})) } else { None @@ -514,7 +516,7 @@ impl CommandContext { Command::Resume(reason) => { // Only resume when the cluster is paused with the same reason. 
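// A simplified stand-in for the pause/resume decision in this hunk's
// `to_mutation` logic: pause only when not already paused, and resume only
// when the cluster is currently paused for the same reason. Enum names are
// illustrative, not the real protobuf types.
#[derive(Clone, Copy, PartialEq, Debug)]
enum PausedReason {
    Manual,
    Backfill,
}

enum Command {
    Pause(PausedReason),
    Resume(PausedReason),
}

#[derive(Debug, PartialEq)]
enum Mutation {
    Pause,
    Resume,
}

fn to_mutation(cmd: &Command, current_paused_reason: Option<&PausedReason>) -> Option<Mutation> {
    match cmd {
        // Only pause when the cluster is not already paused.
        Command::Pause(_) => current_paused_reason.is_none().then_some(Mutation::Pause),
        // Only resume when the cluster is paused with the same reason.
        Command::Resume(reason) => {
            (current_paused_reason == Some(reason)).then_some(Mutation::Resume)
        }
    }
}

fn main() {
    assert_eq!(
        to_mutation(&Command::Pause(PausedReason::Manual), None),
        Some(Mutation::Pause)
    );
    assert_eq!(
        to_mutation(&Command::Resume(PausedReason::Manual), Some(&PausedReason::Backfill)),
        None
    );
}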
- if self.current_paused_reason == Some(*reason) { + if current_paused_reason == Some(reason) { Some(Mutation::Resume(ResumeMutation {})) } else { None @@ -606,7 +608,7 @@ impl CommandContext { added_actors, actor_splits, // If the cluster is already paused, the new actors should be paused too. - pause: self.current_paused_reason.is_some(), + pause: current_paused_reason.is_some(), subscriptions_to_add, })); @@ -845,7 +847,7 @@ impl CommandContext { } pub fn actors_to_create(&self) -> Option>> { - match &self.command { + match self { Command::CreateStreamingJob { info, job_type } => { let mut map = match job_type { CreateStreamingJobType::Normal => HashMap::new(), @@ -913,6 +915,13 @@ impl CommandContext { ..Default::default() })) } +} + +impl CommandContext { + pub fn to_mutation(&self) -> Option { + self.command + .to_mutation(self.current_paused_reason.as_ref()) + } /// Returns the paused reason after executing the current command. pub fn next_paused_reason(&self) -> Option { @@ -951,19 +960,6 @@ impl Command { } impl CommandContext { - /// Clean up actors in CNs if needed, used by drop, cancel and reschedule commands. - async fn clean_up(&self, actors: Vec) -> MetaResult<()> { - self.barrier_manager_context - .stream_rpc_manager - .drop_actors( - &self.node_map, - self.node_map - .keys() - .map(|worker_id| (*worker_id, actors.clone())), - ) - .await - } - pub async fn wait_epoch_commit(&self, epoch: HummockEpoch) -> MetaResult<()> { let futures = self.node_map.values().map(|worker_node| async { let client = self @@ -1013,13 +1009,9 @@ impl CommandContext { } Command::DropStreamingJobs { - actors, unregistered_state_table_ids, .. } => { - // Tell compute nodes to drop actors. - self.clean_up(actors.clone()).await?; - self.barrier_manager_context .hummock_manager .unregister_table_ids(unregistered_state_table_ids.iter().cloned()) @@ -1028,7 +1020,6 @@ impl CommandContext { Command::CancelStreamingJob(table_fragments) => { tracing::debug!(id = ?table_fragments.table_id(), "cancelling stream job"); - self.clean_up(table_fragments.actor_ids()).await?; // NOTE(kwannoel): At this point, meta has already registered the table ids. // We should unregister them. @@ -1128,8 +1119,6 @@ impl CommandContext { .. }) = job_type { - self.clean_up(old_table_fragments.actor_ids()).await?; - // Drop fragment info in meta store. mgr.fragment_manager .post_replace_table( @@ -1156,13 +1145,9 @@ impl CommandContext { new_table_fragments, dispatchers, init_split_assignment, - old_table_fragments, .. }) = job_type { - // Tell compute nodes to drop actors. - self.clean_up(old_table_fragments.actor_ids()).await?; - mgr.catalog_controller .post_collect_table_fragments( new_table_fragments.table_id().table_id as _, @@ -1193,11 +1178,6 @@ impl CommandContext { table_parallelism, .. } => { - let removed_actors = reschedules - .values() - .flat_map(|reschedule| reschedule.removed_actors.clone().into_iter()) - .collect_vec(); - self.clean_up(removed_actors).await?; self.barrier_manager_context .scale_controller .post_apply_reschedule(reschedules, table_parallelism) @@ -1212,8 +1192,6 @@ impl CommandContext { init_split_assignment, .. }) => { - self.clean_up(old_table_fragments.actor_ids()).await?; - match &self.barrier_manager_context.metadata_manager { MetadataManager::V1(mgr) => { // Drop fragment info in meta store. 
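// A small sketch of the fan-out-and-fail-fast pattern used by
// `wait_epoch_commit` above, with the per-worker RPC mocked out; the real
// code sends a request to every compute node and awaits them together.
use futures::executor::block_on;
use futures::future::try_join_all;

async fn wait_one(worker_id: u32, epoch: u64) -> Result<(), String> {
    if worker_id == 0 {
        Err(format!("worker {worker_id} unreachable at epoch {epoch}"))
    } else {
        Ok(())
    }
}

async fn wait_all(workers: &[u32], epoch: u64) -> Result<(), String> {
    // Resolves when every worker reports the epoch as committed, or on the first error.
    try_join_all(workers.iter().map(|w| wait_one(*w, epoch))).await?;
    Ok(())
}

fn main() {
    assert!(block_on(wait_all(&[1, 2, 3], 100)).is_ok());
    assert!(block_on(wait_all(&[0, 1], 100)).is_err());
}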
diff --git a/src/meta/src/barrier/creating_job/mod.rs b/src/meta/src/barrier/creating_job/mod.rs index c5a52437e2b7d..9e4e52b0e36b8 100644 --- a/src/meta/src/barrier/creating_job/mod.rs +++ b/src/meta/src/barrier/creating_job/mod.rs @@ -28,6 +28,7 @@ use risingwave_common::util::epoch::Epoch; use risingwave_pb::common::WorkerNode; use risingwave_pb::ddl_service::DdlProgress; use risingwave_pb::hummock::HummockVersionStats; +use risingwave_pb::stream_plan::barrier_mutation::Mutation; use risingwave_pb::stream_service::{BarrierCompleteResponse, BuildActorInfo}; use tracing::{debug, info}; @@ -67,6 +68,7 @@ impl CreatingStreamingJobControl { backfill_epoch: u64, version_stat: &HummockVersionStats, metrics: &MetaMetrics, + initial_mutation: Mutation, ) -> Self { info!( table_id = info.table_fragments.table_id().table_id, @@ -108,7 +110,7 @@ impl CreatingStreamingJobControl { backfill_epoch, pending_non_checkpoint_barriers: vec![], snapshot_backfill_actors, - actors_to_create: Some( + initial_barrier_info: Some(( actors_to_create .into_iter() .map(|(worker_id, actors)| { @@ -124,7 +126,8 @@ impl CreatingStreamingJobControl { ) }) .collect(), - ), + initial_mutation, + )), }, upstream_lag: metrics .snapshot_backfill_lag @@ -283,11 +286,12 @@ impl CreatingStreamingJobControl { prev_epoch, kind, new_actors, + mutation, } in barriers_to_inject { let node_to_collect = control_stream_manager.inject_barrier( Some(table_id), - None, + mutation, (&curr_epoch, &prev_epoch), &kind, graph_info, diff --git a/src/meta/src/barrier/creating_job/status.rs b/src/meta/src/barrier/creating_job/status.rs index 0569752b1056b..f5d4c37d247a6 100644 --- a/src/meta/src/barrier/creating_job/status.rs +++ b/src/meta/src/barrier/creating_job/status.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use risingwave_common::util::epoch::Epoch; use risingwave_pb::hummock::HummockVersionStats; +use risingwave_pb::stream_plan::barrier_mutation::Mutation; use risingwave_pb::stream_service::barrier_complete_response::CreateMviewProgress; use risingwave_pb::stream_service::BuildActorInfo; @@ -40,7 +41,9 @@ pub(super) enum CreatingStreamingJobStatus { /// The `prev_epoch` of pending non checkpoint barriers pending_non_checkpoint_barriers: Vec, snapshot_backfill_actors: HashMap>, - actors_to_create: Option>>, + /// Info of the first barrier: (`actors_to_create`, `mutation`) + /// Take the mutation out when injecting the first barrier + initial_barrier_info: Option<(HashMap>, Mutation)>, }, ConsumingLogStore { graph_info: InflightGraphInfo, @@ -60,6 +63,7 @@ pub(super) struct CreatingJobInjectBarrierInfo { pub prev_epoch: TracedEpoch, pub kind: BarrierKind, pub new_actors: Option>>, + pub mutation: Option, } impl CreatingStreamingJobStatus { @@ -104,12 +108,12 @@ impl CreatingStreamingJobStatus { graph_info, pending_non_checkpoint_barriers, ref backfill_epoch, - actors_to_create, + initial_barrier_info, .. 
} = self { if create_mview_tracker.has_pending_finished_jobs() { - assert!(actors_to_create.is_none()); + assert!(initial_barrier_info.is_none()); pending_non_checkpoint_barriers.push(*backfill_epoch); let prev_epoch = Epoch::from_physical_time(*prev_epoch_fake_physical_time); @@ -119,6 +123,7 @@ impl CreatingStreamingJobStatus { prev_epoch: TracedEpoch::new(prev_epoch), kind: BarrierKind::Checkpoint(take(pending_non_checkpoint_barriers)), new_actors: None, + mutation: None, }] .into_iter() .chain(pending_commands.drain(..).map(|command_ctx| { @@ -127,6 +132,7 @@ impl CreatingStreamingJobStatus { prev_epoch: command_ctx.prev_epoch.clone(), kind: command_ctx.kind.clone(), new_actors: None, + mutation: None, } })) .collect(); @@ -145,12 +151,19 @@ impl CreatingStreamingJobStatus { } else { BarrierKind::Barrier }; + let (new_actors, mutation) = + if let Some((new_actors, mutation)) = initial_barrier_info.take() { + (Some(new_actors), Some(mutation)) + } else { + Default::default() + }; Some(( vec![CreatingJobInjectBarrierInfo { curr_epoch, prev_epoch, kind, - new_actors: actors_to_create.take(), + new_actors, + mutation, }], None, )) diff --git a/src/meta/src/barrier/mod.rs b/src/meta/src/barrier/mod.rs index 5fc9dc5112a65..0772bac6699e1 100644 --- a/src/meta/src/barrier/mod.rs +++ b/src/meta/src/barrier/mod.rs @@ -86,7 +86,6 @@ pub use self::command::{ Reschedule, SnapshotBackfillInfo, }; pub use self::info::InflightSubscriptionInfo; -pub use self::rpc::StreamRpcManager; pub use self::schedule::BarrierScheduler; pub use self::trace::TracedEpoch; @@ -172,8 +171,6 @@ pub struct GlobalBarrierManagerContext { pub(super) metrics: Arc, - stream_rpc_manager: StreamRpcManager, - env: MetaSrvEnv, } @@ -596,7 +593,6 @@ impl GlobalBarrierManager { source_manager: SourceManagerRef, sink_manager: SinkCoordinatorManager, metrics: Arc, - stream_rpc_manager: StreamRpcManager, scale_controller: ScaleControllerRef, ) -> Self { let enable_recovery = env.opts.enable_recovery; @@ -624,7 +620,6 @@ impl GlobalBarrierManager { scale_controller, sink_manager, metrics, - stream_rpc_manager, env: env.clone(), }; @@ -768,7 +763,9 @@ impl GlobalBarrierManager { if let Some(request) = request { match request { BarrierManagerRequest::GetDdlProgress(result_tx) => { + // Progress of normal backfill let mut progress = self.checkpoint_control.create_mview_tracker.gen_ddl_progress(); + // Progress of snapshot backfill for creating_job in self.checkpoint_control.creating_streaming_job_controls.values() { progress.extend([(creating_job.info.table_fragments.table_id().table_id, creating_job.gen_ddl_progress())]); } @@ -965,6 +962,19 @@ impl GlobalBarrierManager { info, } = &command { + if self.state.paused_reason().is_some() { + warn!("cannot create streaming job with snapshot backfill when paused"); + for notifier in notifiers { + notifier.notify_start_failed( + anyhow!("cannot create streaming job with snapshot backfill when paused",) + .into(), + ); + } + return Ok(()); + } + let mutation = command + .to_mutation(None) + .expect("should have some mutation in `CreateStreamingJob` command"); self.checkpoint_control .creating_streaming_job_controls .insert( @@ -975,6 +985,7 @@ impl GlobalBarrierManager { prev_epoch.value().0, &self.checkpoint_control.hummock_version_stats, &self.context.metrics, + mutation, ), ); } @@ -1625,6 +1636,7 @@ impl GlobalBarrierManagerContext { Ok(info) } + /// Serving `SHOW JOBS / SELECT * FROM rw_ddl_progress` pub async fn get_ddl_progress(&self) -> MetaResult> { let mut ddl_progress = { let (tx, rx) 
= oneshot::channel(); diff --git a/src/meta/src/barrier/progress.rs b/src/meta/src/barrier/progress.rs index 5754e4c60e364..2e1b6f9dc397a 100644 --- a/src/meta/src/barrier/progress.rs +++ b/src/meta/src/barrier/progress.rs @@ -55,6 +55,7 @@ pub(super) struct Progress { upstream_mv_count: HashMap, /// Total key count in the upstream materialized view + /// TODO: implement this for source backfill upstream_total_key_count: u64, /// Consumed rows @@ -122,6 +123,12 @@ impl Progress { /// Returns whether all backfill executors are done. fn is_done(&self) -> bool { + tracing::trace!( + "Progress::is_done? {}, {}, {:?}", + self.done_count, + self.states.len(), + self.states + ); self.done_count == self.states.len() } @@ -274,6 +281,7 @@ pub(super) struct TrackingCommand { /// 4. With `actor_map` we can use an actor's `ActorId` to find the ID of the `StreamJob`. #[derive(Default, Debug)] pub(super) struct CreateMviewProgressTracker { + // TODO: add a specialized progress for source /// Progress of the create-mview DDL indicated by the `TableId`. progress_map: HashMap, @@ -494,6 +502,7 @@ impl CreateMviewProgressTracker { replace_table: Option<&ReplaceTablePlan>, version_stats: &HummockVersionStats, ) -> Option { + tracing::trace!(?info, "add job to track"); let (info, actors, replace_table_info) = { let CreateStreamingJobCommandInfo { table_fragments, .. @@ -596,6 +605,7 @@ impl CreateMviewProgressTracker { progress: &CreateMviewProgress, version_stats: &HummockVersionStats, ) -> Option { + tracing::trace!(?progress, "update progress"); let actor = progress.backfill_actor_id; let Some(table_id) = self.actor_map.get(&actor).copied() else { // On restart, backfill will ALWAYS notify CreateMviewProgressTracker, diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs index 25fe1fd2ceff7..63cd4c16d9aaf 100644 --- a/src/meta/src/barrier/recovery.rs +++ b/src/meta/src/barrier/recovery.rs @@ -1121,6 +1121,14 @@ impl GlobalBarrierManagerContext { return Err(anyhow!("actors dropped during update").into()); } + { + for (node_id, actors) in &info.actor_map { + if !actors.is_empty() && !all_node_actors.contains_key(node_id) { + return Err(anyhow!("streaming job dropped during update").into()); + } + } + } + Ok(all_node_actors) } } diff --git a/src/meta/src/barrier/rpc.rs b/src/meta/src/barrier/rpc.rs index 7ad468b04aa4c..97b3636e8dba3 100644 --- a/src/meta/src/barrier/rpc.rs +++ b/src/meta/src/barrier/rpc.rs @@ -14,14 +14,13 @@ use std::collections::{HashMap, HashSet}; use std::error::Error; -use std::future::Future; use std::time::Duration; use anyhow::anyhow; use fail::fail_point; use futures::future::try_join_all; use futures::stream::{BoxStream, FuturesUnordered}; -use futures::{pin_mut, FutureExt, StreamExt}; +use futures::StreamExt; use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::hash::ActorId; @@ -34,11 +33,9 @@ use risingwave_pb::stream_service::build_actor_info::SubscriptionIds; use risingwave_pb::stream_service::streaming_control_stream_request::RemovePartialGraphRequest; use risingwave_pb::stream_service::{ streaming_control_stream_request, streaming_control_stream_response, BarrierCompleteResponse, - BuildActorInfo, DropActorsRequest, InjectBarrierRequest, StreamingControlStreamRequest, + BuildActorInfo, InjectBarrierRequest, StreamingControlStreamRequest, StreamingControlStreamResponse, }; -use risingwave_rpc_client::error::RpcError; -use risingwave_rpc_client::StreamClient; use rw_futures_util::pending_on_none; use 
thiserror_ext::AsReport; use tokio::sync::mpsc::UnboundedSender; @@ -50,7 +47,7 @@ use uuid::Uuid; use super::command::CommandContext; use super::{BarrierKind, GlobalBarrierManagerContext, TracedEpoch}; use crate::barrier::info::InflightGraphInfo; -use crate::manager::{MetaSrvEnv, WorkerId}; +use crate::manager::WorkerId; use crate::{MetaError, MetaResult}; const COLLECT_ERROR_TIMEOUT: Duration = Duration::from_secs(3); @@ -60,33 +57,47 @@ struct ControlStreamNode { sender: UnboundedSender, } -fn into_future( - worker_id: WorkerId, - stream: BoxStream< - 'static, - risingwave_rpc_client::error::Result, - >, -) -> ResponseStreamFuture { - stream.into_future().map(move |(opt, stream)| { - ( - worker_id, - stream, - opt.ok_or_else(|| anyhow!("end of stream").into()) - .and_then(|result| result.map_err(|e| e.into())), - ) - }) +mod response_stream_future { + use std::future::Future; + + use anyhow::anyhow; + use futures::stream::BoxStream; + use futures::{FutureExt, StreamExt}; + use risingwave_pb::stream_service::StreamingControlStreamResponse; + + use crate::manager::WorkerId; + use crate::MetaResult; + + pub(super) fn into_future( + worker_id: WorkerId, + stream: BoxStream< + 'static, + risingwave_rpc_client::error::Result, + >, + ) -> ResponseStreamFuture { + stream.into_future().map(move |(opt, stream)| { + ( + worker_id, + stream, + opt.ok_or_else(|| anyhow!("end of stream").into()) + .and_then(|result| result.map_err(|e| e.into())), + ) + }) + } + + pub(super) type ResponseStreamFuture = impl Future< + Output = ( + WorkerId, + BoxStream< + 'static, + risingwave_rpc_client::error::Result, + >, + MetaResult, + ), + > + 'static; } -type ResponseStreamFuture = impl Future< - Output = ( - WorkerId, - BoxStream< - 'static, - risingwave_rpc_client::error::Result, - >, - MetaResult, - ), - > + 'static; +use response_stream_future::*; pub(super) struct ControlStreamManager { context: GlobalBarrierManagerContext, @@ -263,39 +274,42 @@ impl ControlStreamManager { pre_applied_graph_info, applied_graph_info, actor_ids_to_pre_sync_mutation, - command_ctx.actors_to_create().map(|actors_to_create| { - actors_to_create - .into_iter() - .map(|(worker_id, actors)| { - ( - worker_id, - actors - .into_iter() - .map(|actor| BuildActorInfo { - actor: Some(actor), - // TODO: consider subscriber of backfilling mv - related_subscriptions: command_ctx - .subscription_info - .mv_depended_subscriptions - .iter() - .map(|(table_id, subscriptions)| { - ( - table_id.table_id, - SubscriptionIds { - subscription_ids: subscriptions - .keys() - .cloned() - .collect(), - }, - ) - }) - .collect(), - }) - .collect_vec(), - ) - }) - .collect() - }), + command_ctx + .command + .actors_to_create() + .map(|actors_to_create| { + actors_to_create + .into_iter() + .map(|(worker_id, actors)| { + ( + worker_id, + actors + .into_iter() + .map(|actor| BuildActorInfo { + actor: Some(actor), + // TODO: consider subscriber of backfilling mv + related_subscriptions: command_ctx + .subscription_info + .mv_depended_subscriptions + .iter() + .map(|(table_id, subscriptions)| { + ( + table_id.table_id, + SubscriptionIds { + subscription_ids: subscriptions + .keys() + .cloned() + .collect(), + }, + ) + }) + .collect(), + }) + .collect_vec(), + ) + }) + .collect() + }), ) } @@ -359,7 +373,7 @@ impl ControlStreamManager { self.nodes .iter_mut() - .map(|(node_id, node)| { + .try_for_each(|(node_id, node)| { let actor_ids_to_collect: Vec<_> = pre_applied_graph_info .actor_ids_to_collect(*node_id) .collect(); @@ -390,7 +404,7 @@ impl 
ControlStreamManager { request: Some( streaming_control_stream_request::Request::InjectBarrier( InjectBarrierRequest { - request_id: StreamRpcManager::new_request_id(), + request_id: Uuid::new_v4().to_string(), barrier: Some(barrier), actor_ids_to_collect, table_ids_to_sync, @@ -426,7 +440,6 @@ impl ControlStreamManager { Result::<_, MetaError>::Ok(()) } }) - .try_collect() .inspect_err(|e| { // Record failure in event log. use risingwave_pb::meta::event_log; @@ -509,95 +522,6 @@ impl GlobalBarrierManagerContext { } } -#[derive(Clone)] -pub struct StreamRpcManager { - env: MetaSrvEnv, -} - -impl StreamRpcManager { - pub fn new(env: MetaSrvEnv) -> Self { - Self { env } - } - - async fn make_request> + 'static>( - &self, - request: impl Iterator, - f: impl Fn(StreamClient, REQ) -> Fut, - ) -> MetaResult> { - let pool = self.env.stream_client_pool(); - let f = &f; - let iters = request.map(|(node, input)| async move { - let client = pool.get(node).await.map_err(|e| (node.id, e))?; - f(client, input).await.map_err(|e| (node.id, e)) - }); - let result = try_join_all_with_error_timeout(iters, COLLECT_ERROR_TIMEOUT).await; - result.map_err(|results_err| merge_node_rpc_errors("merged RPC Error", results_err)) - } - - fn new_request_id() -> String { - Uuid::new_v4().to_string() - } - - pub async fn drop_actors( - &self, - node_map: &HashMap, - node_actors: impl Iterator)>, - ) -> MetaResult<()> { - self.make_request( - node_actors - .map(|(worker_id, actor_ids)| (node_map.get(&worker_id).unwrap(), actor_ids)), - |client, actor_ids| async move { - client - .drop_actors(DropActorsRequest { - request_id: Self::new_request_id(), - actor_ids, - }) - .await - }, - ) - .await?; - Ok(()) - } -} - -/// This function is similar to `try_join_all`, but it attempts to collect as many error as possible within `error_timeout`. 
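// A small illustration of the `map(..).try_collect()` -> `try_for_each(..)`
// simplification in `inject_barrier` above: when each item yields
// `Result<(), E>`, `try_for_each` short-circuits on the first error without
// collecting intermediate results. `inject` stands in for sending the barrier
// to one node.
fn inject(node_id: u32) -> Result<(), String> {
    if node_id == 42 {
        Err(format!("failed to inject barrier to node {node_id}"))
    } else {
        Ok(())
    }
}

fn main() {
    let nodes = [1u32, 2, 3];
    let res: Result<(), String> = nodes.iter().try_for_each(|n| inject(*n));
    assert!(res.is_ok());

    let nodes = [1u32, 42, 3];
    assert!(nodes.iter().try_for_each(|n| inject(*n)).is_err());
}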
-async fn try_join_all_with_error_timeout( - iters: I, - error_timeout: Duration, -) -> Result, Vec> -where - I: IntoIterator, - F: Future>, -{ - let stream = FuturesUnordered::from_iter(iters); - pin_mut!(stream); - let mut results_ok = vec![]; - let mut results_err = vec![]; - while let Some(result) = stream.next().await { - match result { - Ok(rsp) => { - results_ok.push(rsp); - } - Err(err) => { - results_err.push(err); - break; - } - } - } - if results_err.is_empty() { - return Ok(results_ok); - } - let _ = timeout(error_timeout, async { - while let Some(result) = stream.next().await { - if let Err(err) = result { - results_err.push(err); - } - } - }) - .await; - Err(results_err) -} - pub(super) fn merge_node_rpc_errors( message: &str, errors: impl IntoIterator, diff --git a/src/meta/src/controller/fragment.rs b/src/meta/src/controller/fragment.rs index 16228a06d0a9a..31575e72804f9 100644 --- a/src/meta/src/controller/fragment.rs +++ b/src/meta/src/controller/fragment.rs @@ -1411,7 +1411,7 @@ mod tests { use std::collections::{BTreeMap, HashMap}; use itertools::Itertools; - use risingwave_common::hash::ActorMapping; + use risingwave_common::hash::{ActorMapping, VirtualNode}; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_common::util::stream_graph_visitor::visit_stream_node; use risingwave_meta_model_v2::actor::ActorStatus; @@ -1497,8 +1497,11 @@ mod tests { }) .collect(); - let actor_bitmaps = - ActorMapping::new_uniform((0..actor_count).map(|i| i as _)).to_bitmaps(); + let actor_bitmaps = ActorMapping::new_uniform( + (0..actor_count).map(|i| i as _), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps(); let pb_actors = (0..actor_count) .map(|actor_id| { @@ -1610,8 +1613,11 @@ mod tests { }) .collect(); - let mut actor_bitmaps = - ActorMapping::new_uniform((0..actor_count).map(|i| i as _)).to_bitmaps(); + let mut actor_bitmaps = ActorMapping::new_uniform( + (0..actor_count).map(|i| i as _), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps(); let actors = (0..actor_count) .map(|actor_id| { diff --git a/src/meta/src/hummock/manager/checkpoint.rs b/src/meta/src/hummock/manager/checkpoint.rs index bc3701a6b9d82..f678014d440c8 100644 --- a/src/meta/src/hummock/manager/checkpoint.rs +++ b/src/meta/src/hummock/manager/checkpoint.rs @@ -156,8 +156,8 @@ impl HummockManager { .hummock_version_deltas .range((Excluded(old_checkpoint_id), Included(new_checkpoint_id))) { - for group_deltas in version_delta.group_deltas.values() { - let summary = summarize_group_deltas(group_deltas); + for (group_id, group_deltas) in &version_delta.group_deltas { + let summary = summarize_group_deltas(group_deltas, *group_id); object_sizes.extend( summary .insert_table_infos diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs index 08428e5472e23..e92e91c8503d0 100644 --- a/src/meta/src/hummock/manager/commit_epoch.rs +++ b/src/meta/src/hummock/manager/commit_epoch.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
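// The `try_join_all_with_error_timeout` helper removed above (together with
// `StreamRpcManager`) implemented a variant of `try_join_all` that, after the
// first failure, keeps draining further errors for a bounded time so that
// `merge_node_rpc_errors` can report them all at once. A minimal standalone
// sketch of that pattern, assuming only `futures` and `tokio` (hypothetical
// name, not the actual meta-node code):
use std::future::Future;
use std::time::Duration;

use futures::stream::{FuturesUnordered, StreamExt};
use tokio::time::timeout;

async fn try_join_all_collecting_errors<I, F, T, E>(
    futures: I,
    error_timeout: Duration,
) -> Result<Vec<T>, Vec<E>>
where
    I: IntoIterator<Item = F>,
    F: Future<Output = Result<T, E>>,
{
    let mut stream = FuturesUnordered::from_iter(futures);
    let mut oks = vec![];
    let mut errs = vec![];
    while let Some(result) = stream.next().await {
        match result {
            Ok(v) => oks.push(v),
            Err(e) => {
                // Stop collecting successes once anything has failed.
                errs.push(e);
                break;
            }
        }
    }
    if errs.is_empty() {
        return Ok(oks);
    }
    // Best effort: give the remaining futures a short window to surface their
    // errors too, so the caller can merge them into one report.
    let _ = timeout(error_timeout, async {
        while let Some(result) = stream.next().await {
            if let Err(e) = result {
                errs.push(e);
            }
        }
    })
    .await;
    Err(errs)
}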
-use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use risingwave_common::catalog::TableId; use risingwave_hummock_sdk::change_log::ChangeLogDelta; @@ -220,23 +220,8 @@ impl HummockManager { NewTableFragmentInfo::None => (HashMap::new(), None, None), }; - let mut group_members_table_ids: HashMap> = HashMap::new(); - { - // expand group_members_table_ids - for (table_id, group_id) in &table_compaction_group_mapping { - group_members_table_ids - .entry(*group_id) - .or_default() - .insert(*table_id); - } - } - let commit_sstables = self - .correct_commit_ssts( - sstables, - &table_compaction_group_mapping, - &group_members_table_ids, - ) + .correct_commit_ssts(sstables, &table_compaction_group_mapping) .await?; let modified_compaction_groups: Vec<_> = commit_sstables.keys().cloned().collect(); @@ -247,7 +232,7 @@ impl HummockManager { is_visible_table_committed_epoch, new_compaction_group, commit_sstables, - new_table_ids, + &new_table_ids, new_table_watermarks, change_log_delta, ); @@ -304,6 +289,9 @@ impl HummockManager { .values() .map(|g| (g.group_id, g.parent_group_id)) .collect(); + let time_travel_tables_to_commit = table_compaction_group_mapping + .iter() + .filter(|(table_id, _)| tables_to_commit.contains(table_id)); let mut txn = sql_store.conn.begin().await?; let version_snapshot_sst_ids = self .write_time_travel_metadata( @@ -312,6 +300,8 @@ impl HummockManager { time_travel_delta, &group_parents, &versioning.last_time_travel_snapshot_sst_ids, + time_travel_tables_to_commit, + committed_epoch, ) .await?; commit_multi_var_with_provided_txn!( @@ -389,7 +379,6 @@ impl HummockManager { &self, sstables: Vec, table_compaction_group_mapping: &HashMap, - group_members_table_ids: &HashMap>, ) -> Result>> { let mut new_sst_id_number = 0; let mut sst_to_cg_vec = Vec::with_capacity(sstables.len()); @@ -413,7 +402,7 @@ impl HummockManager { } } - new_sst_id_number += group_table_ids.len(); + new_sst_id_number += group_table_ids.len() * 2; // `split_sst` will split the SST into two parts and consumer 2 SST IDs sst_to_cg_vec.push((commit_sst, group_table_ids)); } @@ -424,17 +413,16 @@ impl HummockManager { let mut commit_sstables: BTreeMap> = BTreeMap::new(); for (mut sst, group_table_ids) in sst_to_cg_vec { - for (group_id, match_ids) in group_table_ids { - let group_members_table_ids = group_members_table_ids.get(&group_id).unwrap(); - if match_ids - .iter() - .all(|id| group_members_table_ids.contains(&TableId::new(*id))) - { + let len = group_table_ids.len(); + for (index, (group_id, match_ids)) in group_table_ids.into_iter().enumerate() { + if sst.sst_info.table_ids == match_ids { + // The SST contains all the tables in the group should be last key + assert!(index == len - 1); commit_sstables .entry(group_id) .or_default() - .push(sst.sst_info.clone()); - continue; + .push(sst.sst_info); + break; } let origin_sst_size = sst.sst_info.sst_size; diff --git a/src/meta/src/hummock/manager/compaction_group_manager.rs b/src/meta/src/hummock/manager/compaction/compaction_group_manager.rs similarity index 96% rename from src/meta/src/hummock/manager/compaction_group_manager.rs rename to src/meta/src/hummock/manager/compaction/compaction_group_manager.rs index c68fc4222f283..807ba6f3fd35f 100644 --- a/src/meta/src/hummock/manager/compaction_group_manager.rs +++ b/src/meta/src/hummock/manager/compaction/compaction_group_manager.rs @@ -54,7 +54,7 @@ use crate::model::{ type CompactionGroupTransaction<'a> = BTreeMapTransaction<'a, 
CompactionGroupId, CompactionGroup>; impl CompactionGroupManager { - pub(super) async fn new(env: &MetaSrvEnv) -> Result { + pub(crate) async fn new(env: &MetaSrvEnv) -> Result { let default_config = match env.opts.compaction_config.as_ref() { None => CompactionConfigBuilder::new().build(), Some(opt) => CompactionConfigBuilder::with_opt(opt).build(), @@ -62,7 +62,7 @@ impl CompactionGroupManager { Self::new_with_config(env, default_config).await } - pub(super) async fn new_with_config( + pub(crate) async fn new_with_config( env: &MetaSrvEnv, default_config: CompactionConfig, ) -> Result { @@ -231,12 +231,9 @@ impl HummockManager { let mut is_group_init = false; group_id = *new_compaction_group_id .get_or_try_init(|| async { - next_compaction_group_id(&self.env) - .await - .map(|new_group_id| { - is_group_init = true; - new_group_id - }) + next_compaction_group_id(&self.env).await.inspect(|_| { + is_group_init = true; + }) }) .await?; if is_group_init { @@ -428,24 +425,6 @@ impl HummockManager { results } - /// Splits a compaction group into two. The new one will contain `table_ids`. - /// Returns the newly created compaction group id. - pub async fn split_compaction_group( - &self, - parent_group_id: CompactionGroupId, - table_ids: &[StateTableId], - ) -> Result { - let result = self - .move_state_table_to_compaction_group( - parent_group_id, - table_ids, - self.env.opts.partition_vnode_count, - ) - .await?; - - Ok(result) - } - /// move some table to another compaction-group. Create a new compaction group if it does not /// exist. pub async fn move_state_table_to_compaction_group( @@ -651,7 +630,7 @@ impl HummockManager { infos } - pub(super) async fn initial_compaction_group_config_after_load( + pub(crate) async fn initial_compaction_group_config_after_load( &self, versioning_guard: &Versioning, compaction_group_manager: &mut CompactionGroupManager, @@ -675,7 +654,7 @@ impl HummockManager { /// 1. initialize default static compaction group. /// 2. register new table to new compaction group. /// 3. move existent table to new compaction group. -pub(super) struct CompactionGroupManager { +pub(crate) struct CompactionGroupManager { compaction_groups: BTreeMap, default_config: Arc, /// Tables that write limit is trigger for. @@ -709,7 +688,7 @@ impl CompactionGroupManager { } /// Tries to get compaction group config for `compaction_group_id`. - pub(super) fn try_get_compaction_group_config( + pub(crate) fn try_get_compaction_group_config( &self, compaction_group_id: CompactionGroupId, ) -> Option { @@ -717,7 +696,7 @@ impl CompactionGroupManager { } /// Tries to get compaction group config for `compaction_group_id`. - pub(super) fn default_compaction_config(&self) -> Arc { + pub(crate) fn default_compaction_config(&self) -> Arc { self.default_config.clone() } } @@ -814,7 +793,7 @@ impl<'a> CompactionGroupTransaction<'a> { } /// Tries to get compaction group config for `compaction_group_id`. - pub(super) fn try_get_compaction_group_config( + pub(crate) fn try_get_compaction_group_config( &self, compaction_group_id: CompactionGroupId, ) -> Option<&CompactionGroup> { @@ -822,7 +801,7 @@ impl<'a> CompactionGroupTransaction<'a> { } /// Removes stale group configs. 
- fn purge(&mut self, existing_groups: HashSet) { + pub fn purge(&mut self, existing_groups: HashSet) { let stale_group = self .tree_ref() .keys() @@ -837,7 +816,7 @@ impl<'a> CompactionGroupTransaction<'a> { } } - pub(super) fn update_compaction_config( + pub(crate) fn update_compaction_config( &mut self, compaction_group_ids: &[CompactionGroupId], config_to_update: &[MutableConfig], diff --git a/src/meta/src/hummock/manager/compaction/compaction_group_schedule.rs b/src/meta/src/hummock/manager/compaction/compaction_group_schedule.rs new file mode 100644 index 0000000000000..93103ca87abf5 --- /dev/null +++ b/src/meta/src/hummock/manager/compaction/compaction_group_schedule.rs @@ -0,0 +1,359 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; + +use itertools::Itertools; +use risingwave_common::catalog::TableId; +use risingwave_hummock_sdk::compact_task::ReportTask; +use risingwave_hummock_sdk::compaction_group::hummock_version_ext::TableGroupInfo; +use risingwave_hummock_sdk::compaction_group::{StateTableId, StaticCompactionGroupId}; +use risingwave_hummock_sdk::version::{GroupDelta, GroupDeltas}; +use risingwave_hummock_sdk::{can_concat, CompactionGroupId}; +use risingwave_pb::hummock::compact_task::TaskStatus; +use risingwave_pb::hummock::{PbGroupMerge, PbStateTableInfoDelta}; +use thiserror_ext::AsReport; + +use crate::hummock::error::{Error, Result}; +use crate::hummock::manager::transaction::HummockVersionTransaction; +use crate::hummock::manager::{commit_multi_var, HummockManager}; +use crate::hummock::metrics_utils::remove_compaction_group_in_sst_stat; + +impl HummockManager { + /// Splits a compaction group into two. The new one will contain `table_ids`. + /// Returns the newly created compaction group id. + pub async fn split_compaction_group( + &self, + parent_group_id: CompactionGroupId, + table_ids: &[StateTableId], + partition_vnode_count: u32, + ) -> Result { + let result = self + .move_state_table_to_compaction_group(parent_group_id, table_ids, partition_vnode_count) + .await?; + + Ok(result) + } + + pub async fn merge_compaction_group( + &self, + group_1: CompactionGroupId, + group_2: CompactionGroupId, + ) -> Result<()> { + let compaction_guard = self.compaction.write().await; + let mut versioning_guard = self.versioning.write().await; + let versioning = versioning_guard.deref_mut(); + // Validate parameters. 
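// `merge_compaction_group` below only accepts two groups whose sorted member
// table ids form disjoint, non-interleaving ranges, and it reorders them so
// that the group holding the smaller first table id becomes the surviving
// left group. A minimal sketch of that ordering/overlap check on plain sorted
// id vectors (hypothetical helper, not the actual manager code):
fn order_mergeable_groups(
    mut left: Vec<u32>,
    mut right: Vec<u32>,
) -> Result<(Vec<u32>, Vec<u32>), String> {
    assert!(!left.is_empty() && !right.is_empty());
    assert!(left.is_sorted() && right.is_sorted());
    // Make `left` the group with the smaller first member table id.
    if left.first().unwrap() > right.first().unwrap() {
        std::mem::swap(&mut left, &mut right);
    }
    // The merged group must keep member table ids sorted, so the two id
    // ranges may not interleave.
    if left.last().unwrap() >= right.first().unwrap() {
        return Err(format!("id ranges overlap: {:?} vs {:?}", left, right));
    }
    Ok((left, right))
}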
+ if !versioning.current_version.levels.contains_key(&group_1) { + return Err(Error::CompactionGroup(format!("invalid group {}", group_1))); + } + + if !versioning.current_version.levels.contains_key(&group_2) { + return Err(Error::CompactionGroup(format!("invalid group {}", group_2))); + } + + let state_table_info = versioning.current_version.state_table_info.clone(); + let mut member_table_ids_1 = state_table_info + .compaction_group_member_table_ids(group_1) + .iter() + .cloned() + .collect_vec(); + + let mut member_table_ids_2 = state_table_info + .compaction_group_member_table_ids(group_2) + .iter() + .cloned() + .collect_vec(); + + debug_assert!(!member_table_ids_1.is_empty()); + debug_assert!(!member_table_ids_2.is_empty()); + assert!(member_table_ids_1.is_sorted()); + assert!(member_table_ids_2.is_sorted()); + + // Make sure `member_table_ids_1` is smaller than `member_table_ids_2` + let (left_group_id, right_group_id) = + if member_table_ids_1.first().unwrap() < member_table_ids_2.first().unwrap() { + (group_1, group_2) + } else { + std::mem::swap(&mut member_table_ids_1, &mut member_table_ids_2); + (group_2, group_1) + }; + + // We can only merge two groups with non-overlapping member table ids + if member_table_ids_1.last().unwrap() >= member_table_ids_2.first().unwrap() { + return Err(Error::CompactionGroup(format!( + "invalid merge group_1 {} group_2 {}", + left_group_id, right_group_id + ))); + } + + let combined_member_table_ids = member_table_ids_1 + .iter() + .chain(member_table_ids_2.iter()) + .collect_vec(); + assert!(combined_member_table_ids.is_sorted()); + + // check duplicated sst_id + let mut sst_id_set = HashSet::new(); + for sst_id in versioning + .current_version + .get_sst_ids_by_group_id(left_group_id) + .chain( + versioning + .current_version + .get_sst_ids_by_group_id(right_group_id), + ) + { + if !sst_id_set.insert(sst_id) { + return Err(Error::CompactionGroup(format!( + "invalid merge group_1 {} group_2 {} duplicated sst_id {}", + left_group_id, right_group_id, sst_id + ))); + } + } + + // check branched sst on non-overlap level + { + let left_levels = versioning + .current_version + .get_compaction_group_levels(group_1); + + let right_levels = versioning + .current_version + .get_compaction_group_levels(group_2); + + // we can not check the l0 sub level, because the sub level id will be rewritten when merge + // This check will ensure that other non-overlapping level ssts can be concat and that the key_range is correct. + let max_level = std::cmp::max(left_levels.levels.len(), right_levels.levels.len()); + for level_idx in 1..=max_level { + let left_level = left_levels.get_level(level_idx); + let right_level = right_levels.get_level(level_idx); + if left_level.table_infos.is_empty() || right_level.table_infos.is_empty() { + continue; + } + + let left_last_sst = left_level.table_infos.last().unwrap().clone(); + let right_first_sst = right_level.table_infos.first().unwrap().clone(); + let left_sst_id = left_last_sst.sst_id; + let right_sst_id = right_first_sst.sst_id; + let left_obj_id = left_last_sst.object_id; + let right_obj_id = right_first_sst.object_id; + + // Since the sst key_range within a group is legal, we only need to check the ssts adjacent to the two groups. 
+ if !can_concat(&[left_last_sst, right_first_sst]) { + return Err(Error::CompactionGroup(format!( + "invalid merge group_1 {} group_2 {} level_idx {} left_last_sst_id {} right_first_sst_id {} left_obj_id {} right_obj_id {}", + left_group_id, right_group_id, level_idx, left_sst_id, right_sst_id, left_obj_id, right_obj_id + ))); + } + } + } + + let mut version = HummockVersionTransaction::new( + &mut versioning.current_version, + &mut versioning.hummock_version_deltas, + self.env.notification_manager(), + &self.metrics, + ); + let mut new_version_delta = version.new_delta(); + + let target_compaction_group_id = { + // merge right_group_id to left_group_id and remove right_group_id + new_version_delta.group_deltas.insert( + left_group_id, + GroupDeltas { + group_deltas: vec![GroupDelta::GroupMerge(PbGroupMerge { + left_group_id, + right_group_id, + })], + }, + ); + left_group_id + }; + + // TODO: remove compaciton group_id from state_table_info + // rewrite compaction_group_id for all tables + new_version_delta.with_latest_version(|version, new_version_delta| { + for table_id in combined_member_table_ids { + let table_id = TableId::new(table_id.table_id()); + let info = version + .state_table_info + .info() + .get(&table_id) + .expect("have check exist previously"); + assert!(new_version_delta + .state_table_info_delta + .insert( + table_id, + PbStateTableInfoDelta { + committed_epoch: info.committed_epoch, + safe_epoch: info.safe_epoch, + compaction_group_id: target_compaction_group_id, + } + ) + .is_none()); + } + }); + + { + let mut compaction_group_manager = self.compaction_group_manager.write().await; + let mut compaction_groups_txn = compaction_group_manager.start_compaction_groups_txn(); + + // for metrics reclaim + { + let right_group_max_level = new_version_delta + .latest_version() + .get_compaction_group_levels(right_group_id) + .levels + .len(); + + remove_compaction_group_in_sst_stat( + &self.metrics, + right_group_id, + right_group_max_level, + ); + } + + new_version_delta.pre_apply(); + + // remove right_group_id + compaction_groups_txn.remove(right_group_id); + commit_multi_var!(self.meta_store_ref(), version, compaction_groups_txn)?; + } + + // Instead of handling DeltaType::GroupConstruct for time travel, simply enforce a version snapshot. 
+ versioning.mark_next_time_travel_version_snapshot(); + + // cancel tasks + let mut canceled_tasks = vec![]; + // after merge, all tasks in right_group_id should be canceled + // otherwise, pending size calculation by level handler will make some mistake + for task_assignment in compaction_guard.compact_task_assignment.values() { + if let Some(task) = task_assignment.compact_task.as_ref() { + let need_cancel = task.compaction_group_id == right_group_id; + if need_cancel { + canceled_tasks.push(ReportTask { + task_id: task.task_id, + task_status: TaskStatus::ManualCanceled, + table_stats_change: HashMap::default(), + sorted_output_ssts: vec![], + }); + } + } + } + + drop(versioning_guard); + drop(compaction_guard); + self.report_compact_tasks(canceled_tasks).await?; + + Ok(()) + } + + pub async fn try_split_compaction_group( + &self, + table_write_throughput: &HashMap>, + checkpoint_secs: u64, + group: &TableGroupInfo, + created_tables: &HashSet, + ) { + // split high throughput table to dedicated compaction group + for (table_id, table_size) in &group.table_statistic { + self.try_move_table_to_dedicated_cg( + table_write_throughput, + table_id, + table_size, + !created_tables.contains(table_id), + checkpoint_secs, + group.group_id, + group.group_size, + ) + .await; + } + } + + pub async fn try_move_table_to_dedicated_cg( + &self, + table_write_throughput: &HashMap>, + table_id: &u32, + table_size: &u64, + is_creating_table: bool, + checkpoint_secs: u64, + parent_group_id: u64, + group_size: u64, + ) { + let default_group_id: CompactionGroupId = StaticCompactionGroupId::StateDefault.into(); + let mv_group_id: CompactionGroupId = StaticCompactionGroupId::MaterializedView.into(); + let partition_vnode_count = self.env.opts.partition_vnode_count; + let window_size = + self.env.opts.table_info_statistic_history_times / (checkpoint_secs as usize); + + let mut is_high_write_throughput = false; + let mut is_low_write_throughput = true; + if let Some(history) = table_write_throughput.get(table_id) { + if history.len() >= window_size { + is_high_write_throughput = history.iter().all(|throughput| { + *throughput / checkpoint_secs > self.env.opts.table_write_throughput_threshold + }); + is_low_write_throughput = history.iter().any(|throughput| { + *throughput / checkpoint_secs < self.env.opts.min_table_split_write_throughput + }); + } + } + + let state_table_size = *table_size; + + // 1. Avoid splitting a creating table + // 2. Avoid splitting a is_low_write_throughput creating table + // 3. Avoid splitting a non-high throughput medium-sized table + if is_creating_table + || (is_low_write_throughput) + || (state_table_size < self.env.opts.min_table_split_size && !is_high_write_throughput) + { + return; + } + + // do not split a large table and a small table because it would increase IOPS + // of small table. 
+ if parent_group_id != default_group_id && parent_group_id != mv_group_id { + let rest_group_size = group_size - state_table_size; + if rest_group_size < state_table_size + && rest_group_size < self.env.opts.min_table_split_size + { + return; + } + } + + let ret = self + .move_state_table_to_compaction_group( + parent_group_id, + &[*table_id], + partition_vnode_count, + ) + .await; + match ret { + Ok(new_group_id) => { + tracing::info!("move state table [{}] from group-{} to group-{} success table_vnode_partition_count {:?}", table_id, parent_group_id, new_group_id, partition_vnode_count); + } + Err(e) => { + tracing::info!( + error = %e.as_report(), + "failed to move state table [{}] from group-{}", + table_id, + parent_group_id, + ) + } + } + } +} diff --git a/src/meta/src/hummock/manager/compaction.rs b/src/meta/src/hummock/manager/compaction/mod.rs similarity index 95% rename from src/meta/src/hummock/manager/compaction.rs rename to src/meta/src/hummock/manager/compaction/mod.rs index 4696c07452018..8f2ecc33c60b0 100644 --- a/src/meta/src/hummock/manager/compaction.rs +++ b/src/meta/src/hummock/manager/compaction/mod.rs @@ -27,7 +27,7 @@ // limitations under the License. use std::cmp::min; -use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use std::time::{Instant, SystemTime}; @@ -43,7 +43,6 @@ use rand::thread_rng; use risingwave_common::util::epoch::Epoch; use risingwave_hummock_sdk::compact_task::{CompactTask, ReportTask}; use risingwave_hummock_sdk::compaction_group::hummock_version_ext::HummockLevelsExt; -use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_hummock_sdk::key_range::KeyRange; use risingwave_hummock_sdk::level::{InputLevel, Level, Levels}; use risingwave_hummock_sdk::sstable_info::SstableInfo; @@ -96,6 +95,9 @@ use crate::hummock::{commit_multi_var, start_measure_real_process_timer, Hummock use crate::manager::{MetadataManager, META_NODE_ID}; use crate::model::BTreeMapTransaction; +pub mod compaction_group_manager; +pub mod compaction_group_schedule; + const MAX_SKIP_TIMES: usize = 8; const MAX_REPORT_COUNT: usize = 16; @@ -1567,80 +1569,6 @@ impl HummockManager { .retain(|table_id, _| compact_task.existing_table_ids.contains(table_id)); } } - - pub async fn try_move_table_to_dedicated_cg( - &self, - table_write_throughput: &HashMap>, - table_id: &u32, - table_size: &u64, - is_creating_table: bool, - checkpoint_secs: u64, - parent_group_id: u64, - group_size: u64, - ) { - let default_group_id: CompactionGroupId = StaticCompactionGroupId::StateDefault.into(); - let mv_group_id: CompactionGroupId = StaticCompactionGroupId::MaterializedView.into(); - let partition_vnode_count = self.env.opts.partition_vnode_count; - let window_size = - self.env.opts.table_info_statistic_history_times / (checkpoint_secs as usize); - - let mut is_high_write_throughput = false; - let mut is_low_write_throughput = true; - if let Some(history) = table_write_throughput.get(table_id) { - if history.len() >= window_size { - is_high_write_throughput = history.iter().all(|throughput| { - *throughput / checkpoint_secs > self.env.opts.table_write_throughput_threshold - }); - is_low_write_throughput = history.iter().any(|throughput| { - *throughput / checkpoint_secs < self.env.opts.min_table_split_write_throughput - }); - } - } - - let state_table_size = *table_size; - - // 1. Avoid splitting a creating table - // 2. 
Avoid splitting a is_low_write_throughput creating table - // 3. Avoid splitting a non-high throughput medium-sized table - if is_creating_table - || (is_low_write_throughput) - || (state_table_size < self.env.opts.min_table_split_size && !is_high_write_throughput) - { - return; - } - - // do not split a large table and a small table because it would increase IOPS - // of small table. - if parent_group_id != default_group_id && parent_group_id != mv_group_id { - let rest_group_size = group_size - state_table_size; - if rest_group_size < state_table_size - && rest_group_size < self.env.opts.min_table_split_size - { - return; - } - } - - let ret = self - .move_state_table_to_compaction_group( - parent_group_id, - &[*table_id], - partition_vnode_count, - ) - .await; - match ret { - Ok(new_group_id) => { - tracing::info!("move state table [{}] from group-{} to group-{} success table_vnode_partition_count {:?}", table_id, parent_group_id, new_group_id, partition_vnode_count); - } - Err(e) => { - tracing::info!( - error = %e.as_report(), - "failed to move state table [{}] from group-{}", - table_id, - parent_group_id, - ) - } - } - } } #[cfg(any(test, feature = "test"))] diff --git a/src/meta/src/hummock/manager/mod.rs b/src/meta/src/hummock/manager/mod.rs index ded8d507dbffc..d43b1cc6f5421 100644 --- a/src/meta/src/hummock/manager/mod.rs +++ b/src/meta/src/hummock/manager/mod.rs @@ -50,7 +50,6 @@ use crate::manager::{MetaSrvEnv, MetaStoreImpl, MetadataManager}; use crate::model::{ClusterId, MetadataModel, MetadataModelError}; use crate::rpc::metrics::MetaMetrics; -mod compaction_group_manager; mod context; mod gc; mod tests; diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index 56b4836f585a1..d0183d84d23c5 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -17,7 +17,6 @@ use std::borrow::Borrow; use std::cmp::Ordering; use std::collections::HashMap; -use std::sync::Arc; use itertools::Itertools; use prometheus::Registry; @@ -1327,7 +1326,22 @@ async fn test_split_compaction_group_on_commit() { sst_size: 100, ..Default::default() }, - table_stats: Default::default(), + table_stats: HashMap::from([ + ( + 100, + TableStats { + total_compressed_size: 50, + ..Default::default() + }, + ), + ( + 101, + TableStats { + total_compressed_size: 50, + ..Default::default() + }, + ), + ]), }; hummock_manager .commit_epoch_for_test(30, vec![sst_1], HashMap::from([(10, context_id)])) @@ -1378,13 +1392,13 @@ async fn test_split_compaction_group_on_demand_basic() { assert_eq!(original_groups, vec![2, 3]); let err = hummock_manager - .split_compaction_group(100, &[0]) + .split_compaction_group(100, &[0], 0) .await .unwrap_err(); assert_eq!("compaction group error: invalid group 100", err.to_string()); let err = hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap_err(); assert_eq!( @@ -1446,7 +1460,7 @@ async fn test_split_compaction_group_on_demand_basic() { .unwrap(); let err = hummock_manager - .split_compaction_group(2, &[100, 101]) + .split_compaction_group(2, &[100, 101], 0) .await .unwrap_err(); assert_eq!( @@ -1462,7 +1476,7 @@ async fn test_split_compaction_group_on_demand_basic() { .unwrap(); hummock_manager - .split_compaction_group(2, &[100, 101]) + .split_compaction_group(2, &[100, 101], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -1530,7 +1544,7 @@ async fn test_split_compaction_group_on_demand_non_trivial() { 
.unwrap(); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); @@ -1658,7 +1672,7 @@ async fn test_split_compaction_group_trivial_expired() { .unwrap(); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); @@ -1830,7 +1844,7 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { ); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -1939,7 +1953,7 @@ async fn test_compaction_task_expiration_due_to_split_group() { let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 2); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); @@ -2023,7 +2037,7 @@ async fn test_move_tables_between_compaction_group() { ); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -2122,11 +2136,9 @@ async fn test_partition_level() { .level0_overlapping_sub_level_compact_level_count(3) .build(); let registry = Registry::new(); - let (_env, hummock_manager, _, worker_node) = + let (env, hummock_manager, _, worker_node) = setup_compute_env_with_metric(80, config.clone(), Some(MetaMetrics::for_test(®istry))) .await; - let config = Arc::new(config); - let context_id = worker_node.id; hummock_manager @@ -2161,7 +2173,7 @@ async fn test_partition_level() { .unwrap()); hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], env.opts.partition_vnode_count) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -2303,7 +2315,7 @@ async fn test_unregister_moved_table() { .unwrap(); let new_group_id = hummock_manager - .split_compaction_group(2, &[100]) + .split_compaction_group(2, &[100], 0) .await .unwrap(); assert_ne!(new_group_id, 2); diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index 61c1e820fab0c..0b6ef73e52605 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; use anyhow::anyhow; use itertools::Itertools; +use risingwave_common::catalog::TableId; use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_common::util::epoch::Epoch; use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; @@ -36,7 +37,7 @@ use risingwave_pb::hummock::{PbHummockVersion, PbHummockVersionDelta}; use sea_orm::sea_query::OnConflict; use sea_orm::ActiveValue::Set; use sea_orm::{ - ColumnTrait, DatabaseTransaction, EntityTrait, QueryFilter, QueryOrder, QuerySelect, + ColumnTrait, Condition, DatabaseTransaction, EntityTrait, QueryFilter, QueryOrder, QuerySelect, TransactionTrait, }; @@ -101,6 +102,7 @@ impl HummockManager { .lt(risingwave_meta_model_v2::Epoch::try_from(epoch_watermark).unwrap()), ) .order_by_desc(hummock_epoch_to_version::Column::Epoch) + .order_by_asc(hummock_epoch_to_version::Column::VersionId) .one(&txn) .await?; let Some(version_watermark) = version_watermark else { @@ -275,9 +277,19 @@ impl HummockManager { /// The version is retrieved from `hummock_epoch_to_version`, selecting the entry with the largest epoch that's lte 
`query_epoch`. /// /// The resulted version is complete, i.e. with correct `SstableInfo`. - pub async fn epoch_to_version(&self, query_epoch: HummockEpoch) -> Result { + pub async fn epoch_to_version( + &self, + query_epoch: HummockEpoch, + table_id: u32, + ) -> Result { let sql_store = self.sql_store().ok_or_else(require_sql_meta_store_err)?; let epoch_to_version = hummock_epoch_to_version::Entity::find() + .filter( + Condition::any() + .add(hummock_epoch_to_version::Column::TableId.eq(i64::from(table_id))) + // for backward compatibility + .add(hummock_epoch_to_version::Column::TableId.eq(0)), + ) .filter( hummock_epoch_to_version::Column::Epoch .lte(risingwave_meta_model_v2::Epoch::try_from(query_epoch).unwrap()), @@ -362,7 +374,19 @@ impl HummockManager { delta: HummockVersionDelta, group_parents: &HashMap, skip_sst_ids: &HashSet, + tables_to_commit: impl Iterator, + committed_epoch: u64, ) -> Result>> { + let select_groups = group_parents + .iter() + .filter_map(|(cg_id, _)| { + if should_ignore_group(find_root_group(*cg_id, group_parents)) { + None + } else { + Some(*cg_id) + } + }) + .collect::>(); async fn write_sstable_infos( sst_infos: impl Iterator, txn: &DatabaseTransaction, @@ -388,35 +412,23 @@ impl HummockManager { Ok(count) } - let epoch = delta.visible_table_committed_epoch(); - let version_id: u64 = delta.id.to_u64(); - let m = hummock_epoch_to_version::ActiveModel { - epoch: Set(epoch.try_into().unwrap()), - version_id: Set(version_id.try_into().unwrap()), - }; - hummock_epoch_to_version::Entity::insert(m) - .on_conflict( - OnConflict::column(hummock_epoch_to_version::Column::Epoch) - // The existing row must be inserted by the common committed epoch of created MVs. - // While any duplicate row must be inserted by MVs still in creation. - // So the row shouldn't be updated. - .do_nothing() - .to_owned(), - ) - .do_nothing() - .exec(txn) - .await?; + for (table_id, cg_id) in tables_to_commit { + if !select_groups.contains(cg_id) { + continue; + } + let version_id: u64 = delta.id.to_u64(); + let m = hummock_epoch_to_version::ActiveModel { + epoch: Set(committed_epoch.try_into().unwrap()), + table_id: Set(table_id.table_id.into()), + version_id: Set(version_id.try_into().unwrap()), + }; + // There should be no conflict rows. 
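// `epoch_to_version` above now resolves a (query_epoch, table_id) pair to the
// newest recorded version at or before that epoch, and also accepts rows
// written with table_id = 0 by older clusters for backward compatibility. The
// same lookup over an in-memory row set, as a minimal sketch (hypothetical
// data shape; ties on epoch are broken toward the smaller version id here):
fn lookup_version(
    rows: &[(u64, u32, u64)], // (epoch, table_id, version_id)
    query_epoch: u64,
    table_id: u32,
) -> Option<u64> {
    rows.iter()
        .filter(|(epoch, tid, _)| *epoch <= query_epoch && (*tid == table_id || *tid == 0))
        .max_by_key(|(epoch, _, version_id)| (*epoch, std::cmp::Reverse(*version_id)))
        .map(|(_, _, version_id)| *version_id)
}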
+ hummock_epoch_to_version::Entity::insert(m) + .exec(txn) + .await?; + } + let mut version_sst_ids = None; - let select_groups = group_parents - .iter() - .filter_map(|(cg_id, _)| { - if should_ignore_group(find_root_group(*cg_id, group_parents)) { - None - } else { - Some(*cg_id) - } - }) - .collect::>(); if let Some(version) = version { version_sst_ids = Some( version diff --git a/src/meta/src/hummock/manager/timer_task.rs b/src/meta/src/hummock/manager/timer_task.rs index ec0f77ac88a8a..94537e9c33e1f 100644 --- a/src/meta/src/hummock/manager/timer_task.rs +++ b/src/meta/src/hummock/manager/timer_task.rs @@ -43,7 +43,7 @@ impl HummockManager { const COMPACTION_HEARTBEAT_PERIOD_SEC: u64 = 1; pub enum HummockTimerEvent { - GroupSplit, + GroupSchedule, CheckDeadTask, Report, CompactionHeartBeatExpiredCheck, @@ -158,7 +158,7 @@ impl HummockManager { .set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); let split_group_trigger = IntervalStream::new(split_group_trigger_interval) - .map(|_| HummockTimerEvent::GroupSplit); + .map(|_| HummockTimerEvent::GroupSchedule); triggers.push(Box::pin(split_group_trigger)); } @@ -189,12 +189,12 @@ impl HummockManager { hummock_manager.check_dead_task().await; } - HummockTimerEvent::GroupSplit => { + HummockTimerEvent::GroupSchedule => { if hummock_manager.env.opts.compaction_deterministic_test { continue; } - hummock_manager.on_handle_check_split_multi_group().await; + hummock_manager.on_handle_schedule_group().await; } HummockTimerEvent::Report => { @@ -443,7 +443,7 @@ impl HummockManager { /// throughput keep larger than `table_write_throughput_threshold` for a long time. /// * For state-table whose throughput less than `min_table_split_write_throughput`, do not /// increase it size of base-level. - async fn on_handle_check_split_multi_group(&self) { + async fn on_handle_schedule_group(&self) { let params = self.env.system_params_reader().await; let barrier_interval_ms = params.barrier_interval_ms() as u64; let checkpoint_secs = std::cmp::max( @@ -469,18 +469,13 @@ impl HummockManager { continue; } - for (table_id, table_size) in &group.table_statistic { - self.try_move_table_to_dedicated_cg( - &table_write_throughput, - table_id, - table_size, - !created_tables.contains(table_id), - checkpoint_secs, - group.group_id, - group.group_size, - ) - .await; - } + self.try_split_compaction_group( + &table_write_throughput, + checkpoint_secs, + group, + &created_tables, + ) + .await; } } diff --git a/src/meta/src/hummock/manager/transaction.rs b/src/meta/src/hummock/manager/transaction.rs index aa0ead3cef2aa..9a795608f7e1a 100644 --- a/src/meta/src/hummock/manager/transaction.rs +++ b/src/meta/src/hummock/manager/transaction.rs @@ -122,7 +122,7 @@ impl<'a> HummockVersionTransaction<'a> { is_visible_table_committed_epoch: bool, new_compaction_group: Option<(CompactionGroupId, CompactionConfig)>, commit_sstables: BTreeMap>, - new_table_ids: HashMap, + new_table_ids: &HashMap, new_table_watermarks: HashMap, change_log_delta: HashMap, ) -> HummockVersionDelta { @@ -175,7 +175,7 @@ impl<'a> HummockVersionTransaction<'a> { // update state table info new_version_delta.with_latest_version(|version, delta| { - for (table_id, cg_id) in &new_table_ids { + for (table_id, cg_id) in new_table_ids { assert!( !version.state_table_info.info().contains_key(table_id), "newly added table exists previously: {:?}", diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index 1cdd8547c8247..499d9df0958c4 100644 
--- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -348,7 +348,11 @@ impl HummockMetaClient for MockHummockMetaClient { )) } - async fn get_version_by_epoch(&self, _epoch: HummockEpoch) -> Result { + async fn get_version_by_epoch( + &self, + _epoch: HummockEpoch, + _table_id: u32, + ) -> Result { unimplemented!() } } diff --git a/src/meta/src/lib.rs b/src/meta/src/lib.rs index 61e29b2fb1129..eab9dd1287ebf 100644 --- a/src/meta/src/lib.rs +++ b/src/meta/src/lib.rs @@ -15,7 +15,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] #![feature(type_alias_impl_trait)] -#![feature(lint_reasons)] #![feature(map_try_insert)] #![feature(extract_if)] #![feature(hash_extract_if)] diff --git a/src/meta/src/manager/catalog/mod.rs b/src/meta/src/manager/catalog/mod.rs index 81e4f1c4d96c3..4db6711862810 100644 --- a/src/meta/src/manager/catalog/mod.rs +++ b/src/meta/src/manager/catalog/mod.rs @@ -625,20 +625,21 @@ impl CatalogManager { .notify_frontend(Operation::Delete, Info::Database(database)) .await; - let catalog_deleted_ids = tables_to_drop + let streaming_job_deleted_ids = tables_to_drop .into_iter() .filter(|table| valid_table_name(&table.name)) .map(|table| StreamingJobId::new(table.id)) + .chain(sources_to_drop.iter().filter_map(|source| { + source + .info + .as_ref() + .and_then(|info| info.is_shared().then(|| StreamingJobId::new(source.id))) + })) .chain( sinks_to_drop .into_iter() .map(|sink| StreamingJobId::new(sink.id)), ) - .chain( - subscriptions_to_drop - .into_iter() - .map(|subscription| StreamingJobId::new(subscription.id)), - ) .collect_vec(); let source_deleted_ids = sources_to_drop .into_iter() @@ -647,7 +648,7 @@ impl CatalogManager { Ok(( version, - catalog_deleted_ids, + streaming_job_deleted_ids, source_deleted_ids, connections_dropped, )) @@ -1810,15 +1811,11 @@ impl CatalogManager { all_table_ids.extend(index_table_ids.iter().cloned()); for index_table_id in &index_table_ids { - let internal_table_ids = match fragment_manager + let internal_table_ids = fragment_manager .select_table_fragments_by_table_id(&(index_table_id.into())) .await .map(|fragments| fragments.internal_table_ids()) - { - Ok(v) => v, - // Handle backwards compat with no state persistence. - Err(_) => vec![], - }; + .unwrap_or_default(); // 1 should be used by table scan. if internal_table_ids.len() == 1 { @@ -1900,15 +1897,11 @@ impl CatalogManager { } all_table_ids.insert(index.index_table_id); - let internal_table_ids = match fragment_manager + let internal_table_ids = fragment_manager .select_table_fragments_by_table_id(&(index.index_table_id.into())) .await .map(|fragments| fragments.internal_table_ids()) - { - Ok(v) => v, - // Handle backwards compat with no state persistence. - Err(_) => vec![], - }; + .unwrap_or_default(); // 1 should be used by table scan. 
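// The catalog hunk above rebuilds the list of streaming jobs to clean up by
// chaining iterators over the dropped objects: user tables with valid names,
// shared sources, and sinks. A minimal sketch of that chaining shape on
// simplified catalog items (hypothetical types and name filter; the real code
// maps each id into a `StreamingJobId`):
struct DroppedCatalog {
    table_ids: Vec<(u32, String)>, // (id, name)
    shared_source_ids: Vec<u32>,
    sink_ids: Vec<u32>,
}

fn deleted_streaming_job_ids(dropped: DroppedCatalog) -> Vec<u32> {
    dropped
        .table_ids
        .into_iter()
        .filter(|(_, name)| !name.starts_with("__")) // stand-in for `valid_table_name`
        .map(|(id, _)| id)
        .chain(dropped.shared_source_ids)
        .chain(dropped.sink_ids)
        .collect()
}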
if internal_table_ids.len() == 1 { diff --git a/src/meta/src/manager/catalog/user.rs b/src/meta/src/manager/catalog/user.rs index 81181b0fc1e17..68e5e31395c0d 100644 --- a/src/meta/src/manager/catalog/user.rs +++ b/src/meta/src/manager/catalog/user.rs @@ -74,6 +74,7 @@ impl UserManager { .values() .map(|connection| connection.owner), ) + .chain(database.secrets.values().map(|secret| secret.owner)) .for_each(|owner_id| user_manager.increase_ref(owner_id)); Ok(user_manager) diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs index 22f88bd9c0a75..ed18be6b0f483 100644 --- a/src/meta/src/manager/env.rs +++ b/src/meta/src/manager/env.rs @@ -294,6 +294,10 @@ pub struct MetaOpts { pub temp_secret_file_dir: String, pub table_info_statistic_history_times: usize, + + // Cluster limits + pub actor_cnt_per_worker_parallelism_hard_limit: usize, + pub actor_cnt_per_worker_parallelism_soft_limit: usize, } impl MetaOpts { @@ -358,6 +362,8 @@ impl MetaOpts { secret_store_private_key: Some("0123456789abcdef".as_bytes().to_vec()), temp_secret_file_dir: "./secrets".to_string(), table_info_statistic_history_times: 240, + actor_cnt_per_worker_parallelism_hard_limit: usize::MAX, + actor_cnt_per_worker_parallelism_soft_limit: usize::MAX, } } } @@ -408,9 +414,11 @@ impl MetaSrvEnv { (ClusterId::new(), true) }; - // For new clusters, the name of the object store needs to be prefixed according to the object id. - // For old clusters, the prefix is ​​not divided for the sake of compatibility. - + // For new clusters: + // - the name of the object store needs to be prefixed according to the object id. + // + // For old clusters + // - the prefix is ​​not divided for the sake of compatibility. init_system_params.use_new_object_prefix_strategy = Some(cluster_first_launch); let system_params_manager = Arc::new( SystemParamsManager::new( @@ -455,7 +463,7 @@ impl MetaSrvEnv { } } MetaStoreImpl::Sql(sql_meta_store) => { - let is_sql_backend_cluster_first_launch = + let cluster_first_launch = is_first_launch_for_sql_backend_cluster(sql_meta_store).await?; // Try to upgrade if any new model changes are added. Migrator::up(&sql_meta_store.conn, None) @@ -469,10 +477,14 @@ impl MetaSrvEnv { .await? .map(|c| c.cluster_id.to_string().into()) .unwrap(); - init_system_params.use_new_object_prefix_strategy = - Some(is_sql_backend_cluster_first_launch); - // For new clusters, the name of the object store needs to be prefixed according to the object id. - // For old clusters, the prefix is ​​not divided for the sake of compatibility. + + // For new clusters: + // - the name of the object store needs to be prefixed according to the object id. + // + // For old clusters + // - the prefix is ​​not divided for the sake of compatibility. 
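// The new `actor_cnt_per_worker_parallelism_{soft,hard}_limit` options above
// default to `usize::MAX`, i.e. the limits are disabled unless configured.
// Their enforcement is not part of this hunk; the sketch below only
// illustrates the usual soft/hard split (warn at the soft limit, reject at the
// hard limit) with hypothetical names and logic:
enum ActorLimitCheck {
    Ok,
    SoftExceeded { actors_per_parallelism: usize },
    HardExceeded { actors_per_parallelism: usize },
}

fn check_actor_count(
    actor_cnt: usize,
    worker_parallelism: usize,
    soft_limit: usize,
    hard_limit: usize,
) -> ActorLimitCheck {
    let actors_per_parallelism = actor_cnt / worker_parallelism.max(1);
    if actors_per_parallelism > hard_limit {
        ActorLimitCheck::HardExceeded { actors_per_parallelism }
    } else if actors_per_parallelism > soft_limit {
        ActorLimitCheck::SoftExceeded { actors_per_parallelism }
    } else {
        ActorLimitCheck::Ok
    }
}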
+ init_system_params.use_new_object_prefix_strategy = Some(cluster_first_launch); + let system_param_controller = Arc::new( SystemParamsController::new( sql_meta_store.clone(), diff --git a/src/meta/src/manager/metadata.rs b/src/meta/src/manager/metadata.rs index 52fc811787d30..935d4773865ed 100644 --- a/src/meta/src/manager/metadata.rs +++ b/src/meta/src/manager/metadata.rs @@ -917,6 +917,7 @@ impl MetadataManager { &self, job: &StreamingJob, ) -> MetaResult { + tracing::debug!("wait_streaming_job_finished: {job:?}"); match self { MetadataManager::V1(mgr) => mgr.wait_streaming_job_finished(job).await, MetadataManager::V2(mgr) => mgr.wait_streaming_job_finished(job.id() as _).await, diff --git a/src/meta/src/manager/sink_coordination/coordinator_worker.rs b/src/meta/src/manager/sink_coordination/coordinator_worker.rs index 8409e714852c2..8ed063e5325c0 100644 --- a/src/meta/src/manager/sink_coordination/coordinator_worker.rs +++ b/src/meta/src/manager/sink_coordination/coordinator_worker.rs @@ -12,64 +12,191 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::future::{poll_fn, Future}; use std::pin::pin; +use std::task::Poll; +use std::time::{Duration, Instant}; use anyhow::anyhow; use futures::future::{select, Either}; -use futures::stream::FuturesUnordered; -use futures::{StreamExt, TryStreamExt}; +use futures::pin_mut; +use itertools::Itertools; use risingwave_common::bitmap::Bitmap; -use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; +use risingwave_common::hash::VirtualNode; use risingwave_connector::dispatch_sink; use risingwave_connector::sink::{build_sink, Sink, SinkCommitCoordinator, SinkParam}; -use risingwave_pb::connector_service::coordinate_request::CommitRequest; -use risingwave_pb::connector_service::coordinate_response::{ - CommitResponse, StartCoordinationResponse, -}; -use risingwave_pb::connector_service::{ - coordinate_request, coordinate_response, CoordinateRequest, CoordinateResponse, SinkMetadata, -}; +use risingwave_pb::connector_service::SinkMetadata; use thiserror_ext::AsReport; +use tokio::select; use tokio::sync::mpsc::UnboundedReceiver; +use tokio::time::sleep; use tonic::Status; use tracing::{error, warn}; -use crate::manager::sink_coordination::{ - NewSinkWriterRequest, SinkCoordinatorResponseSender, SinkWriterRequestStream, -}; +use crate::manager::sink_coordination::handle::SinkWriterCoordinationHandle; -macro_rules! 
send_await_with_err_check { - ($tx:expr, $msg:expr) => { - if $tx.send($msg).await.is_err() { - error!("unable to send msg"); +async fn run_future_with_periodic_fn( + future: F, + interval: Duration, + mut f: impl FnMut(), +) -> F::Output { + pin_mut!(future); + loop { + match select(&mut future, pin!(sleep(interval))).await { + Either::Left((output, _)) => { + break output; + } + Either::Right(_) => f(), } - }; + } } -pub struct CoordinatorWorker { +struct EpochCommitRequests { + epoch: u64, + metadatas: Vec, + handle_ids: HashSet, + bitmap: Bitmap, +} + +impl EpochCommitRequests { + fn new(epoch: u64) -> Self { + Self { + epoch, + metadatas: vec![], + handle_ids: Default::default(), + bitmap: Bitmap::zeros(VirtualNode::COUNT), + } + } + + fn add_new_request( + &mut self, + handle_id: usize, + metadata: SinkMetadata, + vnode_bitmap: Bitmap, + ) -> anyhow::Result<()> { + self.metadatas.push(metadata); + assert!(self.handle_ids.insert(handle_id)); + let check_bitmap = (&self.bitmap) & &vnode_bitmap; + if check_bitmap.count_ones() > 0 { + return Err(anyhow!( + "duplicate vnode {:?} on epoch {}. request vnode: {:?}, prev vnode: {:?}", + check_bitmap.iter_ones().collect_vec(), + self.epoch, + vnode_bitmap, + self.bitmap + )); + } + self.bitmap |= &vnode_bitmap; + Ok(()) + } + + fn can_commit(&self) -> bool { + self.bitmap.count_ones() == VirtualNode::COUNT + } +} + +struct CoordinationHandleManager { param: SinkParam, - request_streams: Vec, - response_senders: Vec, - request_rx: UnboundedReceiver, + writer_handles: HashMap, + next_handle_id: usize, + request_rx: UnboundedReceiver, +} + +impl CoordinationHandleManager { + fn ack_commit( + &mut self, + epoch: u64, + handle_ids: impl IntoIterator, + ) -> anyhow::Result<()> { + for handle_id in handle_ids { + let handle = self.writer_handles.get_mut(&handle_id).ok_or_else(|| { + anyhow!( + "fail to find handle for {} when ack commit on epoch {}", + handle_id, + epoch + ) + })?; + handle.ack_commit(epoch).map_err(|_| { + anyhow!( + "fail to ack commit on epoch {} for handle {}", + epoch, + handle_id + ) + })?; + } + Ok(()) + } + + async fn next_commit_request_inner( + writer_handles: &mut HashMap, + ) -> anyhow::Result<(usize, Bitmap, u64, SinkMetadata)> { + poll_fn(|cx| 'outer: loop { + for (handle_id, handle) in writer_handles.iter_mut() { + if let Poll::Ready(result) = handle.poll_next_commit_request(cx) { + match result { + Ok(Some((epoch, metadata))) => { + return Poll::Ready(Ok(( + *handle_id, + handle.vnode_bitmap().clone(), + epoch, + metadata, + ))); + } + Ok(None) => { + let handle_id = *handle_id; + writer_handles.remove(&handle_id); + continue 'outer; + } + Err(e) => { + return Poll::Ready(Err(e)); + } + } + } + } + return Poll::Pending; + }) + .await + } + + async fn next_commit_request(&mut self) -> anyhow::Result<(usize, Bitmap, u64, SinkMetadata)> { + loop { + select! 
{ + handle = self.request_rx.recv() => { + let mut handle = handle.ok_or_else(|| anyhow!("end of writer request stream"))?; + if handle.param() != &self.param { + warn!(prev_param = ?self.param, new_param = ?handle.param(), "sink param mismatch"); + } + handle.start()?; + let handle_id = self.next_handle_id; + self.next_handle_id += 1; + self.writer_handles.insert(handle_id, handle); + } + result = Self::next_commit_request_inner(&mut self.writer_handles) => { + break result; + } + } + } + } +} + +pub struct CoordinatorWorker { + handle_manager: CoordinationHandleManager, + pending_epochs: BTreeMap, } impl CoordinatorWorker { pub async fn run( - first_writer_request: NewSinkWriterRequest, - request_rx: UnboundedReceiver, + param: SinkParam, + request_rx: UnboundedReceiver, ) { - let sink = match build_sink(first_writer_request.param.clone()) { + let sink = match build_sink(param.clone()) { Ok(sink) => sink, Err(e) => { error!( error = %e.as_report(), "unable to build sink with param {:?}", - first_writer_request.param - ); - send_await_with_err_check!( - first_writer_request.response_tx, - Err(Status::invalid_argument("failed to build sink")) + param ); return; } @@ -81,247 +208,77 @@ impl CoordinatorWorker { error!( error = %e.as_report(), "unable to build coordinator with param {:?}", - first_writer_request.param - ); - send_await_with_err_check!( - first_writer_request.response_tx, - Err(Status::invalid_argument("failed to build coordinator")) + param ); return; } }; - Self::execute_coordinator(first_writer_request, request_rx, coordinator).await + Self::execute_coordinator(param, request_rx, coordinator).await }); } pub async fn execute_coordinator( - first_writer_request: NewSinkWriterRequest, - request_rx: UnboundedReceiver, + param: SinkParam, + request_rx: UnboundedReceiver, coordinator: impl SinkCommitCoordinator, ) { let mut worker = CoordinatorWorker { - param: first_writer_request.param, - request_streams: vec![first_writer_request.request_stream], - response_senders: vec![first_writer_request.response_tx], - request_rx, + handle_manager: CoordinationHandleManager { + param, + writer_handles: HashMap::new(), + next_handle_id: 0, + request_rx, + }, + pending_epochs: Default::default(), }; - if let Err(e) = worker - .wait_for_writers(first_writer_request.vnode_bitmap) - .await - { - error!(error = %e.as_report(), "failed to wait for all writers"); - worker - .send_to_all_sink_writers(|| { - Err(Status::cancelled("failed to wait for all writers")) - }) - .await; - } - - worker.start_coordination(coordinator).await; - } - - async fn send_to_all_sink_writers( - &mut self, - new_msg: impl Fn() -> Result, - ) { - for sender in &self.response_senders { - send_await_with_err_check!(sender, new_msg()); - } - } - - async fn next_new_writer(&mut self) -> anyhow::Result { - // TODO: add timeout log - match select( - pin!(self.request_rx.recv()), - pin!(FuturesUnordered::from_iter( - self.request_streams - .iter_mut() - .map(|stream| stream.try_next()), - ) - .next()), - ) - .await - { - Either::Left((Some(req), _)) => Ok(req), - Either::Left((None, _)) => Err(anyhow!("manager request stream reaches the end")), - Either::Right((Some(Ok(Some(request))), _)) => Err(anyhow!( - "get new request from sink writer before initialize: {:?}", - request - )), - Either::Right((Some(Ok(None)), _)) => Err(anyhow!( - "one sink writer stream reaches the end before initialize" - )), - Either::Right((Some(Err(e)), _)) => { - Err(anyhow!(e).context("unable to poll one sink writer stream")) + if let Err(e) = 
worker.run_coordination(coordinator).await { + for handle in worker.handle_manager.writer_handles.into_values() { + handle.abort(Status::internal(format!( + "failed to run coordination: {:?}", + e.as_report() + ))) } - Either::Right((None, _)) => unreachable!("request_streams must not be empty"), } } - async fn wait_for_writers(&mut self, first_vnode_bitmap: Bitmap) -> anyhow::Result<()> { - let mut remaining_count = VirtualNode::COUNT; - let mut registered_vnode = HashSet::with_capacity(VirtualNode::COUNT); - - for vnode in first_vnode_bitmap.iter_vnodes() { - remaining_count -= 1; - registered_vnode.insert(vnode); - } - - while remaining_count > 0 { - let new_writer_request = self.next_new_writer().await?; - if self.param != new_writer_request.param { - // TODO: may return error. - warn!( - "get different param {:?} while current param {:?}", - new_writer_request.param, self.param - ); - } - self.request_streams.push(new_writer_request.request_stream); - self.response_senders.push(new_writer_request.response_tx); - - for vnode in new_writer_request.vnode_bitmap.iter_vnodes() { - if registered_vnode.contains(&vnode) { - return Err(anyhow!( - "get overlapped vnode: {}, current vnode {:?}", - vnode, - registered_vnode - )); - } - registered_vnode.insert(vnode); - remaining_count -= 1; - } - } - - self.send_to_all_sink_writers(|| { - Ok(CoordinateResponse { - msg: Some(coordinate_response::Msg::StartResponse( - StartCoordinationResponse {}, - )), - }) - }) - .await; - Ok(()) - } - - async fn collect_all_metadata(&mut self) -> anyhow::Result<(u64, Vec)> { - let mut epoch = None; - let mut metadata_list = Vec::with_capacity(self.request_streams.len()); - let mut uncollected_futures = FuturesUnordered::from_iter( - self.request_streams - .iter_mut() - .map(|stream| stream.try_next()), - ); - + async fn run_coordination( + &mut self, + mut coordinator: impl SinkCommitCoordinator, + ) -> anyhow::Result<()> { + coordinator.init().await?; loop { - match select( - pin!(self.request_rx.recv()), - pin!(uncollected_futures.next()), - ) - .await + let (handle_id, vnode_bitmap, epoch, metadata) = + self.handle_manager.next_commit_request().await?; + self.pending_epochs + .entry(epoch) + .or_insert_with(|| EpochCommitRequests::new(epoch)) + .add_new_request(handle_id, metadata, vnode_bitmap)?; + if self + .pending_epochs + .first_key_value() + .expect("non-empty") + .1 + .can_commit() { - Either::Left((Some(new_request), _)) => { - warn!("get new writer request while collecting metadata"); - send_await_with_err_check!( - new_request.response_tx, - Err(Status::already_exists( - "coordinator already running, should not get new request" - )) - ); - continue; - } - Either::Left((None, _)) => { - return Err(anyhow!( - "coordinator get notified to stop while collecting metadata" - )); - } - Either::Right((Some(next_result), _)) => match next_result { - Ok(Some(CoordinateRequest { - msg: - Some(coordinate_request::Msg::CommitRequest(CommitRequest { - epoch: request_epoch, - metadata: Some(metadata), - })), - })) => { - match &epoch { - Some(epoch) => { - if *epoch != request_epoch { - warn!( - "current epoch is {} but get request from {}", - epoch, request_epoch - ); - } - } - None => { - epoch = Some(request_epoch); - } - } - metadata_list.push(metadata); - } - Ok(Some(req)) => { - return Err(anyhow!("expect commit request but get {:?}", req)); - } - Ok(None) => { - return Err(anyhow!( - "sink writer input reaches the end while collecting metadata" - )); - } - Err(e) => { - return Err( - 
anyhow!(e).context("failed to poll one of the writer request streams") - ); - } - }, - Either::Right((None, _)) => { - break; - } - } - } - Ok(( - epoch.expect("should not be empty when have at least one writer"), - metadata_list, - )) - } - - async fn start_coordination(&mut self, mut coordinator: impl SinkCommitCoordinator) { - let result: Result<(), String> = try { - coordinator.init().await.map_err(|e| { - error!(error = %e.as_report(), "failed to initialize coordinator"); - format!("failed to initialize coordinator: {:?}", e.as_report()) - })?; - loop { - let (epoch, metadata_list) = self.collect_all_metadata().await.map_err(|e| { - error!(error = %e.as_report(), "failed to collect all metadata"); - format!("failed to collect all metadata: {:?}", e.as_report()) - })?; + let (epoch, requests) = self.pending_epochs.pop_first().expect("non-empty"); // TODO: measure commit time - coordinator - .commit(epoch, metadata_list) - .await - .map_err(|e| { - error!(epoch, error = %e.as_report(), "failed to commit metadata of epoch"); - format!("failed to commit: {:?}", e.as_report()) - })?; - - self.send_to_all_sink_writers(|| { - Ok(CoordinateResponse { - msg: Some(coordinate_response::Msg::CommitResponse(CommitResponse { - epoch, - })), - }) - }) - .await; + let start_time = Instant::now(); + run_future_with_periodic_fn( + coordinator.commit(epoch, requests.metadatas), + Duration::from_secs(5), + || { + warn!( + elapsed = ?start_time.elapsed(), + sink_id = self.handle_manager.param.sink_id.sink_id, + "committing" + ); + }, + ) + .await + .map_err(|e| anyhow!(e))?; + self.handle_manager.ack_commit(epoch, requests.handle_ids)?; } - }; - - if let Err(err_str) = result { - self.send_to_all_sink_writers(|| { - Err(Status::aborted(format!( - "failed to run coordination: {}", - err_str - ))) - }) - .await; } } } diff --git a/src/meta/src/manager/sink_coordination/handle.rs b/src/meta/src/manager/sink_coordination/handle.rs new file mode 100644 index 0000000000000..60b49cfd623ab --- /dev/null +++ b/src/meta/src/manager/sink_coordination/handle.rs @@ -0,0 +1,139 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
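// The `run_future_with_periodic_fn` helper added in the coordinator worker
// above drives a future to completion while invoking a callback on every
// `interval` tick (used to log "committing" while a slow sink commit is in
// flight). A minimal standalone sketch of the same pattern, assuming only
// `futures` and `tokio` (hypothetical name):
use std::future::Future;
use std::pin::pin;
use std::time::Duration;

use futures::future::{select, Either};
use futures::pin_mut;
use tokio::time::sleep;

async fn run_with_heartbeat<F: Future>(
    future: F,
    interval: Duration,
    mut on_tick: impl FnMut(),
) -> F::Output {
    pin_mut!(future);
    loop {
        match select(&mut future, pin!(sleep(interval))).await {
            Either::Left((output, _)) => break output,
            // The timer fired first: run the callback and keep polling the future.
            Either::Right(_) => on_tick(),
        }
    }
}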
+ +use std::pin::pin; +use std::task::{Context, Poll}; + +use anyhow::anyhow; +use futures::{Future, TryStreamExt}; +use risingwave_common::bitmap::Bitmap; +use risingwave_connector::sink::SinkParam; +use risingwave_pb::connector_service::coordinate_response::{ + CommitResponse, StartCoordinationResponse, +}; +use risingwave_pb::connector_service::{ + coordinate_request, coordinate_response, CoordinateResponse, SinkMetadata, +}; +use tonic::Status; + +use crate::manager::sink_coordination::{SinkCoordinatorResponseSender, SinkWriterRequestStream}; + +pub(super) struct SinkWriterCoordinationHandle { + request_stream: SinkWriterRequestStream, + response_tx: SinkCoordinatorResponseSender, + param: SinkParam, + vnode_bitmap: Bitmap, + prev_epoch: Option, +} + +impl SinkWriterCoordinationHandle { + pub(super) fn new( + request_stream: SinkWriterRequestStream, + response_tx: SinkCoordinatorResponseSender, + param: SinkParam, + vnode_bitmap: Bitmap, + ) -> Self { + Self { + request_stream, + response_tx, + param, + vnode_bitmap, + prev_epoch: None, + } + } + + pub(super) fn param(&self) -> &SinkParam { + &self.param + } + + pub(super) fn vnode_bitmap(&self) -> &Bitmap { + &self.vnode_bitmap + } + + pub(super) fn start(&mut self) -> anyhow::Result<()> { + self.response_tx + .send(Ok(CoordinateResponse { + msg: Some(coordinate_response::Msg::StartResponse( + StartCoordinationResponse {}, + )), + })) + .map_err(|_| anyhow!("fail to send start response")) + } + + pub(super) fn abort(self, status: Status) { + let _ = self.response_tx.send(Err(status)); + } + + pub(super) fn ack_commit(&mut self, epoch: u64) -> anyhow::Result<()> { + self.response_tx + .send(Ok(CoordinateResponse { + msg: Some(coordinate_response::Msg::CommitResponse(CommitResponse { + epoch, + })), + })) + .map_err(|_| anyhow!("fail to send commit response of epoch {}", epoch)) + } + + pub(super) fn poll_next_commit_request( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + let future = self.next_commit_request(); + let future = pin!(future); + future.poll(cx) + } + + async fn next_commit_request(&mut self) -> anyhow::Result> { + loop { + let request = self + .request_stream + .try_next() + .await? + .ok_or_else(|| anyhow!("end of request stream"))?; + match request.msg.ok_or_else(|| anyhow!("None msg in request"))? { + coordinate_request::Msg::StartRequest(_) => { + return Err(anyhow!("should have started")); + } + coordinate_request::Msg::CommitRequest(request) => { + if let Some(prev_epoch) = self.prev_epoch { + if request.epoch < prev_epoch { + return Err(anyhow!( + "invalid commit epoch {}, prev_epoch {}", + request.epoch, + prev_epoch + )); + } + } + let Some(metadata) = request.metadata else { + return Err(anyhow!("empty commit metadata")); + }; + self.prev_epoch = Some(request.epoch); + return Ok(Some((request.epoch, metadata))); + } + coordinate_request::Msg::UpdateVnodeRequest(request) => { + let bitmap = Bitmap::from( + &request + .vnode_bitmap + .ok_or_else(|| anyhow!("empty vnode bitmap"))?, + ); + self.vnode_bitmap = bitmap; + continue; + } + coordinate_request::Msg::Stop(_) => { + return Ok(None); + } + } + } + } +} diff --git a/src/meta/src/manager/sink_coordination/manager.rs b/src/meta/src/manager/sink_coordination/manager.rs index fd2b986be28e7..2fe2e8bfb3b8c 100644 --- a/src/meta/src/manager/sink_coordination/manager.rs +++ b/src/meta/src/manager/sink_coordination/manager.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::hash_map::Entry; use std::collections::HashMap; use std::pin::pin; @@ -30,12 +29,13 @@ use tokio::sync::mpsc; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::oneshot::{channel, Receiver, Sender}; use tokio::task::{JoinError, JoinHandle}; -use tokio_stream::wrappers::ReceiverStream; +use tokio_stream::wrappers::UnboundedReceiverStream; use tonic::Status; use tracing::{debug, error, info, warn}; use crate::manager::sink_coordination::coordinator_worker::CoordinatorWorker; -use crate::manager::sink_coordination::{NewSinkWriterRequest, SinkWriterRequestStream}; +use crate::manager::sink_coordination::handle::SinkWriterCoordinationHandle; +use crate::manager::sink_coordination::SinkWriterRequestStream; macro_rules! send_with_err_check { ($tx:expr, $msg:expr) => { @@ -56,7 +56,7 @@ macro_rules! send_await_with_err_check { const BOUNDED_CHANNEL_SIZE: usize = 16; enum ManagerRequest { - NewSinkWriter(NewSinkWriterRequest), + NewSinkWriter(SinkWriterCoordinationHandle), StopCoordinator { finish_notifier: Sender<()>, /// sink id to stop. When `None`, stop all sink coordinator @@ -71,11 +71,8 @@ pub struct SinkCoordinatorManager { impl SinkCoordinatorManager { pub fn start_worker() -> (Self, (JoinHandle<()>, Sender<()>)) { - Self::start_worker_with_spawn_worker(|writer_request, manager_request_stream| { - tokio::spawn(CoordinatorWorker::run( - writer_request, - manager_request_stream, - )) + Self::start_worker_with_spawn_worker(|param, manager_request_stream| { + tokio::spawn(CoordinatorWorker::run(param, manager_request_stream)) }) } @@ -111,14 +108,11 @@ impl SinkCoordinatorManager { ))); } }; - let (response_tx, response_rx) = mpsc::channel(BOUNDED_CHANNEL_SIZE); + let (response_tx, response_rx) = mpsc::unbounded_channel(); self.request_tx - .send(ManagerRequest::NewSinkWriter(NewSinkWriterRequest { - request_stream, - response_tx, - param, - vnode_bitmap, - })) + .send(ManagerRequest::NewSinkWriter( + SinkWriterCoordinationHandle::new(request_stream, response_tx, param, vnode_bitmap), + )) .await .map_err(|_| { Status::unavailable( @@ -126,7 +120,7 @@ impl SinkCoordinatorManager { ) })?; - Ok(ReceiverStream::new(response_rx)) + Ok(UnboundedReceiverStream::new(response_rx)) } async fn stop_coordinator(&self, sink_id: Option) { @@ -155,7 +149,7 @@ impl SinkCoordinatorManager { struct CoordinatorWorkerHandle { /// Sender to coordinator worker. 
Drop the sender as a stop signal - request_sender: Option>, + request_sender: Option>, /// Notify when the coordinator worker stops finish_notifiers: Vec>, } @@ -163,7 +157,7 @@ struct CoordinatorWorkerHandle { struct ManagerWorker { request_rx: mpsc::Receiver, // Make it option so that it can be polled with &mut SinkManagerWorker - shutdown_rx: Option>, + shutdown_rx: Receiver<()>, running_coordinator_worker_join_handles: FuturesUnordered)>>, @@ -178,7 +172,7 @@ enum ManagerEvent { }, } -trait SpawnCoordinatorFn = FnMut(NewSinkWriterRequest, UnboundedReceiver) -> JoinHandle<()> +trait SpawnCoordinatorFn = FnMut(SinkParam, UnboundedReceiver) -> JoinHandle<()> + Send + 'static; @@ -186,7 +180,7 @@ impl ManagerWorker { fn new(request_rx: mpsc::Receiver, shutdown_rx: Receiver<()>) -> Self { ManagerWorker { request_rx, - shutdown_rx: Some(shutdown_rx), + shutdown_rx, running_coordinator_worker_join_handles: Default::default(), running_coordinator_worker: Default::default(), } @@ -237,7 +231,6 @@ impl ManagerWorker { } async fn next_event(&mut self) -> Option { - let shutdown_rx = self.shutdown_rx.take().expect("should not be empty"); match select( select( pin!(self.request_rx.recv()), @@ -245,23 +238,20 @@ impl ManagerWorker { self.running_coordinator_worker_join_handles.next() )), ), - shutdown_rx, + &mut self.shutdown_rx, ) .await { - Either::Left((either, shutdown_rx)) => { - self.shutdown_rx = Some(shutdown_rx); - match either { - Either::Left((Some(request), _)) => Some(ManagerEvent::NewRequest(request)), - Either::Left((None, _)) => None, - Either::Right(((sink_id, join_result), _)) => { - Some(ManagerEvent::CoordinatorWorkerFinished { - sink_id, - join_result, - }) - } + Either::Left((either, _)) => match either { + Either::Left((Some(request), _)) => Some(ManagerEvent::NewRequest(request)), + Either::Left((None, _)) => None, + Either::Right(((sink_id, join_result), _)) => { + Some(ManagerEvent::CoordinatorWorkerFinished { + sink_id, + join_result, + }) } - } + }, Either::Right(_) => None, } } @@ -309,39 +299,39 @@ impl ManagerWorker { fn handle_new_sink_writer( &mut self, - request: NewSinkWriterRequest, + new_writer: SinkWriterCoordinationHandle, spawn_coordinator_worker: &mut impl SpawnCoordinatorFn, ) { - let param = &request.param; + let param = new_writer.param(); let sink_id = param.sink_id; - // Launch the coordinator worker task if it is the first - match self.running_coordinator_worker.entry(param.sink_id) { - Entry::Occupied(mut entry) => { - if let Some(sender) = entry.get_mut().request_sender.as_mut() { - send_with_err_check!(sender, request); - } else { - warn!( - "handle a new request while the sink coordinator is being stopped: {:?}", - param - ); - drop(request.response_tx); - } - } - Entry::Vacant(entry) => { + let handle = self + .running_coordinator_worker + .entry(param.sink_id) + .or_insert_with(|| { + // Launch the coordinator worker task if it is the first let (request_tx, request_rx) = unbounded_channel(); - let join_handle = spawn_coordinator_worker(request, request_rx); + let join_handle = spawn_coordinator_worker(param.clone(), request_rx); self.running_coordinator_worker_join_handles.push( join_handle .map(move |join_result| (sink_id, join_result)) .boxed(), ); - entry.insert(CoordinatorWorkerHandle { + CoordinatorWorkerHandle { request_sender: Some(request_tx), finish_notifiers: Vec::new(), - }); - } - }; + } + }); + + if let Some(sender) = handle.request_sender.as_mut() { + send_with_err_check!(sender, new_writer); + } else { + warn!( + "handle a new 
request while the sink coordinator is being stopped: {:?}", + param + ); + new_writer.abort(Status::internal("the sink is being stopped")); + } } } @@ -357,7 +347,7 @@ mod tests { use futures::{FutureExt, StreamExt}; use itertools::Itertools; use rand::seq::SliceRandom; - use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; + use risingwave_common::bitmap::BitmapBuilder; use risingwave_common::hash::VirtualNode; use risingwave_connector::sink::catalog::{SinkId, SinkType}; use risingwave_connector::sink::{SinkCommitCoordinator, SinkError, SinkParam}; @@ -367,7 +357,7 @@ mod tests { use tokio_stream::wrappers::ReceiverStream; use crate::manager::sink_coordination::coordinator_worker::CoordinatorWorker; - use crate::manager::sink_coordination::{NewSinkWriterRequest, SinkCoordinatorManager}; + use crate::manager::sink_coordination::SinkCoordinatorManager; struct MockCoordinator, &mut C) -> Result<(), SinkError>> { context: C, @@ -434,16 +424,16 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); + let expected_param = param.clone(); let metadata = metadata.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + move |param, new_writer_rx| { let metadata = metadata.clone(); + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param.clone(), new_writer_rx, MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { *count += 1; @@ -497,14 +487,8 @@ mod tests { .unwrap() }; - let mut build_client_future1 = pin!(build_client(vnode1)); - assert!( - poll_fn(|cx| Poll::Ready(build_client_future1.as_mut().poll(cx))) - .await - .is_pending() - ); let (mut client1, mut client2) = - join(build_client_future1, pin!(build_client(vnode2))).await; + join(build_client(vnode1), pin!(build_client(vnode2))).await; { // commit epoch1 @@ -598,16 +582,16 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); + let expected_param = param.clone(); let metadata = metadata.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + move |param, new_writer_rx| { let metadata = metadata.clone(); + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param.clone(), new_writer_rx, MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { *count += 1; @@ -686,46 +670,6 @@ mod tests { .unwrap(); } - #[tokio::test] - async fn test_drop_sink_while_init() { - let sink_id = SinkId::from(1); - let param = SinkParam { - sink_id, - sink_name: "test".into(), - properties: Default::default(), - columns: vec![], - downstream_pk: vec![], - sink_type: SinkType::AppendOnly, - format_desc: None, - db_name: "test".into(), - sink_from_name: "test".into(), - }; - - let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker(); - - let mut build_client_future1 = pin!(CoordinatorStreamHandle::new_with_init_stream( - param.to_proto(), - Bitmap::zeros(VirtualNode::COUNT), - |rx| async { - Ok(tonic::Response::new( - manager - 
.handle_new_request(ReceiverStream::new(rx).map(Ok).boxed()) - .await - .unwrap() - .boxed(), - )) - }, - )); - assert!( - poll_fn(|cx| Poll::Ready(build_client_future1.as_mut().poll(cx))) - .await - .is_pending() - ); - manager.stop_sink_coordinator(sink_id).await; - - assert!(build_client_future1.await.is_err()); - } - #[tokio::test] async fn test_partial_commit() { let param = SinkParam { @@ -757,14 +701,14 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + let expected_param = param.clone(); + move |param, new_writer_rx| { + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param, new_writer_rx, MockCoordinator::new((), |_, _, _| unreachable!()), ) @@ -836,14 +780,14 @@ mod tests { let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker_with_spawn_worker({ - let param = param.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx| { - let param = param.clone(); + let expected_param = param.clone(); + move |param, new_writer_rx| { + let expected_param = expected_param.clone(); tokio::spawn(async move { // validate the start request - assert_eq!(first_request.param, param); + assert_eq!(param, expected_param); CoordinatorWorker::execute_coordinator( - first_request, + param, new_writer_rx, MockCoordinator::new((), |_, _, _| { Err(SinkError::Coordinator(anyhow!("failed to commit"))) @@ -897,4 +841,269 @@ mod tests { assert!(result1.is_err()); assert!(result2.is_err()); } + + #[tokio::test] + async fn test_update_vnode_bitmap() { + let param = SinkParam { + sink_id: SinkId::from(1), + sink_name: "test".into(), + properties: Default::default(), + columns: vec![], + downstream_pk: vec![], + sink_type: SinkType::AppendOnly, + format_desc: None, + db_name: "test".into(), + sink_from_name: "test".into(), + }; + + let epoch1 = 233; + let epoch2 = 234; + let epoch3 = 235; + let epoch4 = 236; + + let mut all_vnode = (0..VirtualNode::COUNT).collect_vec(); + all_vnode.shuffle(&mut rand::thread_rng()); + let (first, second) = all_vnode.split_at(VirtualNode::COUNT / 2); + let build_bitmap = |indexes: &[usize]| { + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); + for i in indexes { + builder.set(*i, true); + } + builder.finish() + }; + let vnode1 = build_bitmap(first); + let vnode2 = build_bitmap(second); + + let metadata = [ + [vec![1u8, 2u8], vec![3u8, 4u8]], + [vec![5u8, 6u8], vec![7u8, 8u8]], + ]; + + let metadata_scale_out = [vec![9u8, 10u8], vec![11u8, 12u8], vec![13u8, 14u8]]; + let metadata_scale_in = [vec![13u8, 14u8], vec![15u8, 16u8]]; + + let (manager, (_join_handle, _stop_tx)) = + SinkCoordinatorManager::start_worker_with_spawn_worker({ + let expected_param = param.clone(); + let metadata = metadata.clone(); + let metadata_scale_out = metadata_scale_out.clone(); + let metadata_scale_in = metadata_scale_in.clone(); + move |param, new_writer_rx| { + let metadata = metadata.clone(); + let metadata_scale_out = metadata_scale_out.clone(); + let metadata_scale_in = metadata_scale_in.clone(); + let expected_param = expected_param.clone(); + tokio::spawn(async move { + // validate the start request + assert_eq!(param, expected_param); + CoordinatorWorker::execute_coordinator( + 
param.clone(), + new_writer_rx, + MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { + *count += 1; + let mut metadata_list = metadata_list + .into_iter() + .map(|metadata| match metadata { + SinkMetadata { + metadata: + Some(Metadata::Serialized(SerializedMetadata { + metadata, + })), + } => metadata, + _ => unreachable!(), + }) + .collect_vec(); + metadata_list.sort(); + let (expected_epoch, expected_metadata_list) = match *count { + 1 => (epoch1, metadata[0].as_slice()), + 2 => (epoch2, metadata[1].as_slice()), + 3 => (epoch3, metadata_scale_out.as_slice()), + 4 => (epoch4, metadata_scale_in.as_slice()), + _ => unreachable!(), + }; + assert_eq!(expected_epoch, epoch); + assert_eq!(expected_metadata_list, &metadata_list); + Ok(()) + }), + ) + .await; + }) + } + }); + + let build_client = |vnode| async { + CoordinatorStreamHandle::new_with_init_stream(param.to_proto(), vnode, |rx| async { + Ok(tonic::Response::new( + manager + .handle_new_request(ReceiverStream::new(rx).map(Ok).boxed()) + .await + .unwrap() + .boxed(), + )) + }) + .await + .unwrap() + }; + + let (mut client1, mut client2) = + join(build_client(vnode1), pin!(build_client(vnode2))).await; + + { + // commit epoch1 + let mut commit_future = pin!(client2 + .commit( + epoch1, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[0][1].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future.as_mut().poll(cx))) + .await + .is_pending()); + join( + commit_future, + client1 + .commit( + epoch1, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[0][0].clone(), + })), + }, + ) + .map(|result| result.unwrap()), + ) + .await; + } + + let (vnode1, vnode2, vnode3) = { + let (first, second) = all_vnode.split_at(VirtualNode::COUNT / 3); + let (second, third) = second.split_at(VirtualNode::COUNT / 3); + ( + build_bitmap(first), + build_bitmap(second), + build_bitmap(third), + ) + }; + + let mut client3 = build_client(vnode3).await; + { + let mut commit_future3 = pin!(client3 + .commit( + epoch3, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_out[2].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future3.as_mut().poll(cx))) + .await + .is_pending()); + + { + // commit epoch2 + let mut commit_future = pin!(client1 + .commit( + epoch2, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[1][0].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future.as_mut().poll(cx))) + .await + .is_pending()); + join( + commit_future, + client2 + .commit( + epoch2, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[1][1].clone(), + })), + }, + ) + .map(|result| result.unwrap()), + ) + .await; + } + + client1.update_vnode_bitmap(&vnode1).await.unwrap(); + client2.update_vnode_bitmap(&vnode2).await.unwrap(); + let mut commit_future1 = pin!(client1 + .commit( + epoch3, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_out[0].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future1.as_mut().poll(cx))) + .await + .is_pending()); + assert!(poll_fn(|cx| Poll::Ready(commit_future3.as_mut().poll(cx))) + .await + .is_pending()); + client2 + .commit( + epoch3, + 
SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_out[1].clone(), + })), + }, + ) + .map(|result| result.unwrap()) + .await; + } + + let (vnode2, vnode3) = { + let (first, second) = all_vnode.split_at(VirtualNode::COUNT / 3); + (build_bitmap(first), build_bitmap(second)) + }; + + // client1.stop().await.unwrap(); + client2.update_vnode_bitmap(&vnode2).await.unwrap(); + client3.update_vnode_bitmap(&vnode3).await.unwrap(); + + { + let mut commit_future = pin!(client2 + .commit( + epoch4, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_in[0].clone(), + })), + }, + ) + .map(|result| result.unwrap())); + assert!(poll_fn(|cx| Poll::Ready(commit_future.as_mut().poll(cx))) + .await + .is_pending()); + join( + commit_future, + client3 + .commit( + epoch4, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata_scale_in[1].clone(), + })), + }, + ) + .map(|result| result.unwrap()), + ) + .await; + } + } } diff --git a/src/meta/src/manager/sink_coordination/mod.rs b/src/meta/src/manager/sink_coordination/mod.rs index ab44965891d5f..2f5f4d6ba62b1 100644 --- a/src/meta/src/manager/sink_coordination/mod.rs +++ b/src/meta/src/manager/sink_coordination/mod.rs @@ -13,22 +13,14 @@ // limitations under the License. mod coordinator_worker; +mod handle; mod manager; use futures::stream::BoxStream; pub use manager::SinkCoordinatorManager; -use risingwave_common::bitmap::Bitmap; -use risingwave_connector::sink::SinkParam; use risingwave_pb::connector_service::{CoordinateRequest, CoordinateResponse}; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::UnboundedSender; use tonic::Status; pub type SinkWriterRequestStream = BoxStream<'static, Result>; -pub type SinkCoordinatorResponseSender = Sender>; - -pub struct NewSinkWriterRequest { - pub request_stream: SinkWriterRequestStream, - pub response_tx: SinkCoordinatorResponseSender, - pub param: SinkParam, - pub vnode_bitmap: Bitmap, -} +pub type SinkCoordinatorResponseSender = UnboundedSender>; diff --git a/src/meta/src/model/stream.rs b/src/meta/src/model/stream.rs index bec6b95cfb0f9..aaff076688785 100644 --- a/src/meta/src/model/stream.rs +++ b/src/meta/src/model/stream.rs @@ -106,7 +106,8 @@ pub struct TableFragments { /// The status of actors pub actor_status: BTreeMap, - /// The splits of actors + /// The splits of actors, + /// incl. both `Source` and `SourceBackfill` actors. 
pub actor_splits: HashMap>, /// The streaming context associated with this stream plan and its fragments @@ -362,7 +363,9 @@ impl TableFragments { return vec![]; } if (fragment.fragment_type_mask - & (FragmentTypeFlag::Values as u32 | FragmentTypeFlag::StreamScan as u32)) + & (FragmentTypeFlag::Values as u32 + | FragmentTypeFlag::StreamScan as u32 + | FragmentTypeFlag::SourceScan as u32)) != 0 { actor_ids.extend(fragment.actors.iter().map(|actor| actor.actor_id)); diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index feb7a959083bb..4c1988a37d44c 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -368,12 +368,14 @@ impl DdlController { } } + #[tracing::instrument(skip(self), level = "debug")] pub async fn alter_parallelism( &self, table_id: u32, parallelism: PbTableParallelism, mut deferred: bool, ) -> MetaResult<()> { + tracing::info!("alter parallelism"); if self.barrier_manager.check_status_running().is_err() { tracing::info!( "alter parallelism is set to deferred mode because the system is in recovery state" @@ -1612,6 +1614,7 @@ impl DdlController { let parallelism = self.resolve_stream_parallelism(specified_parallelism, &cluster_info)?; + // TODO(var-vnode): use vnode count from config const MAX_PARALLELISM: NonZeroUsize = NonZeroUsize::new(VirtualNode::COUNT).unwrap(); let parallelism_limited = parallelism > MAX_PARALLELISM; @@ -1643,7 +1646,7 @@ impl DdlController { // Otherwise, it defaults to FIXED based on deduction. let table_parallelism = match (specified_parallelism, &self.env.opts.default_parallelism) { (None, DefaultParallelism::Full) if parallelism_limited => { - tracing::warn!("Parallelism limited to 256 in ADAPTIVE mode"); + tracing::warn!("Parallelism limited to {MAX_PARALLELISM} in ADAPTIVE mode"); TableParallelism::Adaptive } (None, DefaultParallelism::Full) => TableParallelism::Adaptive, diff --git a/src/meta/src/serving/mod.rs b/src/meta/src/serving/mod.rs index 69e17a978212e..30f1466eae7f7 100644 --- a/src/meta/src/serving/mod.rs +++ b/src/meta/src/serving/mod.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::sync::Arc; use parking_lot::RwLock; -use risingwave_common::hash::WorkerSlotMapping; +use risingwave_common::hash::{VirtualNode, WorkerSlotMapping}; use risingwave_common::vnode_mapping::vnode_placement::place_vnode; use risingwave_pb::common::{WorkerNode, WorkerType}; use risingwave_pb::meta::subscribe_response::{Info, Operation}; @@ -57,7 +57,8 @@ impl ServingVnodeMapping { } else { None }; - place_vnode(old_mapping, workers, max_parallelism) + // TODO(var-vnode): use vnode count from config + place_vnode(old_mapping, workers, max_parallelism, VirtualNode::COUNT) }; match new_mapping { None => { @@ -192,7 +193,16 @@ pub async fn start_serving_vnode_mapping_worker( continue; } let (workers, streaming_parallelisms) = fetch_serving_infos(&metadata_manager).await; - let (upserted, failed) = serving_vnode_mapping.upsert(streaming_parallelisms, &workers); + let filtered_streaming_parallelisms = fragment_ids.iter().filter_map(|frag_id|{ + match streaming_parallelisms.get(frag_id) { + Some(parallelism) => Some((*frag_id, *parallelism)), + None => { + tracing::warn!(fragment_id = *frag_id, "streaming parallelism not found"); + None + } + } + }).collect(); + let (upserted, failed) = serving_vnode_mapping.upsert(filtered_streaming_parallelisms, &workers); if !upserted.is_empty() { tracing::debug!("Update serving vnode mapping for fragments {:?}.", upserted.keys()); 
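The filtering step above forwards only the parallelisms of fragments the serving worker actually tracks, warning on missing entries instead of failing the upsert. Extracted as a standalone helper for clarity; the type aliases here are assumptions for illustration, not the patch's definitions:

use std::collections::HashMap;

// Assumed aliases; the real code uses the meta crate's `FragmentId` and the
// parallelism value type returned by `fetch_serving_infos`.
type FragmentId = u32;
type Parallelism = usize;

fn filter_tracked_parallelisms(
    fragment_ids: &[FragmentId],
    streaming_parallelisms: &HashMap<FragmentId, Parallelism>,
) -> HashMap<FragmentId, Parallelism> {
    fragment_ids
        .iter()
        .filter_map(|frag_id| match streaming_parallelisms.get(frag_id) {
            Some(parallelism) => Some((*frag_id, *parallelism)),
            None => {
                // Missing entries are skipped rather than aborting the whole upsert.
                tracing::warn!(fragment_id = *frag_id, "streaming parallelism not found");
                None
            }
        })
        .collect()
}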
notification_manager.notify_frontend_without_version(Operation::Update, Info::ServingWorkerSlotMappings(FragmentWorkerSlotMappings{ mappings: to_fragment_worker_slot_mapping(&upserted) })); diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index d10fa83710d85..08a36ce3f7275 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -31,7 +31,7 @@ use risingwave_common::catalog::TableId; use risingwave_common::hash::{ActorMapping, VirtualNode}; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_meta_model_v2::{actor, fragment, ObjectId, StreamingParallelism}; -use risingwave_pb::common::{Buffer, PbActorLocation, WorkerNode, WorkerType}; +use risingwave_pb::common::{PbActorLocation, WorkerNode, WorkerType}; use risingwave_pb::meta::subscribe_response::{Info, Operation}; use risingwave_pb::meta::table_fragments::actor_status::ActorState; use risingwave_pb::meta::table_fragments::fragment::{ @@ -49,7 +49,7 @@ use tokio::sync::{oneshot, RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::task::JoinHandle; use tokio::time::{Instant, MissedTickBehavior}; -use crate::barrier::{Command, Reschedule, StreamRpcManager}; +use crate::barrier::{Command, Reschedule}; use crate::controller::scale::RescheduleWorkingSet; use crate::manager::{ IdCategory, IdGenManagerImpl, LocalNotification, MetaSrvEnv, MetadataManager, @@ -126,7 +126,8 @@ pub struct CustomActorInfo { pub fragment_id: u32, pub dispatcher: Vec, pub upstream_actor_id: Vec, - pub vnode_bitmap: Option, + /// `None` if singleton. + pub vnode_bitmap: Option, } impl From<&PbStreamActor> for CustomActorInfo { @@ -145,7 +146,7 @@ impl From<&PbStreamActor> for CustomActorInfo { fragment_id: *fragment_id, dispatcher: dispatcher.clone(), upstream_actor_id: upstream_actor_id.clone(), - vnode_bitmap: vnode_bitmap.clone(), + vnode_bitmap: vnode_bitmap.as_ref().map(Bitmap::from), } } } @@ -183,17 +184,26 @@ impl CustomFragmentInfo { } } +use educe::Educe; + +// The debug implementation is arbitrary. Just used in debug logs. +#[derive(Educe)] +#[educe(Debug)] pub struct RescheduleContext { /// Meta information for all Actors + #[educe(Debug(ignore))] actor_map: HashMap, /// Status of all Actors, used to find the location of the `Actor` actor_status: BTreeMap, /// Meta information of all `Fragment`, used to find the `Fragment`'s `Actor` + #[educe(Debug(ignore))] fragment_map: HashMap, /// Index of all `Actor` upstreams, specific to `Dispatcher` upstream_dispatchers: HashMap>, - /// Fragments with stream source + /// Fragments with `StreamSource` stream_source_fragment_ids: HashSet, + /// Fragments with `StreamSourceBackfill` + stream_source_backfill_fragment_ids: HashSet, /// Target fragments in `NoShuffle` relation no_shuffle_target_fragment_ids: HashSet, /// Source fragments in `NoShuffle` relation @@ -252,6 +262,13 @@ pub fn rebalance_actor_vnode( let target_actor_count = actors.len() - actors_to_remove.len() + actors_to_create.len(); assert!(target_actor_count > 0); + // `vnode_bitmap` must be set on distributed fragments. 
+ let vnode_count = actors[0] + .vnode_bitmap + .as_ref() + .expect("vnode bitmap unset") + .len(); + // represents the balance of each actor, used to sort later #[derive(Debug)] struct Balance { @@ -259,7 +276,7 @@ pub fn rebalance_actor_vnode( balance: i32, builder: BitmapBuilder, } - let (expected, mut remain) = VirtualNode::COUNT.div_rem(&target_actor_count); + let (expected, mut remain) = vnode_count.div_rem(&target_actor_count); tracing::debug!( "expected {}, remain {}, prev actors {}, target actors {}", @@ -271,11 +288,11 @@ pub fn rebalance_actor_vnode( let (mut removed, mut rest): (Vec<_>, Vec<_>) = actors .iter() - .filter_map(|actor| { - actor - .vnode_bitmap - .as_ref() - .map(|buffer| (actor.actor_id as ActorId, Bitmap::from(buffer))) + .map(|actor| { + ( + actor.actor_id as ActorId, + actor.vnode_bitmap.clone().expect("vnode bitmap unset"), + ) }) .partition(|(actor_id, _)| actors_to_remove.contains(actor_id)); @@ -294,7 +311,7 @@ pub fn rebalance_actor_vnode( builder }; - let (prev_expected, _) = VirtualNode::COUNT.div_rem(&actors.len()); + let (prev_expected, _) = vnode_count.div_rem(&actors.len()); let prev_remain = removed .iter() @@ -327,7 +344,7 @@ pub fn rebalance_actor_vnode( .map(|actor_id| Balance { actor_id: *actor_id, balance: -(expected as i32), - builder: BitmapBuilder::zeroed(VirtualNode::COUNT), + builder: BitmapBuilder::zeroed(vnode_count), }) .collect_vec(); @@ -389,7 +406,7 @@ pub fn rebalance_actor_vnode( let n = min(abs(src.balance), abs(dst.balance)); let mut moved = 0; - for idx in (0..VirtualNode::COUNT).rev() { + for idx in (0..vnode_count).rev() { if moved >= n { break; } @@ -437,10 +454,10 @@ pub struct ScaleController { pub source_manager: SourceManagerRef, - pub stream_rpc_manager: StreamRpcManager, - pub env: MetaSrvEnv, + /// We will acquire lock during DDL to prevent scaling operations on jobs that are in the creating state. + /// e.g., a MV cannot be rescheduled during foreground backfill. pub reschedule_lock: RwLock<()>, } @@ -448,11 +465,9 @@ impl ScaleController { pub fn new( metadata_manager: &MetadataManager, source_manager: SourceManagerRef, - stream_rpc_manager: StreamRpcManager, env: MetaSrvEnv, ) -> Self { Self { - stream_rpc_manager, metadata_manager: metadata_manager.clone(), source_manager, env, @@ -605,7 +620,7 @@ impl ScaleController { .flatten() .map(|id| *id as _) .collect(), - vnode_bitmap: vnode_bitmap.map(|bitmap| bitmap.to_protobuf()), + vnode_bitmap: vnode_bitmap.map(|b| Bitmap::from(&b.to_protobuf())), }; actor_map.insert(actor_id as _, actor_info.clone()); @@ -657,7 +672,7 @@ impl ScaleController { fragment_id: fragment_id as _, dispatcher, upstream_actor_id, - vnode_bitmap, + vnode_bitmap: vnode_bitmap.map(|b| b.to_protobuf()), // todo, we need to fill this part mview_definition: "".to_string(), expr_context: expr_contexts @@ -770,6 +785,7 @@ impl ScaleController { } let mut stream_source_fragment_ids = HashSet::new(); + let mut stream_source_backfill_fragment_ids = HashSet::new(); let mut no_shuffle_reschedule = HashMap::new(); for (fragment_id, WorkerReschedule { worker_actor_diff }) in &*reschedule { let fragment = fragment_map @@ -798,6 +814,7 @@ impl ScaleController { // correspondence, so we need to clone the reschedule plan to the downstream of all // cascading relations. if no_shuffle_source_fragment_ids.contains(fragment_id) { + // This fragment is a NoShuffle's upstream. 
let mut queue: VecDeque<_> = fragment_dispatcher_map .get(fragment_id) .unwrap() @@ -887,6 +904,17 @@ impl ScaleController { "reschedule plan rewritten with NoShuffle reschedule {:?}", no_shuffle_reschedule ); + + for noshuffle_downstream in no_shuffle_reschedule.keys() { + let fragment = fragment_map.get(noshuffle_downstream).unwrap(); + // SourceScan is always a NoShuffle downstream, rescheduled together with the upstream Source. + if (fragment.get_fragment_type_mask() & FragmentTypeFlag::SourceScan as u32) != 0 { + let stream_node = fragment.actor_template.nodes.as_ref().unwrap(); + if stream_node.find_source_backfill().is_some() { + stream_source_backfill_fragment_ids.insert(fragment.fragment_id); + } + } + } } // Modifications for NoShuffle downstream. @@ -898,6 +926,7 @@ impl ScaleController { fragment_map, upstream_dispatchers, stream_source_fragment_ids, + stream_source_backfill_fragment_ids, no_shuffle_target_fragment_ids, no_shuffle_source_fragment_ids, fragment_dispatcher_map, @@ -924,9 +953,11 @@ impl ScaleController { HashMap, HashMap>, )> { + tracing::debug!("build_reschedule_context, reschedules: {:#?}", reschedules); let ctx = self .build_reschedule_context(&mut reschedules, options, table_parallelisms) .await?; + tracing::debug!("reschedule context: {:#?}", ctx); let reschedules = reschedules; // Here, the plan for both upstream and downstream of the NO_SHUFFLE Fragment should already have been populated. @@ -1264,9 +1295,9 @@ impl ScaleController { } } - // For stream source fragments, we need to reallocate the splits. + // For stream source & source backfill fragments, we need to reallocate the splits. // Because we are in the Pause state, so it's no problem to reallocate - let mut fragment_stream_source_actor_splits = HashMap::new(); + let mut fragment_actor_splits = HashMap::new(); for fragment_id in reschedules.keys() { let actors_after_reschedule = fragment_actors_after_reschedule.get(fragment_id).unwrap(); @@ -1284,13 +1315,51 @@ impl ScaleController { let actor_splits = self .source_manager - .migrate_splits(*fragment_id, &prev_actor_ids, &curr_actor_ids) + .migrate_splits_for_source_actors( + *fragment_id, + &prev_actor_ids, + &curr_actor_ids, + ) .await?; - fragment_stream_source_actor_splits.insert(*fragment_id, actor_splits); + tracing::debug!( + "source actor splits: {:?}, fragment_id: {}", + actor_splits, + fragment_id + ); + fragment_actor_splits.insert(*fragment_id, actor_splits); + } + } + // We use 2 iterations to make sure source actors are migrated first, and then align backfill actors + if !ctx.stream_source_backfill_fragment_ids.is_empty() { + for fragment_id in reschedules.keys() { + let actors_after_reschedule = + fragment_actors_after_reschedule.get(fragment_id).unwrap(); + + if ctx + .stream_source_backfill_fragment_ids + .contains(fragment_id) + { + let fragment = ctx.fragment_map.get(fragment_id).unwrap(); + + let curr_actor_ids = actors_after_reschedule.keys().cloned().collect_vec(); + + let actor_splits = self.source_manager.migrate_splits_for_backfill_actors( + *fragment_id, + &fragment.upstream_fragment_ids, + &curr_actor_ids, + &fragment_actor_splits, + &no_shuffle_upstream_actor_map, + )?; + tracing::debug!( + "source backfill actor splits: {:?}, fragment_id: {}", + actor_splits, + fragment_id + ); + fragment_actor_splits.insert(*fragment_id, actor_splits); + } } } - // TODO: support migrate splits for SourceBackfill // Generate fragment reschedule plan let mut reschedule_fragment: HashMap = @@ -1409,9 +1478,7 @@ impl ScaleController { if 
let Some(actor) = ctx.actor_map.get(actor_id) { let bitmap = vnode_bitmap_updates.get(actor_id).unwrap(); - if let Some(buffer) = actor.vnode_bitmap.as_ref() { - let prev_bitmap = Bitmap::from(buffer); - + if let Some(prev_bitmap) = actor.vnode_bitmap.as_ref() { if prev_bitmap.eq(bitmap) { vnode_bitmap_updates.remove(actor_id); } @@ -1428,7 +1495,7 @@ impl ScaleController { let upstream_fragment_dispatcher_ids = upstream_fragment_dispatcher_set.into_iter().collect_vec(); - let actor_splits = fragment_stream_source_actor_splits + let actor_splits = fragment_actor_splits .get(&fragment_id) .cloned() .unwrap_or_default(); @@ -1479,6 +1546,8 @@ impl ScaleController { .pre_apply_reschedules(fragment_created_actors) .await; + tracing::debug!("analyze_reschedule_plan result: {:#?}", reschedule_fragment); + Ok((reschedule_fragment, applied_reschedules)) } @@ -1813,6 +1882,9 @@ impl ScaleController { &self, policy: TableResizePolicy, ) -> MetaResult> { + // TODO(var-vnode): use vnode count from config + let max_parallelism = VirtualNode::COUNT; + let TableResizePolicy { worker_ids, table_parallelisms, @@ -1867,12 +1939,12 @@ impl ScaleController { actor_location: &mut HashMap, table_fragment_id_map: &mut HashMap>, fragment_actor_id_map: &mut HashMap>, - table_fragments: &BTreeMap, + all_table_fragments: &BTreeMap, ) -> MetaResult<()> { // This is only for assertion purposes and will be removed once the dispatcher_id is guaranteed to always correspond to the downstream fragment_id, // such as through the foreign key constraints in the SQL backend. let mut actor_fragment_id_map_for_check = HashMap::new(); - for table_fragments in table_fragments.values() { + for table_fragments in all_table_fragments.values() { for (fragment_id, fragment) in &table_fragments.fragments { for actor in &fragment.actors { let prev = @@ -1883,7 +1955,7 @@ impl ScaleController { } } - for (table_id, table_fragments) in table_fragments { + for (table_id, table_fragments) in all_table_fragments { for (fragment_id, fragment) in &table_fragments.fragments { for actor in &fragment.actors { fragment_actor_id_map @@ -1911,8 +1983,15 @@ impl ScaleController { dispatcher.dispatcher_id as FragmentId ); } else { + tracing::error!( + "downstream actor id {} from actor {} (fragment {}) not found in actor_fragment_id_map_for_check: {actor_fragment_id_map_for_check:?}\n\ndispatchers: {:#?}", + downstream_actor_id, + actor.actor_id, + actor.fragment_id, + actor.dispatcher + ); bail!( - "downstream actor id {} from actor {} not found in fragment_actor_id_map", + "downstream actor id {} from actor {} not found", downstream_actor_id, actor.actor_id, ); @@ -2029,6 +2108,17 @@ impl ScaleController { .await?; } } + tracing::debug!( + ?worker_ids, + ?table_parallelisms, + ?no_shuffle_source_fragment_ids, + ?no_shuffle_target_fragment_ids, + ?fragment_distribution_map, + ?actor_location, + ?table_fragment_id_map, + ?fragment_actor_id_map, + "generate_table_resize_plan, after build_index" + ); let mut target_plan = HashMap::new(); @@ -2096,12 +2186,12 @@ impl ScaleController { } FragmentDistributionType::Hash => match parallelism { TableParallelism::Adaptive => { - if all_available_slots > VirtualNode::COUNT { - tracing::warn!("available parallelism for table {table_id} is larger than VirtualNode::COUNT, force limit to VirtualNode::COUNT"); - // force limit to VirtualNode::COUNT + if all_available_slots > max_parallelism { + tracing::warn!("available parallelism for table {table_id} is larger than max parallelism, force limit to 
{max_parallelism}"); + // force limit to `max_parallelism` let target_worker_slots = schedule_units_for_slots( &schedulable_worker_slots, - VirtualNode::COUNT, + max_parallelism, table_id, )?; @@ -2123,10 +2213,10 @@ impl ScaleController { } } TableParallelism::Fixed(mut n) => { - if n > VirtualNode::COUNT { + if n > max_parallelism { // This should be unreachable, but we still intercept it to prevent accidental modifications. - tracing::warn!("parallelism {n} for table {table_id} is larger than VirtualNode::COUNT, force limit to VirtualNode::COUNT"); - n = VirtualNode::COUNT + tracing::warn!("specified parallelism {n} for table {table_id} is larger than max parallelism, force limit to {max_parallelism}"); + n = max_parallelism } let target_worker_slots = @@ -2149,7 +2239,10 @@ impl ScaleController { } target_plan.retain(|_, plan| !plan.worker_actor_diff.is_empty()); - + tracing::debug!( + ?target_plan, + "generate_table_resize_plan finished target_plan" + ); Ok(target_plan) } @@ -2380,6 +2473,7 @@ impl ScaleController { /// At present, for table level scaling, we use the strategy `TableResizePolicy`. /// Currently, this is used as an internal interface, so it won’t be included in Protobuf. +#[derive(Debug)] pub struct TableResizePolicy { pub(crate) worker_ids: BTreeSet, pub(crate) table_parallelisms: HashMap, diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index a383bfee8e46a..ae5ca2a610b9c 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -188,10 +188,9 @@ impl ConnectorSourceWorker

{ let source_is_up = |res: i64| { self.source_is_up.set(res); }; - let splits = self.enumerator.list_splits().await.map_err(|e| { + let splits = self.enumerator.list_splits().await.inspect_err(|_| { source_is_up(0); self.fail_cnt += 1; - e })?; source_is_up(1); self.fail_cnt = 0; @@ -231,7 +230,8 @@ pub struct SourceManagerCore { /// `source_id` -> `(fragment_id, upstream_fragment_id)` backfill_fragments: HashMap>, - /// Splits assigned per actor + /// Splits assigned per actor, + /// incl. both `Source` and `SourceBackfill`. actor_splits: HashMap>, } @@ -468,13 +468,13 @@ impl Default for SplitDiffOptions { } /// Reassigns splits if there are new splits or dropped splits, -/// i.e., `actor_splits` and `discovered_splits` differ. +/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. /// /// The existing splits will remain unmoved in their currently assigned actor. /// /// If an actor has an upstream actor, it should be a backfill executor, -/// and its splits should be aligned with the upstream actor. `reassign_splits` should not be used in this case. -/// Use `align_backfill_splits` instead. +/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. +/// Use `align_backfill_splits` instead.** /// /// - `fragment_id`: just for logging /// @@ -790,11 +790,10 @@ impl SourceManager { /// Migrates splits from previous actors to the new actors for a rescheduled fragment. /// - /// Very occasionally split removal may happen - /// during scaling, in which case we need to use the old splits for reallocation instead of the - /// latest splits (which may be missing), so that we can resolve the split removal in the next - /// command. - pub async fn migrate_splits( + /// Very occasionally split removal may happen during scaling, in which case we need to + /// use the old splits for reallocation instead of the latest splits (which may be missing), + /// so that we can resolve the split removal in the next command. + pub async fn migrate_splits_for_source_actors( &self, fragment_id: FragmentId, prev_actor_ids: &[ActorId], @@ -817,7 +816,7 @@ impl SourceManager { fragment_id, empty_actor_splits, &prev_splits, - // pre-allocate splits is the first time getting splits and it does not have scale in scene + // pre-allocate splits is the first time getting splits and it does not have scale-in scene SplitDiffOptions::default(), ) .unwrap_or_default(); @@ -825,6 +824,43 @@ impl SourceManager { Ok(diff) } + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. 
+ pub fn migrate_splits_for_backfill_actors( + &self, + fragment_id: FragmentId, + upstream_fragment_ids: &Vec, + curr_actor_ids: &[ActorId], + fragment_actor_splits: &HashMap>>, + no_shuffle_upstream_actor_map: &HashMap>, + ) -> MetaResult>> { + // align splits for backfill fragments with its upstream source fragment + debug_assert!(upstream_fragment_ids.len() == 1); + let upstream_fragment_id = upstream_fragment_ids[0]; + let actors = no_shuffle_upstream_actor_map + .iter() + .filter(|(id, _)| curr_actor_ids.contains(id)) + .map(|(id, upstream_fragment_actors)| { + debug_assert!(upstream_fragment_actors.len() == 1); + ( + *id, + vec![*upstream_fragment_actors.get(&upstream_fragment_id).unwrap()], + ) + }); + let upstream_assignment = fragment_actor_splits.get(&upstream_fragment_id).unwrap(); + tracing::info!( + fragment_id, + upstream_fragment_id, + ?upstream_assignment, + "migrate_splits_for_backfill_actors" + ); + Ok(align_backfill_splits( + actors, + upstream_assignment, + fragment_id, + upstream_fragment_id, + )?) + } + /// Allocates splits to actors for a newly created source executor. pub async fn allocate_splits(&self, table_id: &TableId) -> MetaResult { let core = self.core.lock().await; diff --git a/src/meta/src/stream/stream_graph/schedule.rs b/src/meta/src/stream/stream_graph/schedule.rs index 0f9e473c26486..d054beb0772b0 100644 --- a/src/meta/src/stream/stream_graph/schedule.rs +++ b/src/meta/src/stream/stream_graph/schedule.rs @@ -25,7 +25,7 @@ use either::Either; use enum_as_inner::EnumAsInner; use itertools::Itertools; use risingwave_common::bitmap::Bitmap; -use risingwave_common::hash::{ActorMapping, WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{ActorMapping, VirtualNode, WorkerSlotId, WorkerSlotMapping}; use risingwave_common::{bail, hash}; use risingwave_pb::common::{ActorInfo, WorkerNode}; use risingwave_pb::meta::table_fragments::fragment::{ @@ -235,7 +235,9 @@ impl Scheduler { assert_eq!(scheduled_worker_slots.len(), parallelism); // Build the default hash mapping uniformly. 
- let default_hash_mapping = WorkerSlotMapping::build_from_ids(&scheduled_worker_slots); + // TODO(var-vnode): use vnode count from config + let default_hash_mapping = + WorkerSlotMapping::build_from_ids(&scheduled_worker_slots, VirtualNode::COUNT); let single_scheduled = schedule_units_for_slots(&slots, 1, streaming_job_id)?; let default_single_worker_id = single_scheduled.keys().exactly_one().cloned().unwrap(); diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index a8e8bc47752a5..5dc174106197c 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -31,7 +31,7 @@ use tracing::Instrument; use super::{Locations, RescheduleOptions, ScaleControllerRef, TableResizePolicy}; use crate::barrier::{ BarrierScheduler, Command, CreateStreamingJobCommandInfo, CreateStreamingJobType, - ReplaceTablePlan, SnapshotBackfillInfo, StreamRpcManager, + ReplaceTablePlan, SnapshotBackfillInfo, }; use crate::manager::{DdlType, MetaSrvEnv, MetadataManager, NotificationVersion, StreamingJob}; use crate::model::{ActorId, FragmentId, MetadataModel, TableFragments, TableParallelism}; @@ -203,8 +203,6 @@ pub struct GlobalStreamManager { creating_job_info: CreatingStreamingJobInfoRef, pub scale_controller: ScaleControllerRef, - - pub stream_rpc_manager: StreamRpcManager, } impl GlobalStreamManager { @@ -213,7 +211,6 @@ impl GlobalStreamManager { metadata_manager: MetadataManager, barrier_scheduler: BarrierScheduler, source_manager: SourceManagerRef, - stream_rpc_manager: StreamRpcManager, scale_controller: ScaleControllerRef, ) -> MetaResult { Ok(Self { @@ -223,7 +220,6 @@ impl GlobalStreamManager { source_manager, creating_job_info: Arc::new(CreatingStreamingJobInfo::default()), scale_controller, - stream_rpc_manager, }) } @@ -764,8 +760,7 @@ mod tests { use std::time::Duration; use futures::{Stream, TryStreamExt}; - use risingwave_common::hash; - use risingwave_common::hash::{ActorMapping, WorkerSlotId}; + use risingwave_common::hash::{self, ActorMapping, VirtualNode, WorkerSlotId}; use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_pb::common::{HostAddress, WorkerType}; use risingwave_pb::meta::add_worker_node_request::Property; @@ -816,13 +811,6 @@ mod tests { type StreamingControlStreamStream = impl Stream>; - async fn drop_actors( - &self, - _request: Request, - ) -> std::result::Result, Status> { - Ok(Response::new(DropActorsResponse::default())) - } - async fn streaming_control_stream( &self, request: Request>, @@ -989,11 +977,9 @@ mod tests { let (sink_manager, _) = SinkCoordinatorManager::start_worker(); - let stream_rpc_manager = StreamRpcManager::new(env.clone()); let scale_controller = Arc::new(ScaleController::new( &metadata_manager, source_manager.clone(), - stream_rpc_manager.clone(), env.clone(), )); @@ -1005,7 +991,6 @@ mod tests { source_manager.clone(), sink_manager, meta_metrics.clone(), - stream_rpc_manager.clone(), scale_controller.clone(), ) .await; @@ -1015,7 +1000,6 @@ mod tests { metadata_manager, barrier_scheduler.clone(), source_manager.clone(), - stream_rpc_manager, scale_controller.clone(), )?; @@ -1137,12 +1121,14 @@ mod tests { } fn make_mview_stream_actors(table_id: &TableId, count: usize) -> Vec { - let mut actor_bitmaps: HashMap<_, _> = - ActorMapping::new_uniform((0..count).map(|i| i as hash::ActorId)) - .to_bitmaps() - .into_iter() - .map(|(actor_id, bitmap)| (actor_id, bitmap.to_protobuf())) - .collect(); + let mut actor_bitmaps: HashMap<_, _> = ActorMapping::new_uniform( 
+ (0..count).map(|i| i as hash::ActorId), + VirtualNode::COUNT_FOR_TEST, + ) + .to_bitmaps() + .into_iter() + .map(|(actor_id, bitmap)| (actor_id, bitmap.to_protobuf())) + .collect(); (0..count) .map(|i| StreamActor { diff --git a/src/meta/src/stream/test_scale.rs b/src/meta/src/stream/test_scale.rs index 0dc0bced84005..589abb5bdab66 100644 --- a/src/meta/src/stream/test_scale.rs +++ b/src/meta/src/stream/test_scale.rs @@ -26,7 +26,7 @@ mod tests { use crate::stream::CustomActorInfo; fn simulated_parallelism(min: Option, max: Option) -> Vec { - let mut raw = vec![1, 3, 12, 42, VirtualNode::COUNT]; + let mut raw = vec![1, 3, 12, 42, VirtualNode::COUNT_FOR_TEST]; if let Some(min) = min { raw.retain(|n| *n > min); raw.push(min); @@ -39,23 +39,23 @@ mod tests { } fn build_fake_actors(actor_ids: Vec) -> Vec { - let actor_bitmaps = ActorMapping::new_uniform(actor_ids.clone().into_iter()).to_bitmaps(); + let actor_bitmaps = + ActorMapping::new_uniform(actor_ids.clone().into_iter(), VirtualNode::COUNT_FOR_TEST) + .to_bitmaps(); actor_ids .iter() .map(|actor_id| CustomActorInfo { actor_id: *actor_id, - vnode_bitmap: actor_bitmaps - .get(actor_id) - .map(|bitmap| bitmap.to_protobuf()), + vnode_bitmap: actor_bitmaps.get(actor_id).cloned(), ..Default::default() }) .collect() } fn check_affinity_for_scale_in(bitmap: &Bitmap, actor: &CustomActorInfo) { - let prev_bitmap = Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()); + let prev_bitmap = actor.vnode_bitmap.as_ref().unwrap(); - for idx in 0..VirtualNode::COUNT { + for idx in 0..VirtualNode::COUNT_FOR_TEST { if prev_bitmap.is_set(idx) { assert!(bitmap.is_set(idx)); } @@ -63,7 +63,9 @@ mod tests { } fn check_bitmaps(bitmaps: &HashMap) { - let mut target = (0..VirtualNode::COUNT).map(|_| false).collect_vec(); + let mut target = (0..VirtualNode::COUNT_FOR_TEST) + .map(|_| false) + .collect_vec(); for bitmap in bitmaps.values() { for (idx, pos) in target.iter_mut().enumerate() { @@ -89,9 +91,10 @@ mod tests { fn test_build_actor_mapping() { for parallelism in simulated_parallelism(None, None) { let actor_ids = (0..parallelism as ActorId).collect_vec(); - let actor_mapping = ActorMapping::new_uniform(actor_ids.into_iter()); + let actor_mapping = + ActorMapping::new_uniform(actor_ids.into_iter(), VirtualNode::COUNT_FOR_TEST); - assert_eq!(actor_mapping.len(), VirtualNode::COUNT); + assert_eq!(actor_mapping.len(), VirtualNode::COUNT_FOR_TEST); let mut check: HashMap> = HashMap::new(); for (vnode, actor_id) in actor_mapping.iter_with_vnode() { @@ -120,7 +123,7 @@ mod tests { .map(|actor| { ( actor.actor_id as ActorId, - Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()), + actor.vnode_bitmap.unwrap().clone(), ) }) .collect(); @@ -178,7 +181,7 @@ mod tests { #[test] fn test_rebalance_scale_out() { - for parallelism in simulated_parallelism(Some(3), Some(VirtualNode::COUNT - 1)) { + for parallelism in simulated_parallelism(Some(3), Some(VirtualNode::COUNT_FOR_TEST - 1)) { let actors = build_fake_actors((0..parallelism as ActorId).collect_vec()); // add 1 @@ -189,8 +192,9 @@ mod tests { let actors = build_fake_actors((0..parallelism as ActorId).collect_vec()); - // add to VirtualNode::COUNT - let actors_to_add = (parallelism as ActorId..VirtualNode::COUNT as ActorId).collect(); + // add to VirtualNode::COUNT_FOR_TEST + let actors_to_add = + (parallelism as ActorId..VirtualNode::COUNT_FOR_TEST as ActorId).collect(); let result = rebalance_actor_vnode(&actors, &BTreeSet::new(), &actors_to_add); assert_eq!(result.len(), actors.len() + actors_to_add.len()); 
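For context: `check_bitmaps` (defined near the top of this test module) asserts that the rebalanced bitmaps cover every vnode exactly once, and the `div_rem`-based split in `rebalance_actor_vnode` additionally implies each actor owns either `floor(vnode_count / n)` or one more vnode. A standalone sketch of that combined invariant, with `ActorId` assumed to be the module's `u32` alias (illustration only, not part of the patch):

use std::collections::HashMap;

use risingwave_common::bitmap::Bitmap;
use risingwave_common::hash::VirtualNode;

type ActorId = u32; // assumed alias for illustration

fn check_even_partition(bitmaps: &HashMap<ActorId, Bitmap>) {
    assert!(!bitmaps.is_empty());
    let vnode_count = VirtualNode::COUNT_FOR_TEST;
    let expected = vnode_count / bitmaps.len();
    let mut covered = vec![false; vnode_count];
    for bitmap in bitmaps.values() {
        let mut owned = 0;
        for idx in 0..vnode_count {
            if bitmap.is_set(idx) {
                assert!(!covered[idx], "vnode {} assigned to more than one actor", idx);
                covered[idx] = true;
                owned += 1;
            }
        }
        // Each actor ends up with either `expected` or `expected + 1` vnodes.
        assert!(owned == expected || owned == expected + 1);
    }
    assert!(covered.into_iter().all(|b| b), "some vnode left unassigned");
}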
check_bitmaps(&result); @@ -220,7 +224,7 @@ mod tests { } let target_bitmap = result.get(&actor.actor_id).unwrap(); - let prev_bitmap = Bitmap::from(actor.vnode_bitmap.as_ref().unwrap()); + let prev_bitmap = actor.vnode_bitmap.as_ref().unwrap(); assert!(prev_bitmap.eq(target_bitmap)); } } @@ -275,7 +279,7 @@ mod tests { #[test] fn test_rebalance_scale_real() { - let actor_ids = (0..(VirtualNode::COUNT - 1) as ActorId).collect_vec(); + let actor_ids = (0..(VirtualNode::COUNT_FOR_TEST - 1) as ActorId).collect_vec(); let actors = build_fake_actors(actor_ids); let actors_to_remove = btreeset! {0, 1}; let actors_to_add = btreeset! {255}; diff --git a/src/object_store/src/lib.rs b/src/object_store/src/lib.rs index d9e768b7f0290..c70d38eb90a90 100644 --- a/src/object_store/src/lib.rs +++ b/src/object_store/src/lib.rs @@ -14,7 +14,6 @@ #![feature(trait_alias)] #![feature(type_alias_impl_trait)] -#![feature(lint_reasons)] #![feature(error_generic_member_access)] #![feature(let_chains)] diff --git a/src/prost/build.rs b/src/prost/build.rs index 18bc2d4ae9494..0afbaef2ea730 100644 --- a/src/prost/build.rs +++ b/src/prost/build.rs @@ -147,6 +147,7 @@ fn main() -> Result<(), Box> { "plan_common.AdditionalColumnPartition", "#[derive(Eq, Hash)]", ) + .type_attribute("plan_common.AdditionalColumnPayload", "#[derive(Eq, Hash)]") .type_attribute( "plan_common.AdditionalColumnTimestamp", "#[derive(Eq, Hash)]", @@ -179,6 +180,7 @@ fn main() -> Result<(), Box> { .type_attribute("hummock.GroupDestroy", "#[derive(Eq)]") .type_attribute("hummock.GroupMetaChange", "#[derive(Eq)]") .type_attribute("hummock.GroupTableChange", "#[derive(Eq)]") + .type_attribute("hummock.GroupMerge", "#[derive(Eq)]") .type_attribute("hummock.GroupDelta", "#[derive(Eq)]") .type_attribute("hummock.LevelHandler.RunningCompactTask", "#[derive(Eq)]") .type_attribute("hummock.LevelHandler", "#[derive(Eq)]") diff --git a/src/prost/src/lib.rs b/src/prost/src/lib.rs index c8ad9de582edc..e965f76282da4 100644 --- a/src/prost/src/lib.rs +++ b/src/prost/src/lib.rs @@ -15,7 +15,6 @@ // for derived code of `Message` #![expect(clippy::all)] #![expect(clippy::doc_markdown)] -#![feature(lint_reasons)] use std::str::FromStr; diff --git a/src/risedevtool/src/bin/risedev-compose.rs b/src/risedevtool/src/bin/risedev-compose.rs index 0547fda6b7008..e51961831056a 100644 --- a/src/risedevtool/src/bin/risedev-compose.rs +++ b/src/risedevtool/src/bin/risedev-compose.rs @@ -82,11 +82,11 @@ fn main() -> Result<()> { ) .collect(); - let (config_path, expanded_config) = + let (config_path, _env, expanded_config) = ConfigExpander::expand_with_extra_info(".", &opts.profile, extra_info)?; (expanded_config, Some(compose_deploy_config), config_path) } else { - let (config_path, expanded_config) = ConfigExpander::expand(".", &opts.profile)?; + let (config_path, _env, expanded_config) = ConfigExpander::expand(".", &opts.profile)?; (expanded_config, None, config_path) }; diff --git a/src/risedevtool/src/bin/risedev-dev.rs b/src/risedevtool/src/bin/risedev-dev.rs index c53453b3f903d..80415e321d805 100644 --- a/src/risedevtool/src/bin/risedev-dev.rs +++ b/src/risedevtool/src/bin/risedev-dev.rs @@ -66,6 +66,7 @@ impl ProgressManager { fn task_main( manager: &mut ProgressManager, services: &Vec, + env: Vec, ) -> Result<(Vec<(String, Duration)>, String)> { let log_path = env::var("PREFIX_LOG")?; @@ -82,7 +83,7 @@ fn task_main( // Start Tmux and kill previous services { let mut ctx = ExecuteContext::new(&mut logger, manager.new_progress(), status_dir.clone()); - let mut service 
= ConfigureTmuxTask::new()?; + let mut service = ConfigureTmuxTask::new(env)?; service.execute(&mut ctx)?; writeln!( @@ -392,7 +393,7 @@ fn main() -> Result<()> { .nth(1) .unwrap_or_else(|| "default".to_string()); - let (config_path, risedev_config) = ConfigExpander::expand(".", &task_name)?; + let (config_path, env, risedev_config) = ConfigExpander::expand(".", &task_name)?; if let Some(config_path) = &config_path { let target = Path::new(&env::var("PREFIX_CONFIG")?).join("risingwave.toml"); @@ -420,7 +421,7 @@ fn main() -> Result<()> { services.len(), task_name )); - let task_result = task_main(&mut manager, &services); + let task_result = task_main(&mut manager, &services, env); match task_result { Ok(_) => { diff --git a/src/risedevtool/src/config.rs b/src/risedevtool/src/config.rs index e4ba9acdf4e19..1abab635b88c1 100644 --- a/src/risedevtool/src/config.rs +++ b/src/risedevtool/src/config.rs @@ -50,6 +50,24 @@ impl ConfigExpander { /// Transforms `risedev.yml` and `risedev-profiles.user.yml` to a fully expanded yaml file. /// + /// Format: + /// + /// ```yaml + /// my-profile: + /// config-path: src/config/ci-recovery.toml + /// env: + /// RUST_LOG: "info,risingwave_storage::hummock=off" + /// RW_ENABLE_PRETTY_LOG: "true" + /// steps: + /// - use: minio + /// - use: sqlite + /// - use: meta-node + /// meta-backend: sqlite + /// - use: compute-node + /// parallelism: 1 + /// - use: frontend + /// ``` + /// /// # Arguments /// /// * `root` is the root directory of these YAML files. @@ -58,8 +76,11 @@ impl ConfigExpander { /// /// # Returns /// - /// A pair of `config_path` and expanded steps (items in `{profile}.steps` section in YAML) - pub fn expand(root: impl AsRef, profile: &str) -> Result<(Option, Yaml)> { + /// `(config_path, env, steps)` + pub fn expand( + root: impl AsRef, + profile: &str, + ) -> Result<(Option, Vec, Yaml)> { Self::expand_with_extra_info(root, profile, HashMap::new()) } @@ -72,7 +93,7 @@ impl ConfigExpander { root: impl AsRef, profile: &str, extra_info: HashMap, - ) -> Result<(Option, Yaml)> { + ) -> Result<(Option, Vec, Yaml)> { let global_path = root.as_ref().join(RISEDEV_CONFIG_FILE); let global_yaml = Self::load_yaml(global_path)?; let global_config = global_yaml @@ -120,6 +141,22 @@ impl ConfigExpander { .get(&Yaml::String("config-path".to_string())) .and_then(|s| s.as_str()) .map(|s| s.to_string()); + let mut env = vec![]; + if let Some(env_section) = profile_section.get(&Yaml::String("env".to_string())) { + let env_section = env_section + .as_hash() + .ok_or_else(|| anyhow!("expect `env` section to be a hashmap"))?; + + for (k, v) in env_section { + let key = k + .as_str() + .ok_or_else(|| anyhow!("expect env key to be a string"))?; + let value = v + .as_str() + .ok_or_else(|| anyhow!("expect env value to be a string"))?; + env.push(format!("{}={}", key, value)); + } + } let steps = profile_section .get(&Yaml::String("steps".to_string())) @@ -131,7 +168,7 @@ impl ConfigExpander { let steps = IdExpander::new(&steps)?.visit(steps)?; let steps = ProvideExpander::new(&steps)?.visit(steps)?; - Ok((config_path, steps)) + Ok((config_path, env, steps)) } /// Parses the expanded yaml into [`ServiceConfig`]s. 
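The `env` entries collected in `expand_with_extra_info` above are plain `KEY=value` strings, which `ConfigureTmuxTask` (next hunk) forwards to `tmux new-session -e`. A self-contained sketch of that parsing step, assuming the `yaml-rust` types risedev's `ConfigExpander` already uses (illustration only, not the patch's code):

use anyhow::{anyhow, Result};
use yaml_rust::{Yaml, YamlLoader};

// Mirrors the `env` handling in `expand_with_extra_info` as a standalone helper.
fn collect_env(profile_section: &yaml_rust::yaml::Hash) -> Result<Vec<String>> {
    let mut env = vec![];
    if let Some(env_section) = profile_section.get(&Yaml::String("env".to_string())) {
        let env_section = env_section
            .as_hash()
            .ok_or_else(|| anyhow!("expect `env` section to be a hashmap"))?;
        for (k, v) in env_section {
            let key = k.as_str().ok_or_else(|| anyhow!("expect env key to be a string"))?;
            let value = v.as_str().ok_or_else(|| anyhow!("expect env value to be a string"))?;
            // Rendered as KEY=value so the tmux task can pass each entry via `-e`.
            env.push(format!("{}={}", key, value));
        }
    }
    Ok(env)
}

fn main() -> Result<()> {
    // Same shape as the profile example in the doc comment above.
    let docs = YamlLoader::load_from_str(
        "env:\n  RUST_LOG: \"info,risingwave_storage::hummock=off\"\n  RW_ENABLE_PRETTY_LOG: \"true\"\n",
    )?;
    let profile = docs[0].as_hash().ok_or_else(|| anyhow!("expect a hashmap"))?;
    assert_eq!(
        collect_env(profile)?,
        vec![
            "RUST_LOG=info,risingwave_storage::hummock=off".to_string(),
            "RW_ENABLE_PRETTY_LOG=true".to_string(),
        ]
    );
    Ok(())
}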
diff --git a/src/risedevtool/src/lib.rs b/src/risedevtool/src/lib.rs index 57294e5a7eafa..e7b2fdf56f777 100644 --- a/src/risedevtool/src/lib.rs +++ b/src/risedevtool/src/lib.rs @@ -15,7 +15,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(exit_status_error)] #![feature(let_chains)] -#![feature(lint_reasons)] mod config; pub use config::*; diff --git a/src/risedevtool/src/task/configure_tmux_service.rs b/src/risedevtool/src/task/configure_tmux_service.rs index a20274edfc3c1..925cb2de38444 100644 --- a/src/risedevtool/src/task/configure_tmux_service.rs +++ b/src/risedevtool/src/task/configure_tmux_service.rs @@ -16,13 +16,15 @@ use std::env; use std::path::Path; use std::process::Command; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result}; use console::style; -use crate::util::stylized_risedev_subcmd; +use crate::util::{risedev_cmd, stylized_risedev_subcmd}; use crate::{ExecuteContext, Task}; -pub struct ConfigureTmuxTask; +pub struct ConfigureTmuxTask { + env: Vec, +} pub const RISEDEV_NAME: &str = "risedev"; @@ -33,8 +35,8 @@ pub fn new_tmux_command() -> Command { } impl ConfigureTmuxTask { - pub fn new() -> Result { - Ok(Self) + pub fn new(env: Vec) -> Result { + Ok(Self { env }) } } @@ -59,10 +61,17 @@ impl Task for ConfigureTmuxTask { let mut cmd = new_tmux_command(); cmd.arg("list-sessions"); if ctx.run_command(cmd).is_ok() { - bail!( - "A previous cluster is already running. Please kill it first with {}.", - stylized_risedev_subcmd("k"), - ); + ctx.pb.set_message("killing previous session..."); + + let mut cmd = Command::new(risedev_cmd()); + cmd.arg("k"); + ctx.run_command(cmd).with_context(|| { + format!( + "A previous cluster is already running while `risedev-dev` failed to kill it. \ + Please kill it manually with {}.", + stylized_risedev_subcmd("k") + ) + })?; } ctx.pb.set_message("creating new session..."); @@ -71,8 +80,11 @@ impl Task for ConfigureTmuxTask { cmd.arg("new-session") // this will automatically create the `risedev` tmux server .arg("-d") .arg("-s") - .arg(RISEDEV_NAME) - .arg("-c") + .arg(RISEDEV_NAME); + for e in &self.env { + cmd.arg("-e").arg(e); + } + cmd.arg("-c") .arg(Path::new(&prefix_path)) .arg(Path::new(&prefix_bin).join("welcome.sh")); diff --git a/src/risedevtool/src/task/task_kafka_ready_check.rs b/src/risedevtool/src/task/task_kafka_ready_check.rs index 79838bf8eca66..b749822a1ebe2 100644 --- a/src/risedevtool/src/task/task_kafka_ready_check.rs +++ b/src/risedevtool/src/task/task_kafka_ready_check.rs @@ -42,7 +42,7 @@ impl Task for KafkaReadyCheckTask { let mut config = ClientConfig::new(); config.set( "bootstrap.servers", - &format!("{}:{}", self.config.address, self.config.port), + format!("{}:{}", self.config.address, self.config.port), ); let rt = tokio::runtime::Builder::new_current_thread() diff --git a/src/rpc_client/Cargo.toml b/src/rpc_client/Cargo.toml index 49729c6d9e8ac..6a25be3c21738 100644 --- a/src/rpc_client/Cargo.toml +++ b/src/rpc_client/Cargo.toml @@ -23,7 +23,7 @@ http = "1" hyper = "1" itertools = { workspace = true } lru = { workspace = true } -moka = { version = "0.12", features = ["future"] } +moka = { version = "0.12.0", features = ["future"] } paste = "1" rand = { workspace = true } risingwave_common = { workspace = true } diff --git a/src/rpc_client/src/hummock_meta_client.rs b/src/rpc_client/src/hummock_meta_client.rs index df42a0da3ff35..bb62875b3fae1 100644 --- a/src/rpc_client/src/hummock_meta_client.rs +++ b/src/rpc_client/src/hummock_meta_client.rs @@ -66,5 +66,9 @@ pub trait 
HummockMetaClient: Send + Sync + 'static { BoxStream<'static, CompactionEventItem>, )>; - async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result; + async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result; } diff --git a/src/rpc_client/src/meta_client.rs b/src/rpc_client/src/meta_client.rs index b4e06d8690b72..c7b7204bff7c8 100644 --- a/src/rpc_client/src/meta_client.rs +++ b/src/rpc_client/src/meta_client.rs @@ -22,6 +22,7 @@ use std::time::{Duration, SystemTime}; use anyhow::{anyhow, Context}; use async_trait::async_trait; +use cluster_limit_service_client::ClusterLimitServiceClient; use either::Either; use futures::stream::BoxStream; use lru::LruCache; @@ -1245,10 +1246,12 @@ impl MetaClient { &self, group_id: CompactionGroupId, table_ids_to_new_group: &[StateTableId], + partition_vnode_count: u32, ) -> Result { let req = SplitCompactionGroupRequest { group_id, table_ids: table_ids_to_new_group.to_vec(), + partition_vnode_count, }; let resp = self.inner.split_compaction_group(req).await?; Ok(resp.new_group_id) @@ -1431,11 +1434,36 @@ impl MetaClient { Ok(resp.ret) } - pub async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result { - let req = GetVersionByEpochRequest { epoch }; + pub async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result { + let req = GetVersionByEpochRequest { epoch, table_id }; let resp = self.inner.get_version_by_epoch(req).await?; Ok(resp.version.unwrap()) } + + pub async fn get_cluster_limits( + &self, + ) -> Result> { + let req = GetClusterLimitsRequest {}; + let resp = self.inner.get_cluster_limits(req).await?; + Ok(resp.active_limits.into_iter().map(|l| l.into()).collect()) + } + + pub async fn merge_compaction_group( + &self, + left_group_id: CompactionGroupId, + right_group_id: CompactionGroupId, + ) -> Result<()> { + let req = MergeCompactionGroupRequest { + left_group_id, + right_group_id, + }; + self.inner.merge_compaction_group(req).await?; + Ok(()) + } } #[async_trait] @@ -1598,8 +1626,12 @@ impl HummockMetaClient for MetaClient { Ok((request_sender, Box::pin(stream))) } - async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result { - self.get_version_by_epoch(epoch).await + async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result { + self.get_version_by_epoch(epoch, table_id).await } } @@ -1636,6 +1668,7 @@ struct GrpcMetaClientCore { cloud_client: CloudServiceClient, sink_coordinate_client: SinkCoordinationRpcClient, event_log_client: EventLogServiceClient, + cluster_limit_client: ClusterLimitServiceClient, } impl GrpcMetaClientCore { @@ -1662,7 +1695,8 @@ impl GrpcMetaClientCore { let serving_client = ServingServiceClient::new(channel.clone()); let cloud_client = CloudServiceClient::new(channel.clone()); let sink_coordinate_client = SinkCoordinationServiceClient::new(channel.clone()); - let event_log_client = EventLogServiceClient::new(channel); + let event_log_client = EventLogServiceClient::new(channel.clone()); + let cluster_limit_client = ClusterLimitServiceClient::new(channel); GrpcMetaClientCore { cluster_client, @@ -1682,6 +1716,7 @@ impl GrpcMetaClientCore { cloud_client, sink_coordinate_client, event_log_client, + cluster_limit_client, } } } @@ -2105,6 +2140,7 @@ macro_rules! 
for_all_meta_rpc { ,{ hummock_client, cancel_compact_task, CancelCompactTaskRequest, CancelCompactTaskResponse} ,{ hummock_client, list_change_log_epochs, ListChangeLogEpochsRequest, ListChangeLogEpochsResponse } ,{ hummock_client, get_version_by_epoch, GetVersionByEpochRequest, GetVersionByEpochResponse } + ,{ hummock_client, merge_compaction_group, MergeCompactionGroupRequest, MergeCompactionGroupResponse } ,{ user_client, create_user, CreateUserRequest, CreateUserResponse } ,{ user_client, update_user, UpdateUserRequest, UpdateUserResponse } ,{ user_client, drop_user, DropUserRequest, DropUserResponse } @@ -2126,6 +2162,7 @@ macro_rules! for_all_meta_rpc { ,{ cloud_client, rw_cloud_validate_source, RwCloudValidateSourceRequest, RwCloudValidateSourceResponse } ,{ event_log_client, list_event_log, ListEventLogRequest, ListEventLogResponse } ,{ event_log_client, add_event_log, AddEventLogRequest, AddEventLogResponse } + ,{ cluster_limit_client, get_cluster_limits, GetClusterLimitsRequest, GetClusterLimitsResponse } } }; } diff --git a/src/rpc_client/src/sink_coordinate_client.rs b/src/rpc_client/src/sink_coordinate_client.rs index 06602ef4db3b7..8823dd440bc77 100644 --- a/src/rpc_client/src/sink_coordinate_client.rs +++ b/src/rpc_client/src/sink_coordinate_client.rs @@ -18,7 +18,7 @@ use anyhow::anyhow; use futures::{Stream, TryStreamExt}; use risingwave_common::bitmap::Bitmap; use risingwave_pb::connector_service::coordinate_request::{ - CommitRequest, StartCoordinationRequest, + CommitRequest, StartCoordinationRequest, UpdateVnodeBitmapRequest, }; use risingwave_pb::connector_service::{ coordinate_request, coordinate_response, CoordinateRequest, CoordinateResponse, PbSinkParam, @@ -99,4 +99,24 @@ impl CoordinatorStreamHandle { msg => Err(anyhow!("should get commit response but get {:?}", msg)), } } + + pub async fn update_vnode_bitmap(&mut self, vnode_bitmap: &Bitmap) -> anyhow::Result<()> { + self.send_request(CoordinateRequest { + msg: Some(coordinate_request::Msg::UpdateVnodeRequest( + UpdateVnodeBitmapRequest { + vnode_bitmap: Some(vnode_bitmap.to_protobuf()), + }, + )), + }) + .await?; + Ok(()) + } + + pub async fn stop(&mut self) -> anyhow::Result<()> { + self.send_request(CoordinateRequest { + msg: Some(coordinate_request::Msg::Stop(true)), + }) + .await?; + Ok(()) + } } diff --git a/src/rpc_client/src/stream_client.rs b/src/rpc_client/src/stream_client.rs index 920b6f0777f37..40a6d48dacb37 100644 --- a/src/rpc_client/src/stream_client.rs +++ b/src/rpc_client/src/stream_client.rs @@ -70,8 +70,7 @@ pub type StreamClientPoolRef = Arc; macro_rules! for_all_stream_rpc { ($macro:ident) => { $macro! 
{ - { 0, drop_actors, DropActorsRequest, DropActorsResponse } - ,{ 0, wait_epoch_commit, WaitEpochCommitRequest, WaitEpochCommitResponse } + { 0, wait_epoch_commit, WaitEpochCommitRequest, WaitEpochCommitResponse } } }; } diff --git a/src/sqlparser/src/ast/statement.rs b/src/sqlparser/src/ast/statement.rs index f297530ac6aff..b8e1aa245b4ec 100644 --- a/src/sqlparser/src/ast/statement.rs +++ b/src/sqlparser/src/ast/statement.rs @@ -412,6 +412,7 @@ pub(super) fn fmt_create_items( || !watermarks.is_empty() || wildcard_idx.is_some(); has_items.then(|| write!(&mut items, "(")); + if let Some(wildcard_idx) = wildcard_idx { let (columns_l, columns_r) = columns.split_at(wildcard_idx); write!(&mut items, "{}", display_comma_separated(columns_l))?; @@ -426,14 +427,21 @@ pub(super) fn fmt_create_items( } else { write!(&mut items, "{}", display_comma_separated(columns))?; } - if !columns.is_empty() && (!constraints.is_empty() || !watermarks.is_empty()) { + let mut leading_items = !columns.is_empty() || wildcard_idx.is_some(); + + if leading_items && !constraints.is_empty() { write!(&mut items, ", ")?; } write!(&mut items, "{}", display_comma_separated(constraints))?; - if !columns.is_empty() && !constraints.is_empty() && !watermarks.is_empty() { + leading_items |= !constraints.is_empty(); + + if leading_items && !watermarks.is_empty() { write!(&mut items, ", ")?; } write!(&mut items, "{}", display_comma_separated(watermarks))?; + // uncomment this when adding more sections below + // leading_items |= !watermarks.is_empty(); + has_items.then(|| write!(&mut items, ")")); Ok(items) } diff --git a/src/sqlparser/src/lib.rs b/src/sqlparser/src/lib.rs index a102e5428edae..07967d4cf75a7 100644 --- a/src/sqlparser/src/lib.rs +++ b/src/sqlparser/src/lib.rs @@ -31,7 +31,6 @@ //! 
``` #![cfg_attr(not(feature = "std"), no_std)] -#![feature(lint_reasons)] #![feature(let_chains)] #![expect(clippy::doc_markdown)] #![expect(clippy::upper_case_acronyms)] diff --git a/src/sqlparser/tests/testdata/create.yaml b/src/sqlparser/tests/testdata/create.yaml index bcd94d53f1ed7..9fca29ad98527 100644 --- a/src/sqlparser/tests/testdata/create.yaml +++ b/src/sqlparser/tests/testdata/create.yaml @@ -42,6 +42,12 @@ - input: CREATE SOURCE IF NOT EXISTS src WITH (kafka.topic = 'abc', kafka.brokers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.registry = 'http://') formatted_sql: CREATE SOURCE IF NOT EXISTS src WITH (kafka.topic = 'abc', kafka.brokers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.registry = 'http://') formatted_ast: 'CreateSource { stmt: CreateSourceStatement { temporary: false, if_not_exists: true, columns: [], wildcard_idx: None, constraints: [], source_name: ObjectName([Ident { value: "src", quote_style: None }]), with_properties: WithProperties([SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "topic", quote_style: None }]), value: SingleQuotedString("abc") }, SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "brokers", quote_style: None }]), value: SingleQuotedString("localhost:1001") }]), source_schema: V2(ConnectorSchema { format: Plain, row_encode: Protobuf, row_options: [SqlOption { name: ObjectName([Ident { value: "message", quote_style: None }]), value: SingleQuotedString("Foo") }, SqlOption { name: ObjectName([Ident { value: "schema", quote_style: None }, Ident { value: "registry", quote_style: None }]), value: SingleQuotedString("http://") }], key_encode: None }), source_watermarks: [], include_column_options: [] } }' +- input: CREATE SOURCE IF NOT EXISTS src (*, WATERMARK FOR event_time AS event_time - INTERVAL '60' SECOND) WITH (kafka.topic = 'abc', kafka.brokers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.registry = 'http://') + formatted_sql: CREATE SOURCE IF NOT EXISTS src (*, WATERMARK FOR event_time AS event_time - INTERVAL '60' SECOND) WITH (kafka.topic = 'abc', kafka.brokers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.registry = 'http://') + formatted_ast: 'CreateSource { stmt: CreateSourceStatement { temporary: false, if_not_exists: true, columns: [], wildcard_idx: Some(0), constraints: [], source_name: ObjectName([Ident { value: "src", quote_style: None }]), with_properties: WithProperties([SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "topic", quote_style: None }]), value: SingleQuotedString("abc") }, SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "brokers", quote_style: None }]), value: SingleQuotedString("localhost:1001") }]), source_schema: V2(ConnectorSchema { format: Plain, row_encode: Protobuf, row_options: [SqlOption { name: ObjectName([Ident { value: "message", quote_style: None }]), value: SingleQuotedString("Foo") }, SqlOption { name: ObjectName([Ident { value: "schema", quote_style: None }, Ident { value: "registry", quote_style: None }]), value: SingleQuotedString("http://") }], key_encode: None }), source_watermarks: [SourceWatermark { column: Ident { value: "event_time", quote_style: None }, expr: BinaryOp { left: Identifier(Ident { value: "event_time", quote_style: None }), op: Minus, right: Value(Interval { value: "60", leading_field: Some(Second), 
leading_precision: None, last_field: None, fractional_seconds_precision: None }) } }], include_column_options: [] } }' +- input: CREATE SOURCE IF NOT EXISTS src (PRIMARY KEY (event_id), WATERMARK FOR event_time AS event_time - INTERVAL '60' SECOND) WITH (kafka.topic = 'abc', kafka.brokers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.registry = 'http://') + formatted_sql: CREATE SOURCE IF NOT EXISTS src (PRIMARY KEY (event_id), WATERMARK FOR event_time AS event_time - INTERVAL '60' SECOND) WITH (kafka.topic = 'abc', kafka.brokers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.registry = 'http://') + formatted_ast: 'CreateSource { stmt: CreateSourceStatement { temporary: false, if_not_exists: true, columns: [], wildcard_idx: None, constraints: [Unique { name: None, columns: [Ident { value: "event_id", quote_style: None }], is_primary: true }], source_name: ObjectName([Ident { value: "src", quote_style: None }]), with_properties: WithProperties([SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "topic", quote_style: None }]), value: SingleQuotedString("abc") }, SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "brokers", quote_style: None }]), value: SingleQuotedString("localhost:1001") }]), source_schema: V2(ConnectorSchema { format: Plain, row_encode: Protobuf, row_options: [SqlOption { name: ObjectName([Ident { value: "message", quote_style: None }]), value: SingleQuotedString("Foo") }, SqlOption { name: ObjectName([Ident { value: "schema", quote_style: None }, Ident { value: "registry", quote_style: None }]), value: SingleQuotedString("http://") }], key_encode: None }), source_watermarks: [SourceWatermark { column: Ident { value: "event_time", quote_style: None }, expr: BinaryOp { left: Identifier(Ident { value: "event_time", quote_style: None }), op: Minus, right: Value(Interval { value: "60", leading_field: Some(Second), leading_precision: None, last_field: None, fractional_seconds_precision: None }) } }], include_column_options: [] } }' - input: CREATE SOURCE bid (auction INTEGER, bidder INTEGER, price INTEGER, WATERMARK FOR auction AS auction - 1, "date_time" TIMESTAMP) with (connector = 'nexmark', nexmark.table.type = 'Bid', nexmark.split.num = '12', nexmark.min.event.gap.in.ns = '0') formatted_sql: CREATE SOURCE bid (auction INT, bidder INT, price INT, "date_time" TIMESTAMP, WATERMARK FOR auction AS auction - 1) WITH (connector = 'nexmark', nexmark.table.type = 'Bid', nexmark.split.num = '12', nexmark.min.event.gap.in.ns = '0') FORMAT NATIVE ENCODE NATIVE formatted_ast: 'CreateSource { stmt: CreateSourceStatement { temporary: false, if_not_exists: false, columns: [ColumnDef { name: Ident { value: "auction", quote_style: None }, data_type: Some(Int), collation: None, options: [] }, ColumnDef { name: Ident { value: "bidder", quote_style: None }, data_type: Some(Int), collation: None, options: [] }, ColumnDef { name: Ident { value: "price", quote_style: None }, data_type: Some(Int), collation: None, options: [] }, ColumnDef { name: Ident { value: "date_time", quote_style: Some(''"'') }, data_type: Some(Timestamp(false)), collation: None, options: [] }], wildcard_idx: None, constraints: [], source_name: ObjectName([Ident { value: "bid", quote_style: None }]), with_properties: WithProperties([SqlOption { name: ObjectName([Ident { value: "connector", quote_style: None }]), value: SingleQuotedString("nexmark") }, SqlOption { name: ObjectName([Ident { 
value: "nexmark", quote_style: None }, Ident { value: "table", quote_style: None }, Ident { value: "type", quote_style: None }]), value: SingleQuotedString("Bid") }, SqlOption { name: ObjectName([Ident { value: "nexmark", quote_style: None }, Ident { value: "split", quote_style: None }, Ident { value: "num", quote_style: None }]), value: SingleQuotedString("12") }, SqlOption { name: ObjectName([Ident { value: "nexmark", quote_style: None }, Ident { value: "min", quote_style: None }, Ident { value: "event", quote_style: None }, Ident { value: "gap", quote_style: None }, Ident { value: "in", quote_style: None }, Ident { value: "ns", quote_style: None }]), value: SingleQuotedString("0") }]), source_schema: V2(ConnectorSchema { format: Native, row_encode: Native, row_options: [], key_encode: None }), source_watermarks: [SourceWatermark { column: Ident { value: "auction", quote_style: None }, expr: BinaryOp { left: Identifier(Ident { value: "auction", quote_style: None }), op: Minus, right: Value(Number("1")) } }], include_column_options: [] } }' diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 2886c4e4e23f7..b321c43b99e43 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -36,7 +36,7 @@ libc = "0.2" lz4 = "1.25.0" memcomparable = "0.2" metrics-prometheus = "0.7" -moka = { version = "0.12", features = ["future", "sync"] } +moka = { version = "0.12.0", features = ["future", "sync"] } more-asserts = "0.3" num-integer = "0.1" parking_lot = { workspace = true } @@ -96,7 +96,7 @@ workspace-hack = { path = "../workspace-hack" } bincode = "1" criterion = { workspace = true, features = ["async_futures", "async_tokio"] } expect-test = "1" -risingwave_hummock_sdk = { workspace = true } +risingwave_hummock_sdk = { workspace = true, features = ["test"] } risingwave_test_runner = { workspace = true } uuid = { version = "1", features = ["v4"] } diff --git a/src/storage/backup/integration_tests/test_basic.sh b/src/storage/backup/integration_tests/test_basic.sh index afaee3ac6c507..9674807e62c6e 100644 --- a/src/storage/backup/integration_tests/test_basic.sh +++ b/src/storage/backup/integration_tests/test_basic.sh @@ -34,12 +34,20 @@ if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | gr echo "expect 0 MV" exit 1 fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "0 row"; then + echo "expect 0 SECRET" + exit 1 +fi echo "restore snapshot ${job_id_1} succeeded" restore "${job_id_2}" start_cluster if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | grep -q "1 row"; then - echo "expect 1 MVs" + echo "expect 1 MV" + exit 1 +fi +if ! psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "1 row"; then + echo "expect 1 SECRET" exit 1 fi echo "restore snapshot ${job_id_2} succeeded" @@ -55,6 +63,10 @@ if ! psql -h localhost -p 4566 -d dev -U root -c "show materialized views;" | gr echo "expect 0 MV" exit 1 fi +if ! 
psql -h localhost -p 4566 -d dev -U root -c "show secrets;" | grep -q "0 row"; then + echo "expect 0 SECRET" + exit 1 +fi echo "restore snapshot ${job_id_3} succeeded" echo "test succeeded" diff --git a/src/storage/backup/src/lib.rs b/src/storage/backup/src/lib.rs index 8dfba1b62a181..e543d139b44f0 100644 --- a/src/storage/backup/src/lib.rs +++ b/src/storage/backup/src/lib.rs @@ -17,7 +17,6 @@ #![feature(type_alias_impl_trait)] #![feature(extract_if)] #![feature(custom_test_frameworks)] -#![feature(lint_reasons)] #![feature(map_try_insert)] #![feature(hash_extract_if)] #![feature(btree_extract_if)] diff --git a/src/storage/benches/bench_table_watermarks.rs b/src/storage/benches/bench_table_watermarks.rs index 4a9e1c5edda0b..5153dd0f9fe38 100644 --- a/src/storage/benches/bench_table_watermarks.rs +++ b/src/storage/benches/bench_table_watermarks.rs @@ -166,7 +166,7 @@ fn bench_table_watermarks(c: &mut Criterion) { let mut pinned_version = PinnedVersion::new(versions.pop_front().unwrap(), unbounded_channel().0); while let Some(version) = versions.pop_front() { - pinned_version = pinned_version.new_pin_version(version); + pinned_version = pinned_version.new_pin_version(version).unwrap(); } }, BatchSize::SmallInput, diff --git a/src/storage/compactor/src/lib.rs b/src/storage/compactor/src/lib.rs index 22e70ac759aed..4c503f3d7a8d5 100644 --- a/src/storage/compactor/src/lib.rs +++ b/src/storage/compactor/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] - mod compactor_observer; mod rpc; pub mod server; diff --git a/src/storage/hummock_sdk/src/change_log.rs b/src/storage/hummock_sdk/src/change_log.rs index 433309acab930..c231b0eb6b7b5 100644 --- a/src/storage/hummock_sdk/src/change_log.rs +++ b/src/storage/hummock_sdk/src/change_log.rs @@ -16,32 +16,42 @@ use std::collections::HashMap; use risingwave_common::catalog::TableId; use risingwave_pb::hummock::hummock_version_delta::PbChangeLogDelta; -use risingwave_pb::hummock::{PbEpochNewChangeLog, PbTableChangeLog}; +use risingwave_pb::hummock::{PbEpochNewChangeLog, PbSstableInfo, PbTableChangeLog}; use tracing::warn; use crate::sstable_info::SstableInfo; #[derive(Debug, Clone, PartialEq)] -pub struct TableChangeLog(pub Vec); +pub struct TableChangeLogCommon(pub Vec>); + +pub type TableChangeLog = TableChangeLogCommon; #[derive(Debug, Clone, PartialEq)] -pub struct EpochNewChangeLog { - pub new_value: Vec, - pub old_value: Vec, +pub struct EpochNewChangeLogCommon { + pub new_value: Vec, + pub old_value: Vec, pub epochs: Vec, } -impl From<&EpochNewChangeLog> for PbEpochNewChangeLog { - fn from(val: &EpochNewChangeLog) -> Self { +pub type EpochNewChangeLog = EpochNewChangeLogCommon; + +impl From<&EpochNewChangeLogCommon> for PbEpochNewChangeLog +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(val: &EpochNewChangeLogCommon) -> Self { Self { - new_value: val.new_value.iter().map(|a| a.clone().into()).collect(), - old_value: val.old_value.iter().map(|a| a.clone().into()).collect(), + new_value: val.new_value.iter().map(|a| a.into()).collect(), + old_value: val.old_value.iter().map(|a| a.into()).collect(), epochs: val.epochs.clone(), } } } -impl From<&PbEpochNewChangeLog> for EpochNewChangeLog { +impl From<&PbEpochNewChangeLog> for EpochNewChangeLogCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(value: &PbEpochNewChangeLog) -> Self { Self { new_value: value.new_value.iter().map(|a| a.into()).collect(), @@ -51,30 +61,28 @@ 
impl From<&PbEpochNewChangeLog> for EpochNewChangeLog { } } -impl From for PbEpochNewChangeLog { - fn from(val: EpochNewChangeLog) -> Self { +impl From> for PbEpochNewChangeLog +where + PbSstableInfo: From, +{ + fn from(val: EpochNewChangeLogCommon) -> Self { Self { - new_value: val - .new_value - .into_iter() - .map(|a| a.clone().into()) - .collect(), - old_value: val - .old_value - .into_iter() - .map(|a| a.clone().into()) - .collect(), - epochs: val.epochs.clone(), + new_value: val.new_value.into_iter().map(|a| a.into()).collect(), + old_value: val.old_value.into_iter().map(|a| a.into()).collect(), + epochs: val.epochs, } } } -impl From for EpochNewChangeLog { +impl From for EpochNewChangeLogCommon +where + T: From, +{ fn from(value: PbEpochNewChangeLog) -> Self { Self { new_value: value.new_value.into_iter().map(|a| a.into()).collect(), old_value: value.old_value.into_iter().map(|a| a.into()).collect(), - epochs: value.epochs.clone(), + epochs: value.epochs, } } } @@ -117,15 +125,23 @@ impl TableChangeLog { } } -impl TableChangeLog { +impl TableChangeLogCommon +where + PbSstableInfo: for<'a> From<&'a T>, +{ pub fn to_protobuf(&self) -> PbTableChangeLog { PbTableChangeLog { change_logs: self.0.iter().map(|a| a.into()).collect(), } } +} +impl TableChangeLogCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ pub fn from_protobuf(val: &PbTableChangeLog) -> Self { - Self(val.change_logs.clone().iter().map(|a| a.into()).collect()) + Self(val.change_logs.iter().map(|a| a.into()).collect()) } } @@ -173,13 +189,18 @@ pub fn build_table_change_log_delta<'a>( } #[derive(Debug, PartialEq, Clone)] -pub struct ChangeLogDelta { +pub struct ChangeLogDeltaCommon { pub truncate_epoch: u64, - pub new_log: Option, + pub new_log: Option>, } -impl From<&ChangeLogDelta> for PbChangeLogDelta { - fn from(val: &ChangeLogDelta) -> Self { +pub type ChangeLogDelta = ChangeLogDeltaCommon; + +impl From<&ChangeLogDeltaCommon> for PbChangeLogDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(val: &ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, new_log: val.new_log.as_ref().map(|a| a.into()), @@ -187,7 +208,10 @@ impl From<&ChangeLogDelta> for PbChangeLogDelta { } } -impl From<&PbChangeLogDelta> for ChangeLogDelta { +impl From<&PbChangeLogDelta> for ChangeLogDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(val: &PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, @@ -196,8 +220,11 @@ impl From<&PbChangeLogDelta> for ChangeLogDelta { } } -impl From for PbChangeLogDelta { - fn from(val: ChangeLogDelta) -> Self { +impl From> for PbChangeLogDelta +where + PbSstableInfo: From, +{ + fn from(val: ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, new_log: val.new_log.map(|a| a.into()), @@ -205,7 +232,10 @@ impl From for PbChangeLogDelta { } } -impl From for ChangeLogDelta { +impl From for ChangeLogDeltaCommon +where + T: From, +{ fn from(val: PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, @@ -218,11 +248,12 @@ impl From for ChangeLogDelta { mod tests { use itertools::Itertools; - use crate::change_log::{EpochNewChangeLog, TableChangeLog}; + use crate::change_log::{EpochNewChangeLog, TableChangeLogCommon}; + use crate::sstable_info::SstableInfo; #[test] fn test_filter_epoch() { - let table_change_log = TableChangeLog(vec![ + let table_change_log = TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], @@ -262,7 +293,7 @@ mod tests { #[test] fn test_truncate() { - let 
mut table_change_log = TableChangeLog(vec![ + let mut table_change_log = TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], @@ -288,7 +319,7 @@ mod tests { table_change_log.truncate(1); assert_eq!( table_change_log, - TableChangeLog(vec![ + TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], @@ -310,7 +341,7 @@ mod tests { table_change_log.truncate(3); assert_eq!( table_change_log, - TableChangeLog(vec![ + TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index ca6585f46fd51..376626e844242 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -22,13 +22,14 @@ use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::hash::VnodeBitmapExt; use risingwave_pb::hummock::{ - CompactionConfig, CompatibilityVersion, GroupConstruct, GroupDestroy, GroupMetaChange, + CompactionConfig, CompatibilityVersion, GroupConstruct, GroupMerge, GroupMetaChange, GroupTableChange, PbLevelType, }; use tracing::warn; -use super::StateTableId; -use crate::change_log::TableChangeLog; +use super::group_split::get_sub_level_insert_hint; +use super::{group_split, StateTableId}; +use crate::change_log::TableChangeLogCommon; use crate::compaction_group::StaticCompactionGroupId; use crate::key_range::KeyRangeCommon; use crate::level::{Level, Levels, OverlappingLevel}; @@ -47,13 +48,17 @@ pub struct GroupDeltasSummary { pub insert_sub_level_id: u64, pub insert_table_infos: Vec, pub group_construct: Option, - pub group_destroy: Option, + pub group_destroy: Option, pub group_meta_changes: Vec, pub group_table_change: Option, pub new_vnode_partition_count: u32, + pub group_merge: Option, } -pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary { +pub fn summarize_group_deltas( + group_deltas: &GroupDeltas, + compaction_group_id: CompactionGroupId, +) -> GroupDeltasSummary { let mut delete_sst_levels = Vec::with_capacity(group_deltas.group_deltas.len()); let mut delete_sst_ids_set = HashSet::new(); let mut insert_sst_level_id = u32::MAX; @@ -64,6 +69,7 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary let mut group_meta_changes = vec![]; let mut group_table_change = None; let mut new_vnode_partition_count = 0; + let mut group_merge = None; for group_delta in &group_deltas.group_deltas { match group_delta { @@ -83,9 +89,9 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary assert!(group_construct.is_none()); group_construct = Some(construct_delta.clone()); } - GroupDelta::GroupDestroy(destroy_delta) => { + GroupDelta::GroupDestroy(_) => { assert!(group_destroy.is_none()); - group_destroy = Some(*destroy_delta); + group_destroy = Some(compaction_group_id); } GroupDelta::GroupMetaChange(meta_delta) => { group_meta_changes.push(meta_delta.clone()); @@ -93,6 +99,11 @@ pub fn summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary GroupDelta::GroupTableChange(meta_delta) => { group_table_change = Some(meta_delta.clone()); } + GroupDelta::GroupMerge(merge_delta) => { + assert!(group_merge.is_none()); + group_merge = Some(*merge_delta); + group_destroy = Some(merge_delta.right_group_id); + } } } @@ -110,6 +121,7 @@ pub fn 
summarize_group_deltas(group_deltas: &GroupDeltas) -> GroupDeltasSummary group_meta_changes, group_table_change, new_vnode_partition_count, + group_merge, } } @@ -173,6 +185,25 @@ impl HummockVersion { })) } + // only scan the sst infos from levels in the specified compaction group (without table change log) + pub fn get_sst_ids_by_group_id( + &self, + compaction_group_id: CompactionGroupId, + ) -> impl Iterator + '_ { + self.levels + .iter() + .filter_map(move |(cg_id, level)| { + if *cg_id == compaction_group_id { + Some(level) + } else { + None + } + }) + .flat_map(|level| level.l0.sub_levels.iter().rev().chain(level.levels.iter())) + .flat_map(|level| level.table_infos.iter()) + .map(|s| s.sst_id) + } + /// `get_sst_infos_from_groups` doesn't guarantee that all returned sst info belongs to `select_group`. /// i.e. `select_group` is just a hint. /// We separate `get_sst_infos_from_groups` and `get_sst_infos` because `get_sst_infos_from_groups` may be further customized in the future. @@ -354,7 +385,7 @@ impl HummockVersion { &mut self, parent_group_id: CompactionGroupId, group_id: CompactionGroupId, - member_table_ids: HashSet, + member_table_ids: BTreeSet, new_sst_start_id: u64, ) { let mut new_sst_id = new_sst_start_id; @@ -386,23 +417,6 @@ impl HummockVersion { { for sub_level in &mut l0.sub_levels { let target_l0 = &mut cur_levels.l0; - // When `insert_hint` is `Ok(idx)`, it means that the sub level `idx` in `target_l0` - // will extend these SSTs. When `insert_hint` is `Err(idx)`, it - // means that we will add a new sub level `idx` into `target_l0`. - let mut insert_hint = Err(target_l0.sub_levels.len()); - for (idx, other) in target_l0.sub_levels.iter_mut().enumerate() { - match other.sub_level_id.cmp(&sub_level.sub_level_id) { - Ordering::Less => {} - Ordering::Equal => { - insert_hint = Ok(idx); - break; - } - Ordering::Greater => { - insert_hint = Err(idx); - break; - } - } - } // Remove SST from sub level may result in empty sub level. It will be purged // whenever another compaction task is finished. 
let insert_table_infos = @@ -419,7 +433,7 @@ impl HummockVersion { if insert_table_infos.is_empty() { continue; } - match insert_hint { + match get_sub_level_insert_hint(&target_l0.sub_levels, sub_level) { Ok(idx) => { add_ssts_to_sub_level(target_l0, idx, insert_table_infos); } @@ -570,7 +584,7 @@ impl HummockVersion { // apply to `levels`, which is different compaction groups for (compaction_group_id, group_deltas) in &version_delta.group_deltas { - let summary = summarize_group_deltas(group_deltas); + let summary = summarize_group_deltas(group_deltas, *compaction_group_id); if let Some(group_construct) = &summary.group_construct { let mut new_levels = build_initial_compaction_group_levels( *compaction_group_id, @@ -594,7 +608,7 @@ impl HummockVersion { } else { #[expect(deprecated)] // for backward-compatibility of previous hummock version delta - HashSet::from_iter(group_construct.table_ids.clone()) + BTreeSet::from_iter(group_construct.table_ids.clone()) }; self.init_with_parent_group( @@ -614,7 +628,7 @@ impl HummockVersion { self.init_with_parent_group( group_change.origin_group_id, group_change.target_group_id, - HashSet::from_iter(group_change.table_ids.clone()), + BTreeSet::from_iter(group_change.table_ids.clone()), group_change.new_sst_start_id, ); @@ -635,14 +649,19 @@ impl HummockVersion { .expect("compaction group should exist") .member_table_ids .append(&mut moving_tables); + } else if let Some(group_merge) = &summary.group_merge { + tracing::info!( + "group_merge left {:?} right {:?}", + group_merge.left_group_id, + group_merge.right_group_id + ); + self.merge_compaction_group(group_merge.left_group_id, group_merge.right_group_id) } - let has_destroy = summary.group_destroy.is_some(); let visible_table_committed_epoch = self.visible_table_committed_epoch(); - let levels = self - .levels - .get_mut(compaction_group_id) - .expect("compaction group should exist"); - + let group_destroy = summary.group_destroy; + let levels = self.levels.get_mut(compaction_group_id).unwrap_or_else(|| { + panic!("compaction group {} does not exist", compaction_group_id) + }); #[expect(deprecated)] // for backward-compatibility of previous hummock version delta for group_meta_delta in &summary.group_meta_changes { levels @@ -669,7 +688,8 @@ impl HummockVersion { } = summary; assert!( - delete_sst_levels.is_empty() && delete_sst_ids_set.is_empty() || has_destroy, + delete_sst_levels.is_empty() && delete_sst_ids_set.is_empty() + || group_destroy.is_some(), "no sst should be deleted when committing an epoch" ); for group_delta in &group_deltas.group_deltas { @@ -703,8 +723,8 @@ impl HummockVersion { .compaction_group_member_table_ids(*compaction_group_id), ); } - if has_destroy { - self.levels.remove(compaction_group_id); + if let Some(destroy_group_id) = &group_destroy { + self.levels.remove(destroy_group_id); } } self.id = version_delta.id; @@ -775,7 +795,7 @@ impl HummockVersion { change_log.0.push(new_change_log.clone()); } Entry::Vacant(entry) => { - entry.insert(TableChangeLog(vec![new_change_log.clone()])); + entry.insert(TableChangeLogCommon(vec![new_change_log.clone()])); } }; } @@ -835,6 +855,45 @@ impl HummockVersion { } ret } + + pub fn merge_compaction_group( + &mut self, + left_group_id: CompactionGroupId, + right_group_id: CompactionGroupId, + ) { + // Double check + let left_group_id_table_ids = self + .state_table_info + .compaction_group_member_table_ids(left_group_id) + .iter() + .map(|table_id| table_id.table_id); + let right_group_id_table_ids = self + .state_table_info + 
.compaction_group_member_table_ids(right_group_id) + .iter() + .map(|table_id| table_id.table_id); + + assert!(left_group_id_table_ids + .chain(right_group_id_table_ids) + .is_sorted()); + + let total_cg = self.levels.keys().cloned().collect::>(); + let right_levels = self.levels.remove(&right_group_id).unwrap_or_else(|| { + panic!( + "compaction group should exist right {} all {:?}", + right_group_id, total_cg + ) + }); + + let left_levels = self.levels.get_mut(&left_group_id).unwrap_or_else(|| { + panic!( + "compaction group should exist left {} all {:?}", + left_group_id, total_cg + ) + }); + + group_split::merge_levels(left_levels, right_levels); + } } #[easy_ext::ext(HummockLevelsExt)] @@ -998,7 +1057,7 @@ pub fn build_initial_compaction_group_levels( } fn split_sst_info_for_level( - member_table_ids: &HashSet, + member_table_ids: &BTreeSet, level: &mut Level, new_sst_id: &mut u64, ) -> Vec { @@ -1228,6 +1287,14 @@ pub fn object_size_map(version: &HummockVersion) -> HashMap, + new_table_ids: Vec, ) -> SstableInfo { let mut branch_table_info = sst_info.clone(); branch_table_info.sst_id = *new_sst_id; branch_table_info.sst_size = new_sst_size; + *new_sst_id += 1; - sst_info.sst_id = *new_sst_id + 1; + sst_info.sst_id = *new_sst_id; sst_info.sst_size = old_sst_size; + *new_sst_id += 1; { // related github.com/risingwavelabs/risingwave/pull/17898/ // This is a temporary implementation that will update `table_ids`` based on the new split rule after PR 17898 - - let set1: HashSet<_> = sst_info.table_ids.iter().cloned().collect(); - let set2: HashSet<_> = new_sst_table_ids.iter().cloned().collect(); + // sst_info.table_ids = vec[1, 2, 3]; + // new_table_ids = vec[2, 3, 4]; + // branch_table_info.table_ids = vec[1, 2, 3] ∩ vec[2, 3, 4] = vec[2, 3] + let set1: BTreeSet<_> = sst_info.table_ids.iter().cloned().collect(); + let set2: BTreeSet<_> = new_table_ids.into_iter().collect(); let intersection: Vec<_> = set1.intersection(&set2).cloned().collect(); // Update table_ids @@ -1362,8 +1433,6 @@ pub fn split_sst( .retain(|table_id| !branch_table_info.table_ids.contains(table_id)); } - *new_sst_id += 1; - branch_table_info } @@ -1371,9 +1440,15 @@ pub fn split_sst( mod tests { use std::collections::HashMap; + use bytes::Bytes; + use risingwave_common::catalog::TableId; + use risingwave_common::hash::VirtualNode; use risingwave_pb::hummock::{CompactionConfig, GroupConstruct, GroupDestroy, LevelType}; + use crate::compaction_group::group_split; use crate::compaction_group::hummock_version_ext::build_initial_compaction_group_levels; + use crate::key::{gen_key_from_str, FullKey}; + use crate::key_range::KeyRange; use crate::level::{Level, Levels, OverlappingLevel}; use crate::sstable_info::SstableInfo; use crate::version::{ @@ -1383,20 +1458,22 @@ mod tests { #[test] fn test_get_sst_object_ids() { - let mut version = HummockVersion::default(); - version.id = HummockVersionId::new(0); - version.levels = HashMap::from_iter([( - 0, - Levels { - levels: vec![], - l0: OverlappingLevel { - sub_levels: vec![], - total_file_size: 0, - uncompressed_file_size: 0, + let mut version = HummockVersion { + id: HummockVersionId::new(0), + levels: HashMap::from_iter([( + 0, + Levels { + levels: vec![], + l0: OverlappingLevel { + sub_levels: vec![], + total_file_size: 0, + uncompressed_file_size: 0, + }, + ..Default::default() }, - ..Default::default() - }, - )]); + )]), + ..Default::default() + }; assert_eq!(version.get_object_ids().len(), 0); // Add to sub level @@ -1430,68 +1507,72 @@ mod tests { #[test] fn 
test_apply_version_delta() { - let mut version = HummockVersion::default(); - version.id = HummockVersionId::new(0); - version.levels = HashMap::from_iter([ - ( - 0, - build_initial_compaction_group_levels( + let mut version = HummockVersion { + id: HummockVersionId::new(0), + levels: HashMap::from_iter([ + ( 0, - &CompactionConfig { - max_level: 6, - ..Default::default() - }, + build_initial_compaction_group_levels( + 0, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ), ), - ), - ( - 1, - build_initial_compaction_group_levels( + ( 1, - &CompactionConfig { - max_level: 6, - ..Default::default() - }, - ), - ), - ]); - let mut version_delta = HummockVersionDelta::default(); - version_delta.id = HummockVersionId::new(1); - version_delta.group_deltas = HashMap::from_iter([ - ( - 2, - GroupDeltas { - group_deltas: vec![GroupDelta::GroupConstruct(GroupConstruct { - group_config: Some(CompactionConfig { + build_initial_compaction_group_levels( + 1, + &CompactionConfig { max_level: 6, ..Default::default() - }), - ..Default::default() - })], - }, - ), - ( - 0, - GroupDeltas { - group_deltas: vec![GroupDelta::GroupDestroy(GroupDestroy {})], - }, - ), - ( - 1, - GroupDeltas { - group_deltas: vec![GroupDelta::IntraLevel(IntraLevelDelta::new( - 1, - 0, - vec![], - vec![SstableInfo { - object_id: 1, - sst_id: 1, + }, + ), + ), + ]), + ..Default::default() + }; + let version_delta = HummockVersionDelta { + id: HummockVersionId::new(1), + group_deltas: HashMap::from_iter([ + ( + 2, + GroupDeltas { + group_deltas: vec![GroupDelta::GroupConstruct(GroupConstruct { + group_config: Some(CompactionConfig { + max_level: 6, + ..Default::default() + }), ..Default::default() - }], - 0, - ))], - }, - ), - ]); + })], + }, + ), + ( + 0, + GroupDeltas { + group_deltas: vec![GroupDelta::GroupDestroy(GroupDestroy {})], + }, + ), + ( + 1, + GroupDeltas { + group_deltas: vec![GroupDelta::IntraLevel(IntraLevelDelta::new( + 1, + 0, + vec![], + vec![SstableInfo { + object_id: 1, + sst_id: 1, + ..Default::default() + }], + 0, + ))], + }, + ), + ]), + ..Default::default() + }; let version_delta = version_delta; version.apply_version_delta(&version_delta); @@ -1512,23 +1593,425 @@ mod tests { }], ..Default::default() }; - assert_eq!(version, { - let mut version = HummockVersion::default(); - version.id = HummockVersionId::new(1); - version.levels = HashMap::from_iter([ - ( - 2, - build_initial_compaction_group_levels( + assert_eq!( + version, + HummockVersion { + id: HummockVersionId::new(1), + levels: HashMap::from_iter([ + ( 2, - &CompactionConfig { - max_level: 6, - ..Default::default() - }, + build_initial_compaction_group_levels( + 2, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ), ), + (1, cg1), + ]), + ..Default::default() + } + ); + } + + fn gen_sst_info(object_id: u64, table_ids: Vec, left: Bytes, right: Bytes) -> SstableInfo { + SstableInfo { + object_id, + sst_id: object_id, + key_range: KeyRange { + left, + right, + right_exclusive: false, + }, + table_ids, + file_size: 100, + sst_size: 100, + uncompressed_file_size: 100, + ..Default::default() + } + } + + #[test] + fn test_merge_levels() { + let mut left_levels = build_initial_compaction_group_levels( + 1, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + + let mut right_levels = build_initial_compaction_group_levels( + 2, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + + left_levels.levels[0] = Level { + level_idx: 1, + level_type: LevelType::Nonoverlapping, + table_infos: 
vec![ + gen_sst_info( + 1, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 10, + vec![3, 4], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(201), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(4), + gen_key_from_str(VirtualNode::from_index(10), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 11, + vec![4], + FullKey::for_test( + TableId::new(4), + gen_key_from_str(VirtualNode::from_index(11), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(4), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + ], + total_file_size: 300, + ..Default::default() + }; + + left_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 101, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + left_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 103, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + left_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![3], + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(3), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 105, + level_type: LevelType::Nonoverlapping, + total_file_size: 100, + ..Default::default() + }); + + right_levels.levels[0] = Level { + level_idx: 1, + level_type: LevelType::Nonoverlapping, + table_infos: vec![ + gen_sst_info( + 1, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + gen_sst_info( + 10, + vec![5, 6], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(201), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(6), + gen_key_from_str(VirtualNode::from_index(10), "1"), + 0, + ) + .encode() + .into(), ), - (1, cg1), - ]); - version + gen_sst_info( + 11, + vec![6], + FullKey::for_test( + TableId::new(6), + gen_key_from_str(VirtualNode::from_index(11), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(6), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + ), + ], + total_file_size: 300, + ..Default::default() + }; + + right_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + 
vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 101, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() }); + + right_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 5, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 102, + level_type: LevelType::Overlapping, + total_file_size: 100, + ..Default::default() + }); + + right_levels.l0.sub_levels.push(Level { + level_idx: 0, + table_infos: vec![gen_sst_info( + 3, + vec![5], + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(1), "1"), + 0, + ) + .encode() + .into(), + FullKey::for_test( + TableId::new(5), + gen_key_from_str(VirtualNode::from_index(200), "1"), + 0, + ) + .encode() + .into(), + )], + sub_level_id: 103, + level_type: LevelType::Nonoverlapping, + total_file_size: 100, + ..Default::default() + }); + + { + // test empty + let mut left_levels = Levels::default(); + let right_levels = Levels::default(); + + group_split::merge_levels(&mut left_levels, right_levels); + } + + { + // test empty left + let mut left_levels = build_initial_compaction_group_levels( + 1, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + let right_levels = right_levels.clone(); + + group_split::merge_levels(&mut left_levels, right_levels); + + assert!(left_levels.l0.sub_levels.len() == 3); + assert!(left_levels.l0.sub_levels[0].sub_level_id == 101); + assert_eq!(100, left_levels.l0.sub_levels[0].total_file_size); + assert!(left_levels.l0.sub_levels[1].sub_level_id == 102); + assert_eq!(100, left_levels.l0.sub_levels[1].total_file_size); + assert!(left_levels.l0.sub_levels[2].sub_level_id == 103); + assert_eq!(100, left_levels.l0.sub_levels[2].total_file_size); + + assert!(left_levels.levels[0].level_idx == 1); + assert_eq!(300, left_levels.levels[0].total_file_size); + } + + { + // test empty right + let mut left_levels = left_levels.clone(); + let right_levels = build_initial_compaction_group_levels( + 2, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ); + + group_split::merge_levels(&mut left_levels, right_levels); + + assert!(left_levels.l0.sub_levels.len() == 3); + assert!(left_levels.l0.sub_levels[0].sub_level_id == 101); + assert_eq!(100, left_levels.l0.sub_levels[0].total_file_size); + assert!(left_levels.l0.sub_levels[1].sub_level_id == 103); + assert_eq!(100, left_levels.l0.sub_levels[1].total_file_size); + assert!(left_levels.l0.sub_levels[2].sub_level_id == 105); + assert_eq!(100, left_levels.l0.sub_levels[2].total_file_size); + + assert!(left_levels.levels[0].level_idx == 1); + assert_eq!(300, left_levels.levels[0].total_file_size); + } + + { + let mut left_levels = left_levels.clone(); + let right_levels = right_levels.clone(); + + group_split::merge_levels(&mut left_levels, right_levels); + + assert!(left_levels.l0.sub_levels.len() == 6); + assert!(left_levels.l0.sub_levels[0].sub_level_id == 101); + assert_eq!(100, left_levels.l0.sub_levels[0].total_file_size); + assert!(left_levels.l0.sub_levels[1].sub_level_id == 103); + assert_eq!(100, 
left_levels.l0.sub_levels[1].total_file_size); + assert!(left_levels.l0.sub_levels[2].sub_level_id == 105); + assert_eq!(100, left_levels.l0.sub_levels[2].total_file_size); + assert!(left_levels.l0.sub_levels[3].sub_level_id == 106); + assert_eq!(100, left_levels.l0.sub_levels[3].total_file_size); + assert!(left_levels.l0.sub_levels[4].sub_level_id == 107); + assert_eq!(100, left_levels.l0.sub_levels[4].total_file_size); + assert!(left_levels.l0.sub_levels[5].sub_level_id == 108); + assert_eq!(100, left_levels.l0.sub_levels[5].total_file_size); + + assert!(left_levels.levels[0].level_idx == 1); + assert_eq!(600, left_levels.levels[0].total_file_size); + } } } diff --git a/src/storage/hummock_sdk/src/compaction_group/mod.rs b/src/storage/hummock_sdk/src/compaction_group/mod.rs index 973cc3e3c6140..94ef89b8046e2 100644 --- a/src/storage/hummock_sdk/src/compaction_group/mod.rs +++ b/src/storage/hummock_sdk/src/compaction_group/mod.rs @@ -43,3 +43,115 @@ impl From for CompactionGroupId { cg as CompactionGroupId } } + +pub mod group_split { + use std::cmp::Ordering; + + use super::hummock_version_ext::insert_new_sub_level; + use crate::can_concat; + use crate::level::{Level, Levels}; + + pub fn merge_levels(left_levels: &mut Levels, right_levels: Levels) { + let right_l0 = right_levels.l0; + + let mut max_left_sub_level_id = left_levels + .l0 + .sub_levels + .iter() + .map(|sub_level| sub_level.sub_level_id + 1) + .max() + .unwrap_or(0); // If there are no sub levels, the max sub level id is 0. + let need_rewrite_right_sub_level_id = max_left_sub_level_id != 0; + + for mut right_sub_level in right_l0.sub_levels { + // Rewrtie the sub level id of right sub level to avoid conflict with left sub levels. (conflict level type) + // e.g. left sub levels: [0, 1, 2], right sub levels: [0, 1, 2], after rewrite, right sub levels: [3, 4, 5] + if need_rewrite_right_sub_level_id { + right_sub_level.sub_level_id = max_left_sub_level_id; + max_left_sub_level_id += 1; + } + + insert_new_sub_level( + &mut left_levels.l0, + right_sub_level.sub_level_id, + right_sub_level.level_type, + right_sub_level.table_infos, + None, + ); + } + + assert!( + left_levels + .l0 + .sub_levels + .is_sorted_by_key(|sub_level| sub_level.sub_level_id), + "{}", + format!("left_levels.l0.sub_levels: {:?}", left_levels.l0.sub_levels) + ); + + // Reinitialise `vnode_partition_count` to avoid misaligned hierarchies + // caused by the merge of different compaction groups.(picker might reject the different `vnode_partition_count` sub_level to compact) + left_levels + .l0 + .sub_levels + .iter_mut() + .for_each(|sub_level| sub_level.vnode_partition_count = 0); + + for (idx, level) in right_levels.levels.into_iter().enumerate() { + if level.table_infos.is_empty() { + continue; + } + + let insert_table_infos = level.table_infos; + left_levels.levels[idx].total_file_size += insert_table_infos + .iter() + .map(|sst| sst.sst_size) + .sum::(); + left_levels.levels[idx].uncompressed_file_size += insert_table_infos + .iter() + .map(|sst| sst.uncompressed_file_size) + .sum::(); + + left_levels.levels[idx] + .table_infos + .extend(insert_table_infos); + left_levels.levels[idx] + .table_infos + .sort_by(|sst1, sst2| sst1.key_range.cmp(&sst2.key_range)); + assert!( + can_concat(&left_levels.levels[idx].table_infos), + "{}", + format!( + "left-group {} right-group {} left_levels.levels[{}].table_infos: {:?} level_idx {:?}", + left_levels.group_id, + right_levels.group_id, + idx, + left_levels.levels[idx].table_infos, + left_levels.levels[idx].level_idx 
+ ) + ); + } + } + + // When `insert_hint` is `Ok(idx)`, it means that the sub level `idx` in `target_l0` + // will extend these SSTs. When `insert_hint` is `Err(idx)`, it + // means that we will add a new sub level `idx` into `target_l0`. + pub fn get_sub_level_insert_hint( + target_levels: &Vec, + sub_level: &Level, + ) -> Result { + for (idx, other) in target_levels.iter().enumerate() { + match other.sub_level_id.cmp(&sub_level.sub_level_id) { + Ordering::Less => {} + Ordering::Equal => { + return Ok(idx); + } + Ordering::Greater => { + return Err(idx); + } + } + } + + Err(target_levels.len()) + } +} diff --git a/src/storage/hummock_sdk/src/level.rs b/src/storage/hummock_sdk/src/level.rs index c7db09e69e76d..762b5abd25ac9 100644 --- a/src/storage/hummock_sdk/src/level.rs +++ b/src/storage/hummock_sdk/src/level.rs @@ -23,19 +23,24 @@ use risingwave_pb::hummock::{ use crate::sstable_info::SstableInfo; #[derive(Debug, Clone, PartialEq, Default)] -pub struct OverlappingLevel { - pub sub_levels: Vec, +pub struct OverlappingLevelCommon { + pub sub_levels: Vec>, pub total_file_size: u64, pub uncompressed_file_size: u64, } -impl From<&PbOverlappingLevel> for OverlappingLevel { +pub type OverlappingLevel = OverlappingLevelCommon; + +impl From<&PbOverlappingLevel> for OverlappingLevelCommon +where + for<'a> LevelCommon: From<&'a PbLevel>, +{ fn from(pb_overlapping_level: &PbOverlappingLevel) -> Self { Self { sub_levels: pb_overlapping_level .sub_levels .iter() - .map(Level::from) + .map(LevelCommon::from) .collect_vec(), total_file_size: pb_overlapping_level.total_file_size, uncompressed_file_size: pb_overlapping_level.uncompressed_file_size, @@ -43,13 +48,16 @@ impl From<&PbOverlappingLevel> for OverlappingLevel { } } -impl From<&OverlappingLevel> for PbOverlappingLevel { - fn from(overlapping_level: &OverlappingLevel) -> Self { +impl From<&OverlappingLevelCommon> for PbOverlappingLevel +where + for<'a> &'a LevelCommon: Into, +{ + fn from(overlapping_level: &OverlappingLevelCommon) -> Self { Self { sub_levels: overlapping_level .sub_levels .iter() - .map(|pb_level| pb_level.into()) + .map(|level| level.into()) .collect_vec(), total_file_size: overlapping_level.total_file_size, uncompressed_file_size: overlapping_level.uncompressed_file_size, @@ -57,8 +65,11 @@ impl From<&OverlappingLevel> for PbOverlappingLevel { } } -impl From for PbOverlappingLevel { - fn from(overlapping_level: OverlappingLevel) -> Self { +impl From> for PbOverlappingLevel +where + LevelCommon: Into, +{ + fn from(overlapping_level: OverlappingLevelCommon) -> Self { Self { sub_levels: overlapping_level .sub_levels @@ -71,13 +82,16 @@ impl From for PbOverlappingLevel { } } -impl From for OverlappingLevel { +impl From for OverlappingLevelCommon +where + LevelCommon: From, +{ fn from(pb_overlapping_level: PbOverlappingLevel) -> Self { Self { sub_levels: pb_overlapping_level .sub_levels .into_iter() - .map(Level::from) + .map(LevelCommon::from) .collect_vec(), total_file_size: pb_overlapping_level.total_file_size, uncompressed_file_size: pb_overlapping_level.uncompressed_file_size, @@ -97,26 +111,27 @@ impl OverlappingLevel { } #[derive(Debug, Clone, PartialEq, Default)] -pub struct Level { +pub struct LevelCommon { pub level_idx: u32, pub level_type: PbLevelType, - pub table_infos: Vec, + pub table_infos: Vec, pub total_file_size: u64, pub sub_level_id: u64, pub uncompressed_file_size: u64, pub vnode_partition_count: u32, } -impl From<&PbLevel> for Level { +pub type Level = LevelCommon; + +impl From<&PbLevel> for LevelCommon 
+where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_level: &PbLevel) -> Self { Self { level_idx: pb_level.level_idx, level_type: PbLevelType::try_from(pb_level.level_type).unwrap(), - table_infos: pb_level - .table_infos - .iter() - .map(SstableInfo::from) - .collect_vec(), + table_infos: pb_level.table_infos.iter().map(Into::into).collect_vec(), total_file_size: pb_level.total_file_size, sub_level_id: pb_level.sub_level_id, uncompressed_file_size: pb_level.uncompressed_file_size, @@ -125,16 +140,15 @@ impl From<&PbLevel> for Level { } } -impl From<&Level> for PbLevel { - fn from(level: &Level) -> Self { +impl From<&LevelCommon> for PbLevel +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(level: &LevelCommon) -> Self { Self { level_idx: level.level_idx, level_type: level.level_type.into(), - table_infos: level - .table_infos - .iter() - .map(PbSstableInfo::from) - .collect_vec(), + table_infos: level.table_infos.iter().map(Into::into).collect_vec(), total_file_size: level.total_file_size, sub_level_id: level.sub_level_id, uncompressed_file_size: level.uncompressed_file_size, @@ -143,16 +157,15 @@ impl From<&Level> for PbLevel { } } -impl From for PbLevel { - fn from(level: Level) -> Self { +impl From> for PbLevel +where + PbSstableInfo: From, +{ + fn from(level: LevelCommon) -> Self { Self { level_idx: level.level_idx, level_type: level.level_type.into(), - table_infos: level - .table_infos - .into_iter() - .map(PbSstableInfo::from) - .collect_vec(), + table_infos: level.table_infos.into_iter().map(Into::into).collect_vec(), total_file_size: level.total_file_size, sub_level_id: level.sub_level_id, uncompressed_file_size: level.uncompressed_file_size, @@ -161,7 +174,10 @@ impl From for PbLevel { } } -impl From for Level { +impl From for LevelCommon +where + T: From, +{ fn from(pb_level: PbLevel) -> Self { Self { level_idx: pb_level.level_idx, @@ -169,7 +185,7 @@ impl From for Level { table_infos: pb_level .table_infos .into_iter() - .map(SstableInfo::from) + .map(Into::into) .collect_vec(), total_file_size: pb_level.total_file_size, sub_level_id: pb_level.sub_level_id, @@ -196,9 +212,9 @@ impl Level { } #[derive(Debug, Clone, PartialEq, Default)] -pub struct Levels { - pub levels: Vec, - pub l0: OverlappingLevel, +pub struct LevelsCommon { + pub levels: Vec>, + pub l0: OverlappingLevelCommon, pub group_id: u64, pub parent_group_id: u64, @@ -206,6 +222,8 @@ pub struct Levels { pub member_table_ids: Vec, } +pub type Levels = LevelsCommon; + impl Levels { pub fn level0(&self) -> &OverlappingLevel { &self.l0 @@ -236,15 +254,25 @@ impl Levels { } } -impl Levels { - pub fn from_protobuf(pb_levels: &PbLevels) -> Self { - Self::from(pb_levels) - } - +impl LevelsCommon +where + PbLevels: for<'a> From<&'a LevelsCommon>, +{ pub fn to_protobuf(&self) -> PbLevels { self.into() } +} + +impl LevelsCommon +where + LevelsCommon: for<'a> From<&'a PbLevels>, +{ + pub fn from_protobuf(pb_levels: &PbLevels) -> LevelsCommon { + LevelsCommon::::from(pb_levels) + } +} +impl Levels { pub fn estimated_encode_len(&self) -> usize { let mut basic = self .levels @@ -260,12 +288,15 @@ impl Levels { } } -impl From<&PbLevels> for Levels { +impl From<&PbLevels> for LevelsCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ #[expect(deprecated)] fn from(pb_levels: &PbLevels) -> Self { Self { - l0: OverlappingLevel::from(pb_levels.l0.as_ref().unwrap()), - levels: pb_levels.levels.iter().map(Level::from).collect_vec(), + l0: OverlappingLevelCommon::from(pb_levels.l0.as_ref().unwrap()), + levels: 
pb_levels.levels.iter().map(Into::into).collect_vec(), group_id: pb_levels.group_id, parent_group_id: pb_levels.parent_group_id, member_table_ids: pb_levels.member_table_ids.clone(), @@ -273,9 +304,12 @@ impl From<&PbLevels> for Levels { } } -impl From<&Levels> for PbLevels { +impl From<&LevelsCommon> for PbLevels +where + PbSstableInfo: for<'a> From<&'a T>, +{ #[expect(deprecated)] - fn from(levels: &Levels) -> Self { + fn from(levels: &LevelsCommon) -> Self { Self { l0: Some((&levels.l0).into()), levels: levels.levels.iter().map(PbLevel::from).collect_vec(), @@ -286,28 +320,38 @@ impl From<&Levels> for PbLevels { } } -impl From for Levels { +impl From for LevelsCommon +where + T: From, +{ #[expect(deprecated)] fn from(pb_levels: PbLevels) -> Self { Self { - l0: OverlappingLevel::from(pb_levels.l0.as_ref().unwrap()), - levels: pb_levels.levels.into_iter().map(Level::from).collect_vec(), + l0: OverlappingLevelCommon::from(pb_levels.l0.unwrap()), + levels: pb_levels + .levels + .into_iter() + .map(LevelCommon::from) + .collect_vec(), group_id: pb_levels.group_id, parent_group_id: pb_levels.parent_group_id, - member_table_ids: pb_levels.member_table_ids.clone(), + member_table_ids: pb_levels.member_table_ids, } } } -impl From for PbLevels { - fn from(levels: Levels) -> Self { +impl From> for PbLevels +where + PbSstableInfo: From, +{ + fn from(levels: LevelsCommon) -> Self { #[expect(deprecated)] Self { l0: Some(levels.l0.into()), levels: levels.levels.into_iter().map(PbLevel::from).collect_vec(), group_id: levels.group_id, parent_group_id: levels.parent_group_id, - member_table_ids: levels.member_table_ids.clone(), + member_table_ids: levels.member_table_ids, } } } diff --git a/src/storage/hummock_sdk/src/lib.rs b/src/storage/hummock_sdk/src/lib.rs index 9e6962ab117aa..921ab18fcf7cd 100644 --- a/src/storage/hummock_sdk/src/lib.rs +++ b/src/storage/hummock_sdk/src/lib.rs @@ -15,7 +15,6 @@ #![feature(async_closure)] #![feature(extract_if)] #![feature(hash_extract_if)] -#![feature(lint_reasons)] #![feature(map_many_mut)] #![feature(type_alias_impl_trait)] #![feature(impl_trait_in_assoc_type)] @@ -130,6 +129,7 @@ pub const FIRST_VERSION_ID: HummockVersionId = HummockVersionId(1); pub const SPLIT_TABLE_COMPACTION_GROUP_ID_HEAD: u64 = 1u64 << 56; pub const SINGLE_TABLE_COMPACTION_GROUP_ID_HEAD: u64 = 2u64 << 56; pub const OBJECT_SUFFIX: &str = "data"; +pub const HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH: usize = 20; #[macro_export] /// This is wrapper for `info` log. 
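The `LevelCommon<T>`, `OverlappingLevelCommon<T>` and `LevelsCommon<T>` refactor in `level.rs` above parameterizes the in-memory level hierarchy by its SST payload, so the same tree can carry either full `SstableInfo`s or a stripped, id-only payload, with the protobuf conversions gated by higher-ranked `From` bounds. Below is a minimal sketch of that pattern under simplified, assumed stand-in types (`PbSst`, `FullSst`, `IdOnlySst` and their two-field layouts are not the real protobuf or sdk definitions):

```rust
// Stand-in for the generated protobuf struct.
#[derive(Default)]
struct PbSst {
    sst_id: u64,
    file_size: u64,
}

// Full in-memory payload vs. a stripped, id-only payload.
struct FullSst {
    sst_id: u64,
    file_size: u64,
}
struct IdOnlySst(u64);

impl From<&PbSst> for FullSst {
    fn from(pb: &PbSst) -> Self {
        Self { sst_id: pb.sst_id, file_size: pb.file_size }
    }
}
impl From<&PbSst> for IdOnlySst {
    fn from(pb: &PbSst) -> Self {
        Self(pb.sst_id)
    }
}
impl From<&FullSst> for PbSst {
    fn from(sst: &FullSst) -> Self {
        Self { sst_id: sst.sst_id, file_size: sst.file_size }
    }
}
impl From<&IdOnlySst> for PbSst {
    fn from(sst: &IdOnlySst) -> Self {
        // Everything but the id stays at its protobuf default.
        Self { sst_id: sst.0, ..Default::default() }
    }
}

// One generic container, two concrete instantiations, mirroring
// `LevelCommon<SstableInfo>` vs. `LevelCommon<SstableIdInVersion>`.
struct LevelCommon<T> {
    table_infos: Vec<T>,
}

impl<T> LevelCommon<T>
where
    T: for<'a> From<&'a PbSst>,
    PbSst: for<'a> From<&'a T>,
{
    fn from_pb(pb: &[PbSst]) -> Self {
        Self { table_infos: pb.iter().map(Into::into).collect() }
    }
    fn to_pb(&self) -> Vec<PbSst> {
        self.table_infos.iter().map(Into::into).collect()
    }
}

fn main() {
    let pb = vec![PbSst { sst_id: 7, file_size: 1024 }];
    let full: LevelCommon<FullSst> = LevelCommon::from_pb(&pb);
    let stripped: LevelCommon<IdOnlySst> = LevelCommon::from_pb(&pb);
    assert_eq!(full.to_pb()[0].file_size, 1024);
    assert_eq!(stripped.to_pb()[0].file_size, 0); // only the id survives
}
```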
@@ -359,3 +359,14 @@ impl EpochWithGap { self.0 & EPOCH_SPILL_TIME_MASK } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_object_id_decimal_max_length() { + let len = HummockSstableObjectId::MAX.to_string().len(); + assert_eq!(len, HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH) + } +} diff --git a/src/storage/hummock_sdk/src/sstable_info.rs b/src/storage/hummock_sdk/src/sstable_info.rs index 2f64508e57314..20943e4dd101a 100644 --- a/src/storage/hummock_sdk/src/sstable_info.rs +++ b/src/storage/hummock_sdk/src/sstable_info.rs @@ -63,6 +63,7 @@ impl SstableInfo { impl From for SstableInfo { fn from(pb_sstable_info: PbSstableInfo) -> Self { + assert!(pb_sstable_info.table_ids.is_sorted()); Self { object_id: pb_sstable_info.object_id, sst_id: pb_sstable_info.sst_id, @@ -100,6 +101,7 @@ impl From for SstableInfo { impl From<&PbSstableInfo> for SstableInfo { fn from(pb_sstable_info: &PbSstableInfo) -> Self { + assert!(pb_sstable_info.table_ids.is_sorted()); Self { object_id: pb_sstable_info.object_id, sst_id: pb_sstable_info.sst_id, @@ -136,7 +138,8 @@ impl From<&PbSstableInfo> for SstableInfo { impl From for PbSstableInfo { fn from(sstable_info: SstableInfo) -> Self { - assert_ne!(0, sstable_info.sst_size); + assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); + assert!(sstable_info.table_ids.is_sorted()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -174,7 +177,8 @@ impl From for PbSstableInfo { impl From<&SstableInfo> for PbSstableInfo { fn from(sstable_info: &SstableInfo) -> Self { - assert_ne!(0, sstable_info.sst_size); + assert!(sstable_info.sst_size > 0 || sstable_info.is_stripped()); + assert!(sstable_info.table_ids.is_sorted()); PbSstableInfo { object_id: sstable_info.object_id, sst_id: sstable_info.sst_id, @@ -212,3 +216,10 @@ impl SstableInfo { self.key_range = KeyRange::default(); } } + +// Time travel +impl SstableInfo { + pub fn is_stripped(&self) -> bool { + self.object_id == 0 + } +} diff --git a/src/storage/hummock_sdk/src/time_travel.rs b/src/storage/hummock_sdk/src/time_travel.rs index 380d75340df27..e828c94a4d781 100644 --- a/src/storage/hummock_sdk/src/time_travel.rs +++ b/src/storage/hummock_sdk/src/time_travel.rs @@ -13,87 +13,20 @@ // limitations under the License. 
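The new `assert!(pb_sstable_info.table_ids.is_sorted())` checks in the `SstableInfo` conversions above turn sortedness of `table_ids` into a checked invariant. One plausible payoff, which is an assumption on my part rather than something stated in this patch, is that downstream code can then test table membership with a binary search instead of a linear scan, as in this hypothetical helper:

```rust
// Hypothetical helper (not part of the patch): with `table_ids` kept sorted,
// membership checks can use binary search.
fn sst_contains_table(table_ids: &[u32], table_id: u32) -> bool {
    debug_assert!(table_ids.is_sorted());
    table_ids.binary_search(&table_id).is_ok()
}

fn main() {
    let table_ids = vec![1, 2, 5, 9];
    assert!(sst_contains_table(&table_ids, 5));
    assert!(!sst_contains_table(&table_ids, 4));
}
```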
use std::collections::{HashMap, HashSet}; -use std::sync::Arc; -use risingwave_common::catalog::TableId; -use risingwave_pb::hummock::hummock_version_delta::PbGroupDeltas; -use risingwave_pb::hummock::{PbHummockVersion, PbHummockVersionDelta, PbStateTableInfoDelta}; +use risingwave_pb::hummock::hummock_version::PbLevels; +use risingwave_pb::hummock::hummock_version_delta::{PbChangeLogDelta, PbGroupDeltas}; +use risingwave_pb::hummock::{PbEpochNewChangeLog, PbSstableInfo}; -use crate::change_log::{ChangeLogDelta, EpochNewChangeLog, TableChangeLog}; -use crate::level::{Level, Levels, OverlappingLevel}; +use crate::change_log::{TableChangeLog, TableChangeLogCommon}; +use crate::level::Level; use crate::sstable_info::SstableInfo; -use crate::table_watermark::TableWatermarks; use crate::version::{ - GroupDelta, GroupDeltas, HummockVersion, HummockVersionDelta, HummockVersionStateTableInfo, - IntraLevelDelta, + HummockVersion, HummockVersionCommon, HummockVersionDelta, HummockVersionDeltaCommon, }; -use crate::{CompactionGroupId, HummockSstableId, HummockVersionId}; +use crate::{CompactionGroupId, HummockSstableId}; -/// [`IncompleteHummockVersion`] is incomplete because `SSTableInfo` only has the `sst_id` set in the following fields: -/// - `PbLevels` -/// - `TableChangeLog` -#[derive(Debug, Clone, PartialEq)] -pub struct IncompleteHummockVersion { - pub id: HummockVersionId, - pub levels: HashMap, - max_committed_epoch: u64, - safe_epoch: u64, - pub table_watermarks: HashMap>, - pub table_change_log: HashMap, - pub state_table_info: HummockVersionStateTableInfo, -} - -/// Clone from an `SstableInfo`, but only set the `sst_id` for the target, leaving other fields as default. -/// The goal is to reduce the size of pb object generated afterward. -fn stripped_sstable_info(origin: &SstableInfo) -> SstableInfo { - SstableInfo { - object_id: Default::default(), - sst_id: origin.sst_id, - key_range: Default::default(), - file_size: Default::default(), - table_ids: Default::default(), - meta_offset: Default::default(), - stale_key_count: Default::default(), - total_key_count: Default::default(), - min_epoch: Default::default(), - max_epoch: Default::default(), - uncompressed_file_size: Default::default(), - range_tombstone_count: Default::default(), - bloom_filter_kind: Default::default(), - sst_size: Default::default(), - } -} - -fn stripped_epoch_new_change_log(origin: &EpochNewChangeLog) -> EpochNewChangeLog { - EpochNewChangeLog { - old_value: origin.old_value.iter().map(stripped_sstable_info).collect(), - new_value: origin.new_value.iter().map(stripped_sstable_info).collect(), - epochs: origin.epochs.clone(), - } -} - -fn stripped_change_log_delta(origin: &ChangeLogDelta) -> ChangeLogDelta { - ChangeLogDelta { - new_log: origin.new_log.as_ref().map(stripped_epoch_new_change_log), - truncate_epoch: origin.truncate_epoch, - } -} - -fn stripped_level(origin: &Level) -> Level { - Level { - level_idx: origin.level_idx, - level_type: origin.level_type, - table_infos: origin - .table_infos - .iter() - .map(stripped_sstable_info) - .collect(), - total_file_size: origin.total_file_size, - sub_level_id: origin.sub_level_id, - uncompressed_file_size: origin.uncompressed_file_size, - vnode_partition_count: origin.vnode_partition_count, - } -} +pub type IncompleteHummockVersion = HummockVersionCommon; pub fn refill_version( version: &mut HummockVersion, @@ -146,55 +79,6 @@ fn refill_sstable_info( .clone(); } -fn stripped_l0(origin: &OverlappingLevel) -> OverlappingLevel { - OverlappingLevel { - sub_levels: 
origin.sub_levels.iter().map(stripped_level).collect(), - total_file_size: origin.total_file_size, - uncompressed_file_size: origin.uncompressed_file_size, - } -} - -#[allow(deprecated)] -fn stripped_levels(origin: &Levels) -> Levels { - Levels { - levels: origin.levels.iter().map(stripped_level).collect(), - l0: stripped_l0(&origin.l0), - group_id: origin.group_id, - parent_group_id: origin.parent_group_id, - member_table_ids: Default::default(), - } -} - -fn stripped_intra_level_delta(origin: &IntraLevelDelta) -> IntraLevelDelta { - IntraLevelDelta { - level_idx: origin.level_idx, - l0_sub_level_id: origin.l0_sub_level_id, - removed_table_ids: origin.removed_table_ids.clone(), - inserted_table_infos: origin - .inserted_table_infos - .iter() - .map(stripped_sstable_info) - .collect(), - vnode_partition_count: origin.vnode_partition_count, - } -} - -fn stripped_group_delta(origin: &GroupDelta) -> GroupDelta { - match origin { - GroupDelta::IntraLevel(l) => GroupDelta::IntraLevel(stripped_intra_level_delta(l)), - _ => panic!("time travel expects DeltaType::IntraLevel only"), - } -} - -fn stripped_group_deltas(origin: &GroupDeltas) -> GroupDeltas { - let group_deltas = origin - .group_deltas - .iter() - .map(stripped_group_delta) - .collect(); - GroupDeltas { group_deltas } -} - /// `SStableInfo` will be stripped. impl From<(&HummockVersion, &HashSet)> for IncompleteHummockVersion { fn from(p: (&HummockVersion, &HashSet)) -> Self { @@ -206,7 +90,10 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV .iter() .filter_map(|(group_id, levels)| { if select_group.contains(group_id) { - Some((*group_id as CompactionGroupId, stripped_levels(levels))) + Some(( + *group_id as CompactionGroupId, + PbLevels::from(levels).into(), + )) } else { None } @@ -215,7 +102,7 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV max_committed_epoch: version.visible_table_committed_epoch(), safe_epoch: version.visible_table_safe_epoch(), table_watermarks: version.table_watermarks.clone(), - // TODO: optimization: strip table change log + // TODO: optimization: strip table change log based on select_group table_change_log: version .table_change_log .iter() @@ -223,9 +110,9 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV let incomplete_table_change_log = change_log .0 .iter() - .map(stripped_epoch_new_change_log) + .map(|e| PbEpochNewChangeLog::from(e).into()) .collect(); - (*table_id, TableChangeLog(incomplete_table_change_log)) + (*table_id, TableChangeLogCommon(incomplete_table_change_log)) }) .collect(), state_table_info: version.state_table_info.clone(), @@ -233,49 +120,10 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV } } -impl IncompleteHummockVersion { - /// Resulted `SStableInfo` is incompelte. 
- pub fn to_protobuf(&self) -> PbHummockVersion { - PbHummockVersion { - id: self.id.0, - levels: self - .levels - .iter() - .map(|(group_id, levels)| (*group_id as _, levels.to_protobuf())) - .collect(), - max_committed_epoch: self.max_committed_epoch, - safe_epoch: self.safe_epoch, - table_watermarks: self - .table_watermarks - .iter() - .map(|(table_id, watermark)| (table_id.table_id, watermark.to_protobuf())) - .collect(), - table_change_logs: self - .table_change_log - .iter() - .map(|(table_id, change_log)| (table_id.table_id, change_log.to_protobuf())) - .collect(), - state_table_info: self.state_table_info.to_protobuf(), - } - } -} - /// [`IncompleteHummockVersionDelta`] is incomplete because `SSTableInfo` only has the `sst_id` set in the following fields: /// - `PbGroupDeltas` /// - `ChangeLogDelta` -#[derive(Debug, PartialEq, Clone)] -pub struct IncompleteHummockVersionDelta { - pub id: HummockVersionId, - pub prev_id: HummockVersionId, - pub group_deltas: HashMap, - pub max_committed_epoch: u64, - pub safe_epoch: u64, - pub trivial_move: bool, - pub new_table_watermarks: HashMap, - pub removed_table_ids: HashSet, - pub change_log_delta: HashMap, - pub state_table_info_delta: HashMap, -} +pub type IncompleteHummockVersionDelta = HummockVersionDeltaCommon; /// `SStableInfo` will be stripped. impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHummockVersionDelta { @@ -289,7 +137,7 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHum .iter() .filter_map(|(cg_id, deltas)| { if select_group.contains(cg_id) { - Some((*cg_id, stripped_group_deltas(deltas).to_protobuf())) + Some((*cg_id, PbGroupDeltas::from(deltas).into())) } else { None } @@ -300,47 +148,42 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHum trivial_move: delta.trivial_move, new_table_watermarks: delta.new_table_watermarks.clone(), removed_table_ids: delta.removed_table_ids.clone(), - // TODO: optimization: strip table change log + // TODO: optimization: strip table change log based on select_group change_log_delta: delta .change_log_delta .iter() - .map(|(table_id, log_delta)| (*table_id, stripped_change_log_delta(log_delta))) + .map(|(table_id, log_delta)| (*table_id, PbChangeLogDelta::from(log_delta).into())) .collect(), state_table_info_delta: delta.state_table_info_delta.clone(), } } } -impl IncompleteHummockVersionDelta { - /// Resulted `SStableInfo` is incompelte. 
- pub fn to_protobuf(&self) -> PbHummockVersionDelta { - PbHummockVersionDelta { - id: self.id.0, - prev_id: self.prev_id.0, - group_deltas: self.group_deltas.clone(), - max_committed_epoch: self.max_committed_epoch, - safe_epoch: self.safe_epoch, - trivial_move: self.trivial_move, - new_table_watermarks: self - .new_table_watermarks - .iter() - .map(|(table_id, watermarks)| (table_id.table_id, watermarks.to_protobuf())) - .collect(), - removed_table_ids: self - .removed_table_ids - .iter() - .map(|table_id| table_id.table_id) - .collect(), - change_log_delta: self - .change_log_delta - .iter() - .map(|(table_id, log_delta)| (table_id.table_id, log_delta.into())) - .collect(), - state_table_info_delta: self - .state_table_info_delta - .iter() - .map(|(table_id, delta)| (table_id.table_id, *delta)) - .collect(), +pub struct SstableIdInVersion(HummockSstableId); + +impl From<&SstableIdInVersion> for PbSstableInfo { + fn from(sst_id: &SstableIdInVersion) -> Self { + Self { + sst_id: sst_id.0, + ..Default::default() } } } + +impl From for PbSstableInfo { + fn from(sst_id: SstableIdInVersion) -> Self { + (&sst_id).into() + } +} + +impl From<&PbSstableInfo> for SstableIdInVersion { + fn from(value: &PbSstableInfo) -> Self { + SstableIdInVersion(value.sst_id) + } +} + +impl From for SstableIdInVersion { + fn from(value: PbSstableInfo) -> Self { + (&value).into() + } +} diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index e418250f0b6bf..4aecfcde0cf48 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -24,16 +24,16 @@ use risingwave_common::util::epoch::INVALID_EPOCH; use risingwave_pb::hummock::group_delta::PbDeltaType; use risingwave_pb::hummock::hummock_version_delta::PbGroupDeltas; use risingwave_pb::hummock::{ - CompactionConfig, PbGroupConstruct, PbGroupDelta, PbGroupDestroy, PbGroupMetaChange, - PbGroupTableChange, PbHummockVersion, PbHummockVersionDelta, PbIntraLevelDelta, - PbStateTableInfo, StateTableInfo, StateTableInfoDelta, + CompactionConfig, PbGroupConstruct, PbGroupDelta, PbGroupDestroy, PbGroupMerge, + PbGroupMetaChange, PbGroupTableChange, PbHummockVersion, PbHummockVersionDelta, + PbIntraLevelDelta, PbSstableInfo, PbStateTableInfo, StateTableInfo, StateTableInfoDelta, }; use tracing::warn; -use crate::change_log::{ChangeLogDelta, TableChangeLog}; +use crate::change_log::{ChangeLogDeltaCommon, TableChangeLogCommon}; use crate::compaction_group::hummock_version_ext::build_initial_compaction_group_levels; use crate::compaction_group::StaticCompactionGroupId; -use crate::level::Levels; +use crate::level::LevelsCommon; use crate::sstable_info::SstableInfo; use crate::table_watermark::TableWatermarks; use crate::{CompactionGroupId, HummockSstableObjectId, HummockVersionId, FIRST_VERSION_ID}; @@ -209,33 +209,39 @@ impl HummockVersionStateTableInfo { } #[derive(Debug, Clone, PartialEq)] -pub struct HummockVersion { +pub struct HummockVersionCommon { pub id: HummockVersionId, - pub levels: HashMap, - max_committed_epoch: u64, - safe_epoch: u64, + pub levels: HashMap>, + pub(crate) max_committed_epoch: u64, + pub(crate) safe_epoch: u64, pub table_watermarks: HashMap>, - pub table_change_log: HashMap, + pub table_change_log: HashMap>, pub state_table_info: HummockVersionStateTableInfo, } +pub type HummockVersion = HummockVersionCommon; + impl Default for HummockVersion { fn default() -> Self { HummockVersion::from(&PbHummockVersion::default()) } } -impl HummockVersion { +impl HummockVersionCommon 
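With the time-travel changes above, the hand-written `stripped_*` helpers are gone: `IncompleteHummockVersion` is now just `HummockVersionCommon<SstableIdInVersion>`, and stripping happens implicitly when a full structure is round-tripped through its protobuf form into the id-only payload, since `PbSstableInfo::from(&SstableIdInVersion)` leaves every field except `sst_id` at its default. That default `object_id` is also what the new `SstableInfo::is_stripped()` check (`object_id == 0`) relies on. A condensed sketch of the round trip, using simplified stand-in types rather than the real generated protobuf structs:

```rust
// Stand-in for the protobuf message (only two of the real fields).
#[derive(Default, Debug, PartialEq)]
struct PbSstableInfo {
    object_id: u64,
    sst_id: u64,
}

// Id-only payload used by the incomplete (time-travel) version types.
struct SstableIdInVersion(u64);

impl From<&PbSstableInfo> for SstableIdInVersion {
    fn from(pb: &PbSstableInfo) -> Self {
        SstableIdInVersion(pb.sst_id)
    }
}
impl From<&SstableIdInVersion> for PbSstableInfo {
    fn from(id: &SstableIdInVersion) -> Self {
        // Everything except `sst_id` stays at its default: this is the "stripping".
        PbSstableInfo { sst_id: id.0, ..Default::default() }
    }
}

fn main() {
    let full = PbSstableInfo { object_id: 42, sst_id: 7 };
    // Full -> id-only -> protobuf again: only `sst_id` survives.
    let stripped: SstableIdInVersion = (&full).into();
    let back: PbSstableInfo = (&stripped).into();
    assert_eq!(back, PbSstableInfo { object_id: 0, sst_id: 7 });
    // `object_id == 0` is how a stripped entry is recognized afterwards.
    assert_eq!(back.object_id, 0);
}
```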
+where + T: for<'a> From<&'a PbSstableInfo>, + PbSstableInfo: for<'a> From<&'a T>, +{ /// Convert the `PbHummockVersion` received from rpc to `HummockVersion`. No need to /// maintain backward compatibility. pub fn from_rpc_protobuf(pb_version: &PbHummockVersion) -> Self { - HummockVersion::from(pb_version) + pb_version.into() } /// Convert the `PbHummockVersion` deserialized from persisted state to `HummockVersion`. /// We should maintain backward compatibility. pub fn from_persisted_protobuf(pb_version: &PbHummockVersion) -> Self { - HummockVersion::from(pb_version) + pb_version.into() } pub fn to_protobuf(&self) -> PbHummockVersion { @@ -260,14 +266,19 @@ impl HummockVersion { } } -impl From<&PbHummockVersion> for HummockVersion { +impl From<&PbHummockVersion> for HummockVersionCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_version: &PbHummockVersion) -> Self { Self { id: HummockVersionId(pb_version.id), levels: pb_version .levels .iter() - .map(|(group_id, levels)| (*group_id as CompactionGroupId, Levels::from(levels))) + .map(|(group_id, levels)| { + (*group_id as CompactionGroupId, LevelsCommon::from(levels)) + }) .collect(), max_committed_epoch: pb_version.max_committed_epoch, safe_epoch: pb_version.safe_epoch, @@ -287,7 +298,7 @@ impl From<&PbHummockVersion> for HummockVersion { .map(|(table_id, change_log)| { ( TableId::new(*table_id), - TableChangeLog::from_protobuf(change_log), + TableChangeLogCommon::from_protobuf(change_log), ) }) .collect(), @@ -298,8 +309,11 @@ impl From<&PbHummockVersion> for HummockVersion { } } -impl From<&HummockVersion> for PbHummockVersion { - fn from(version: &HummockVersion) -> Self { +impl From<&HummockVersionCommon> for PbHummockVersion +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(version: &HummockVersionCommon) -> Self { Self { id: version.id.0, levels: version @@ -324,8 +338,12 @@ impl From<&HummockVersion> for PbHummockVersion { } } -impl From for PbHummockVersion { - fn from(version: HummockVersion) -> Self { +impl From> for PbHummockVersion +where + PbSstableInfo: From, + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(version: HummockVersionCommon) -> Self { Self { id: version.id.0, levels: version @@ -453,36 +471,42 @@ impl HummockVersion { } #[derive(Debug, PartialEq, Clone)] -pub struct HummockVersionDelta { +pub struct HummockVersionDeltaCommon { pub id: HummockVersionId, pub prev_id: HummockVersionId, - pub group_deltas: HashMap, - max_committed_epoch: u64, - safe_epoch: u64, + pub group_deltas: HashMap>, + pub(crate) max_committed_epoch: u64, + pub(crate) safe_epoch: u64, pub trivial_move: bool, pub new_table_watermarks: HashMap, pub removed_table_ids: HashSet, - pub change_log_delta: HashMap, + pub change_log_delta: HashMap>, pub state_table_info_delta: HashMap, } +pub type HummockVersionDelta = HummockVersionDeltaCommon; + impl Default for HummockVersionDelta { fn default() -> Self { HummockVersionDelta::from(&PbHummockVersionDelta::default()) } } -impl HummockVersionDelta { +impl HummockVersionDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, + PbSstableInfo: for<'a> From<&'a T>, +{ /// Convert the `PbHummockVersionDelta` deserialized from persisted state to `HummockVersionDelta`. /// We should maintain backward compatibility. pub fn from_persisted_protobuf(delta: &PbHummockVersionDelta) -> Self { - Self::from(delta) + delta.into() } /// Convert the `PbHummockVersionDelta` received from rpc to `HummockVersionDelta`. No need to /// maintain backward compatibility. 
pub fn from_rpc_protobuf(delta: &PbHummockVersionDelta) -> Self { - Self::from(delta) + delta.into() } pub fn to_protobuf(&self) -> PbHummockVersionDelta { @@ -501,12 +525,10 @@ impl HummockVersionDelta { .flat_map(|group_deltas| { group_deltas.group_deltas.iter().flat_map(|group_delta| { static EMPTY_VEC: Vec = Vec::new(); - let sst_slice = match group_delta { - GroupDelta::IntraLevel(level_delta) => &level_delta.inserted_table_infos, - GroupDelta::GroupConstruct(_) - | GroupDelta::GroupDestroy(_) - | GroupDelta::GroupMetaChange(_) - | GroupDelta::GroupTableChange(_) => &EMPTY_VEC, + let sst_slice = if let GroupDelta::IntraLevel(level_delta) = &group_delta { + &level_delta.inserted_table_infos + } else { + &EMPTY_VEC }; sst_slice.iter().map(|sst| sst.object_id) }) @@ -526,12 +548,10 @@ impl HummockVersionDelta { let ssts_from_group_deltas = self.group_deltas.values().flat_map(|group_deltas| { group_deltas.group_deltas.iter().flat_map(|group_delta| { static EMPTY_VEC: Vec = Vec::new(); - let sst_slice = match group_delta { - GroupDelta::IntraLevel(level_delta) => &level_delta.inserted_table_infos, - GroupDelta::GroupConstruct(_) - | GroupDelta::GroupDestroy(_) - | GroupDelta::GroupMetaChange(_) - | GroupDelta::GroupTableChange(_) => &EMPTY_VEC, + let sst_slice = if let GroupDelta::IntraLevel(level_delta) = &group_delta { + &level_delta.inserted_table_infos + } else { + &EMPTY_VEC }; sst_slice.iter() }) @@ -564,12 +584,10 @@ impl HummockVersionDelta { .flat_map(|group_deltas| { group_deltas.group_deltas.iter().flat_map(|group_delta| { static EMPTY_VEC: Vec = Vec::new(); - let sst_slice = match group_delta { - GroupDelta::IntraLevel(level_delta) => &level_delta.inserted_table_infos, - GroupDelta::GroupConstruct(_) - | GroupDelta::GroupDestroy(_) - | GroupDelta::GroupMetaChange(_) - | GroupDelta::GroupTableChange(_) => &EMPTY_VEC, + let sst_slice = if let GroupDelta::IntraLevel(level_delta) = &group_delta { + &level_delta.inserted_table_infos + } else { + &EMPTY_VEC }; sst_slice.iter() }) @@ -598,7 +616,10 @@ impl HummockVersionDelta { } } -impl From<&PbHummockVersionDelta> for HummockVersionDelta { +impl From<&PbHummockVersionDelta> for HummockVersionDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_version_delta: &PbHummockVersionDelta) -> Self { Self { id: HummockVersionId(pb_version_delta.id), @@ -607,7 +628,10 @@ impl From<&PbHummockVersionDelta> for HummockVersionDelta { .group_deltas .iter() .map(|(group_id, deltas)| { - (*group_id as CompactionGroupId, GroupDeltas::from(deltas)) + ( + *group_id as CompactionGroupId, + GroupDeltasCommon::from(deltas), + ) }) .collect(), max_committed_epoch: pb_version_delta.max_committed_epoch, @@ -631,8 +655,8 @@ impl From<&PbHummockVersionDelta> for HummockVersionDelta { .map(|(table_id, log_delta)| { ( TableId::new(*table_id), - ChangeLogDelta { - new_log: log_delta.new_log.clone().map(Into::into), + ChangeLogDeltaCommon { + new_log: log_delta.new_log.as_ref().map(Into::into), truncate_epoch: log_delta.truncate_epoch, }, ) @@ -648,8 +672,11 @@ impl From<&PbHummockVersionDelta> for HummockVersionDelta { } } -impl From<&HummockVersionDelta> for PbHummockVersionDelta { - fn from(version_delta: &HummockVersionDelta) -> Self { +impl From<&HummockVersionDeltaCommon> for PbHummockVersionDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(version_delta: &HummockVersionDeltaCommon) -> Self { Self { id: version_delta.id.0, prev_id: version_delta.prev_id.0, @@ -685,8 +712,11 @@ impl From<&HummockVersionDelta> for 
PbHummockVersionDelta { } } -impl From for PbHummockVersionDelta { - fn from(version_delta: HummockVersionDelta) -> Self { +impl From> for PbHummockVersionDelta +where + PbSstableInfo: From, +{ + fn from(version_delta: HummockVersionDeltaCommon) -> Self { Self { id: version_delta.id.0, prev_id: version_delta.prev_id.0, @@ -722,7 +752,10 @@ impl From for PbHummockVersionDelta { } } -impl From for HummockVersionDelta { +impl From for HummockVersionDeltaCommon +where + T: From, +{ fn from(pb_version_delta: PbHummockVersionDelta) -> Self { Self { id: HummockVersionId(pb_version_delta.id), @@ -751,7 +784,7 @@ impl From for HummockVersionDelta { .map(|(table_id, log_delta)| { ( TableId::new(*table_id), - ChangeLogDelta { + ChangeLogDeltaCommon { new_log: log_delta.new_log.clone().map(Into::into), truncate_epoch: log_delta.truncate_epoch, }, @@ -768,14 +801,16 @@ impl From for HummockVersionDelta { } #[derive(Debug, PartialEq, Clone)] -pub struct IntraLevelDelta { +pub struct IntraLevelDeltaCommon { pub level_idx: u32, pub l0_sub_level_id: u64, pub removed_table_ids: Vec, - pub inserted_table_infos: Vec, + pub inserted_table_infos: Vec, pub vnode_partition_count: u32, } +pub type IntraLevelDelta = IntraLevelDeltaCommon; + impl IntraLevelDelta { pub fn estimated_encode_len(&self) -> usize { size_of::() @@ -790,40 +825,49 @@ impl IntraLevelDelta { } } -impl From for IntraLevelDelta { +impl From for IntraLevelDeltaCommon +where + T: From, +{ fn from(pb_intra_level_delta: PbIntraLevelDelta) -> Self { Self { level_idx: pb_intra_level_delta.level_idx, l0_sub_level_id: pb_intra_level_delta.l0_sub_level_id, - removed_table_ids: pb_intra_level_delta.removed_table_ids.clone(), + removed_table_ids: pb_intra_level_delta.removed_table_ids, inserted_table_infos: pb_intra_level_delta .inserted_table_infos .into_iter() - .map(SstableInfo::from) + .map(Into::into) .collect_vec(), vnode_partition_count: pb_intra_level_delta.vnode_partition_count, } } } -impl From for PbIntraLevelDelta { - fn from(intra_level_delta: IntraLevelDelta) -> Self { +impl From> for PbIntraLevelDelta +where + PbSstableInfo: From, +{ + fn from(intra_level_delta: IntraLevelDeltaCommon) -> Self { Self { level_idx: intra_level_delta.level_idx, l0_sub_level_id: intra_level_delta.l0_sub_level_id, - removed_table_ids: intra_level_delta.removed_table_ids.clone(), + removed_table_ids: intra_level_delta.removed_table_ids, inserted_table_infos: intra_level_delta .inserted_table_infos .into_iter() - .map(|sst| sst.into()) + .map(Into::into) .collect_vec(), vnode_partition_count: intra_level_delta.vnode_partition_count, } } } -impl From<&IntraLevelDelta> for PbIntraLevelDelta { - fn from(intra_level_delta: &IntraLevelDelta) -> Self { +impl From<&IntraLevelDeltaCommon> for PbIntraLevelDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(intra_level_delta: &IntraLevelDeltaCommon) -> Self { Self { level_idx: intra_level_delta.level_idx, l0_sub_level_id: intra_level_delta.l0_sub_level_id, @@ -831,14 +875,17 @@ impl From<&IntraLevelDelta> for PbIntraLevelDelta { inserted_table_infos: intra_level_delta .inserted_table_infos .iter() - .map(|sst| sst.into()) + .map(Into::into) .collect_vec(), vnode_partition_count: intra_level_delta.vnode_partition_count, } } } -impl From<&PbIntraLevelDelta> for IntraLevelDelta { +impl From<&PbIntraLevelDelta> for IntraLevelDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_intra_level_delta: &PbIntraLevelDelta) -> Self { Self { level_idx: pb_intra_level_delta.level_idx, @@ -847,7 +894,7 @@ impl 
From<&PbIntraLevelDelta> for IntraLevelDelta { inserted_table_infos: pb_intra_level_delta .inserted_table_infos .iter() - .map(SstableInfo::from) + .map(Into::into) .collect_vec(), vnode_partition_count: pb_intra_level_delta.vnode_partition_count, } @@ -873,100 +920,128 @@ impl IntraLevelDelta { } #[derive(Debug, PartialEq, Clone)] -pub enum GroupDelta { - IntraLevel(IntraLevelDelta), +pub enum GroupDeltaCommon { + IntraLevel(IntraLevelDeltaCommon), GroupConstruct(PbGroupConstruct), GroupDestroy(PbGroupDestroy), GroupMetaChange(PbGroupMetaChange), #[allow(dead_code)] GroupTableChange(PbGroupTableChange), + + GroupMerge(PbGroupMerge), } -impl From for GroupDelta { +pub type GroupDelta = GroupDeltaCommon; + +impl From for GroupDeltaCommon +where + T: From, +{ fn from(pb_group_delta: PbGroupDelta) -> Self { match pb_group_delta.delta_type { Some(PbDeltaType::IntraLevel(pb_intra_level_delta)) => { - GroupDelta::IntraLevel(IntraLevelDelta::from(pb_intra_level_delta)) + GroupDeltaCommon::IntraLevel(IntraLevelDeltaCommon::from(pb_intra_level_delta)) } Some(PbDeltaType::GroupConstruct(pb_group_construct)) => { - GroupDelta::GroupConstruct(pb_group_construct) + GroupDeltaCommon::GroupConstruct(pb_group_construct) } Some(PbDeltaType::GroupDestroy(pb_group_destroy)) => { - GroupDelta::GroupDestroy(pb_group_destroy) + GroupDeltaCommon::GroupDestroy(pb_group_destroy) } Some(PbDeltaType::GroupMetaChange(pb_group_meta_change)) => { - GroupDelta::GroupMetaChange(pb_group_meta_change) + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change) } Some(PbDeltaType::GroupTableChange(pb_group_table_change)) => { - GroupDelta::GroupTableChange(pb_group_table_change) + GroupDeltaCommon::GroupTableChange(pb_group_table_change) + } + Some(PbDeltaType::GroupMerge(pb_group_merge)) => { + GroupDeltaCommon::GroupMerge(pb_group_merge) } None => panic!("delta_type is not set"), } } } -impl From for PbGroupDelta { - fn from(group_delta: GroupDelta) -> Self { +impl From> for PbGroupDelta +where + PbSstableInfo: From, +{ + fn from(group_delta: GroupDeltaCommon) -> Self { match group_delta { - GroupDelta::IntraLevel(intra_level_delta) => PbGroupDelta { + GroupDeltaCommon::IntraLevel(intra_level_delta) => PbGroupDelta { delta_type: Some(PbDeltaType::IntraLevel(intra_level_delta.into())), }, - GroupDelta::GroupConstruct(pb_group_construct) => PbGroupDelta { + GroupDeltaCommon::GroupConstruct(pb_group_construct) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupConstruct(pb_group_construct)), }, - GroupDelta::GroupDestroy(pb_group_destroy) => PbGroupDelta { + GroupDeltaCommon::GroupDestroy(pb_group_destroy) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupDestroy(pb_group_destroy)), }, - GroupDelta::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupMetaChange(pb_group_meta_change)), }, - GroupDelta::GroupTableChange(pb_group_table_change) => PbGroupDelta { + GroupDeltaCommon::GroupTableChange(pb_group_table_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupTableChange(pb_group_table_change)), }, + GroupDeltaCommon::GroupMerge(pb_group_merge) => PbGroupDelta { + delta_type: Some(PbDeltaType::GroupMerge(pb_group_merge)), + }, } } } -impl From<&GroupDelta> for PbGroupDelta { - fn from(group_delta: &GroupDelta) -> Self { +impl From<&GroupDeltaCommon> for PbGroupDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(group_delta: &GroupDeltaCommon) -> Self { match group_delta { - 
GroupDelta::IntraLevel(intra_level_delta) => PbGroupDelta { + GroupDeltaCommon::IntraLevel(intra_level_delta) => PbGroupDelta { delta_type: Some(PbDeltaType::IntraLevel(intra_level_delta.into())), }, - GroupDelta::GroupConstruct(pb_group_construct) => PbGroupDelta { + GroupDeltaCommon::GroupConstruct(pb_group_construct) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupConstruct(pb_group_construct.clone())), }, - GroupDelta::GroupDestroy(pb_group_destroy) => PbGroupDelta { + GroupDeltaCommon::GroupDestroy(pb_group_destroy) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupDestroy(*pb_group_destroy)), }, - GroupDelta::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupMetaChange(pb_group_meta_change.clone())), }, - GroupDelta::GroupTableChange(pb_group_table_change) => PbGroupDelta { + GroupDeltaCommon::GroupTableChange(pb_group_table_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupTableChange(pb_group_table_change.clone())), }, + GroupDeltaCommon::GroupMerge(pb_group_merge) => PbGroupDelta { + delta_type: Some(PbDeltaType::GroupMerge(*pb_group_merge)), + }, } } } -impl From<&PbGroupDelta> for GroupDelta { +impl From<&PbGroupDelta> for GroupDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_group_delta: &PbGroupDelta) -> Self { match &pb_group_delta.delta_type { Some(PbDeltaType::IntraLevel(pb_intra_level_delta)) => { - GroupDelta::IntraLevel(IntraLevelDelta::from(pb_intra_level_delta)) + GroupDeltaCommon::IntraLevel(IntraLevelDeltaCommon::from(pb_intra_level_delta)) } Some(PbDeltaType::GroupConstruct(pb_group_construct)) => { - GroupDelta::GroupConstruct(pb_group_construct.clone()) + GroupDeltaCommon::GroupConstruct(pb_group_construct.clone()) } Some(PbDeltaType::GroupDestroy(pb_group_destroy)) => { - GroupDelta::GroupDestroy(*pb_group_destroy) + GroupDeltaCommon::GroupDestroy(*pb_group_destroy) } Some(PbDeltaType::GroupMetaChange(pb_group_meta_change)) => { - GroupDelta::GroupMetaChange(pb_group_meta_change.clone()) + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change.clone()) } Some(PbDeltaType::GroupTableChange(pb_group_table_change)) => { - GroupDelta::GroupTableChange(pb_group_table_change.clone()) + GroupDeltaCommon::GroupTableChange(pb_group_table_change.clone()) + } + Some(PbDeltaType::GroupMerge(pb_group_merge)) => { + GroupDeltaCommon::GroupMerge(*pb_group_merge) } None => panic!("delta_type is not set"), } @@ -974,24 +1049,32 @@ impl From<&PbGroupDelta> for GroupDelta { } #[derive(Debug, PartialEq, Clone, Default)] -pub struct GroupDeltas { - pub group_deltas: Vec, +pub struct GroupDeltasCommon { + pub group_deltas: Vec>, } -impl From for GroupDeltas { +pub type GroupDeltas = GroupDeltasCommon; + +impl From for GroupDeltasCommon +where + T: From, +{ fn from(pb_group_deltas: PbGroupDeltas) -> Self { Self { group_deltas: pb_group_deltas .group_deltas .into_iter() - .map(GroupDelta::from) + .map(GroupDeltaCommon::from) .collect_vec(), } } } -impl From for PbGroupDeltas { - fn from(group_deltas: GroupDeltas) -> Self { +impl From> for PbGroupDeltas +where + PbSstableInfo: From, +{ + fn from(group_deltas: GroupDeltasCommon) -> Self { Self { group_deltas: group_deltas .group_deltas @@ -1002,8 +1085,11 @@ impl From for PbGroupDeltas { } } -impl From<&GroupDeltas> for PbGroupDeltas { - fn from(group_deltas: &GroupDeltas) -> Self { +impl From<&GroupDeltasCommon> for PbGroupDeltas +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn 
from(group_deltas: &GroupDeltasCommon) -> Self { Self { group_deltas: group_deltas .group_deltas @@ -1014,19 +1100,25 @@ impl From<&GroupDeltas> for PbGroupDeltas { } } -impl From<&PbGroupDeltas> for GroupDeltas { +impl From<&PbGroupDeltas> for GroupDeltasCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_group_deltas: &PbGroupDeltas) -> Self { Self { group_deltas: pb_group_deltas .group_deltas .iter() - .map(GroupDelta::from) + .map(GroupDeltaCommon::from) .collect_vec(), } } } -impl GroupDeltas { +impl GroupDeltasCommon +where + PbSstableInfo: for<'a> From<&'a T>, +{ pub fn to_protobuf(&self) -> PbGroupDeltas { self.into() } diff --git a/src/storage/hummock_test/src/bin/replay/main.rs b/src/storage/hummock_test/src/bin/replay/main.rs index 9181e37c992e2..7760d7ce530c6 100644 --- a/src/storage/hummock_test/src/bin/replay/main.rs +++ b/src/storage/hummock_test/src/bin/replay/main.rs @@ -31,7 +31,7 @@ use clap::Parser; use foyer::HybridCacheBuilder; use replay_impl::{get_replay_notification_client, GlobalReplayImpl}; use risingwave_common::config::{ - extract_storage_memory_config, load_config, NoOverride, ObjectStoreConfig, StorageConfig, + extract_storage_memory_config, load_config, NoOverride, ObjectStoreConfig, }; use risingwave_common::system_param::reader::SystemParamsReader; use risingwave_hummock_trace::{ @@ -46,7 +46,6 @@ use risingwave_storage::filter_key_extractor::{ use risingwave_storage::hummock::{HummockStorage, SstableStore, SstableStoreConfig}; use risingwave_storage::monitor::{CompactorMetrics, HummockStateStoreMetrics, ObjectStoreMetrics}; use risingwave_storage::opts::StorageOpts; -use serde::{Deserialize, Serialize}; // use a large offset to avoid collision with real sstables const SST_OFFSET: u64 = 2147383647000; @@ -183,8 +182,3 @@ async fn create_replay_hummock(r: Record, args: &Args) -> Result, notification_client: impl NotificationClient, hummock_manager_ref: &HummockManagerRef, - table_id: TableId, + table_ids: &[u32], ) -> HummockStorage { let remote_dir = "hummock_001_test".to_string(); let options = Arc::new(StorageOpts { @@ -117,7 +117,7 @@ pub(crate) mod tests { register_tables_with_id_for_test( hummock.filter_key_extractor_manager(), hummock_manager_ref, - &[table_id.table_id()], + table_ids, ) .await; @@ -189,7 +189,6 @@ pub(crate) mod tests { local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); } let res = storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); } } @@ -236,7 +235,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - Default::default(), + &[0], ) .await; let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() @@ -406,7 +405,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - Default::default(), + &[0], ) .await; @@ -604,7 +603,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; @@ -885,7 +884,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + 
&[existing_table_id], ) .await; @@ -1090,7 +1089,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; @@ -1290,7 +1289,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; let (compact_ctx, filter_key_extractor_manager) = @@ -1505,7 +1504,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); @@ -1680,7 +1679,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); @@ -1798,7 +1797,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); @@ -1980,4 +1979,504 @@ pub(crate) mod tests { count += 1; } } + + #[tokio::test] + async fn test_split_and_merge() { + let (env, hummock_manager_ref, _cluster_manager_ref, worker_node) = + setup_compute_env(8080).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager_ref.clone(), + worker_node.id, + )); + + let table_id_1 = TableId::from(1); + let table_id_2 = TableId::from(2); + + let storage = get_hummock_storage( + hummock_meta_client.clone(), + get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), + &hummock_manager_ref, + &[table_id_1.table_id(), table_id_2.table_id()], + ) + .await; + + // basic cg2 -> [1, 2] + let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() + { + FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + rpc_filter_key_extractor_manager, + ) => rpc_filter_key_extractor_manager, + FilterKeyExtractorManager::StaticFilterKeyExtractorManager(_) => unreachable!(), + }; + + let mut key = BytesMut::default(); + key.put_u16(1); + key.put_slice(b"key_prefix"); + let key_prefix = key.freeze(); + + rpc_filter_key_extractor_manager.update( + table_id_1.table_id(), + Arc::new(FilterKeyExtractorImpl::FixedLength( + FixedLengthFilterKeyExtractor::new(TABLE_PREFIX_LEN + key_prefix.len()), + )), + ); + rpc_filter_key_extractor_manager.update( + table_id_2.table_id(), + Arc::new(FilterKeyExtractorImpl::FixedLength( + FixedLengthFilterKeyExtractor::new(TABLE_PREFIX_LEN + key_prefix.len()), + )), + ); + + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + rpc_filter_key_extractor_manager, + ); + let compact_ctx = get_compactor_context(&storage); + let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( + hummock_meta_client.clone(), + storage + .storage_opts() + .clone() + .sstable_id_remote_fetch_number, + )); + + let base_epoch = Epoch::now(); + let mut epoch: u64 = base_epoch.0; + let millisec_interval_epoch: u64 
= (1 << 16) * 100; + + let mut local_1 = storage + .new_local(NewLocalOptions::for_test(table_id_1)) + .await; + let mut local_2 = storage + .new_local(NewLocalOptions::for_test(table_id_2)) + .await; + + let val = Bytes::from(b"0"[..].to_vec()); + + async fn write_data( + storage: &HummockStorage, + local_1: (&mut LocalHummockStorage, bool), + local_2: (&mut LocalHummockStorage, bool), + epoch: &mut u64, + val: Bytes, + kv_count: u64, + millisec_interval_epoch: u64, + key_prefix: Bytes, + hummock_meta_client: Arc, + is_init: &mut bool, + ) { + let table_id_set = + HashSet::from_iter(vec![local_1.0.table_id(), local_2.0.table_id()].into_iter()); + + storage.start_epoch(*epoch, table_id_set.clone()); + for i in 0..kv_count { + if i == 0 && *is_init { + local_1.0.init_for_test(*epoch).await.unwrap(); + local_2.0.init_for_test(*epoch).await.unwrap(); + + *is_init = false; + } + let next_epoch = *epoch + millisec_interval_epoch; + storage.start_epoch(next_epoch, table_id_set.clone()); + + let random_key = + [key_prefix.as_ref(), &rand::thread_rng().gen::<[u8; 32]>()].concat(); + + if local_1.1 { + local_1 + .0 + .insert(TableKey(Bytes::from(random_key.clone())), val.clone(), None) + .unwrap(); + } + local_1.0.flush().await.unwrap(); + local_1 + .0 + .seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); + + if local_2.1 { + local_2 + .0 + .insert(TableKey(Bytes::from(random_key.clone())), val.clone(), None) + .unwrap(); + } + local_2.0.flush().await.unwrap(); + local_2 + .0 + .seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); + + let res = storage.seal_and_sync_epoch(*epoch).await.unwrap(); + hummock_meta_client.commit_epoch(*epoch, res).await.unwrap(); + *epoch += millisec_interval_epoch; + } + } + + let mut is_init = true; + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 1, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + epoch += millisec_interval_epoch; + + let parent_group_id = 2; + let split_table_ids = vec![table_id_2.table_id()]; + + async fn compact_once( + group_id: CompactionGroupId, + level: usize, + hummock_manager_ref: HummockManagerRef, + compact_ctx: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, + sstable_object_id_manager: Arc, + ) { + // 1. build the manual compaction option + let manual_compaction_option = ManualCompactionOption { + level, + ..Default::default() + }; + // 2. get compact task + let compact_task = hummock_manager_ref + .manual_get_compact_task(group_id, manual_compaction_option) + .await + .unwrap(); + + if compact_task.is_none() { + return; + } + + let mut compact_task = compact_task.unwrap(); + + let compaction_filter_flag = + CompactionFilterFlag::STATE_CLEAN | CompactionFilterFlag::TTL; + compact_task.compaction_filter_mask = compaction_filter_flag.bits(); + compact_task.current_epoch_time = hummock_manager_ref + .get_current_version() + .await + .max_committed_epoch(); + + // 3.
compact + let (_tx, rx) = tokio::sync::oneshot::channel(); + let ((result_task, task_stats), _) = compact( + compact_ctx, + compact_task.clone(), + rx, + Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager.clone(), + ) + .await; + + hummock_manager_ref + .report_compact_task( + result_task.task_id, + result_task.task_status, + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) + .await + .unwrap(); + } + + // compact + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + assert_ne!(parent_group_id, new_cg_id); + assert!(hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .is_err()); + + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 100, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + epoch += millisec_interval_epoch; + + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_once( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // try merge + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_once( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // write left + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, false), + &mut epoch, + val.clone(), + 16, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + epoch += millisec_interval_epoch; + + // try merge + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + + // compact + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // try split + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + // write right + write_data( + &storage, + (&mut local_1, false), + (&mut local_2, true), + &mut epoch, + val.clone(), + 16, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + epoch += millisec_interval_epoch; + + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + + // write left and right + + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 1, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + epoch += 
millisec_interval_epoch; + + compact_once( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_once( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + async fn compact_all( + group_id: CompactionGroupId, + level: usize, + hummock_manager_ref: HummockManagerRef, + compact_ctx: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, + sstable_object_id_manager: Arc, + ) { + loop { + let manual_compaction_option = ManualCompactionOption { + level, + ..Default::default() + }; + let compact_task = hummock_manager_ref + .manual_get_compact_task(group_id, manual_compaction_option) + .await + .unwrap(); + + if compact_task.is_none() { + break; + } + + let mut compact_task = compact_task.unwrap(); + let compaction_filter_flag = + CompactionFilterFlag::STATE_CLEAN | CompactionFilterFlag::TTL; + compact_task.compaction_filter_mask = compaction_filter_flag.bits(); + compact_task.current_epoch_time = hummock_manager_ref + .get_current_version() + .await + .max_committed_epoch(); + + // 3. compact + let (_tx, rx) = tokio::sync::oneshot::channel(); + let ((result_task, task_stats), _) = compact( + compact_ctx.clone(), + compact_task.clone(), + rx, + Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager.clone(), + ) + .await; + + hummock_manager_ref + .report_compact_task( + result_task.task_id, + result_task.task_status, + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) + .await + .unwrap(); + } + } + + // try split + let new_cg_id = hummock_manager_ref + .split_compaction_group(parent_group_id, &split_table_ids, 0) + .await + .unwrap(); + + // try merge + assert!(hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .is_err()); + + // write left and right + + write_data( + &storage, + (&mut local_1, true), + (&mut local_2, true), + &mut epoch, + val.clone(), + 200, + millisec_interval_epoch, + key_prefix.clone(), + hummock_meta_client.clone(), + &mut is_init, + ) + .await; + + compact_all( + parent_group_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + compact_all( + new_cg_id, + 0, + hummock_manager_ref.clone(), + compact_ctx.clone(), + filter_key_extractor_manager.clone(), + sstable_object_id_manager.clone(), + ) + .await; + + // try merge + hummock_manager_ref + .merge_compaction_group(parent_group_id, new_cg_id) + .await + .unwrap(); + } } diff --git a/src/storage/hummock_test/src/hummock_storage_tests.rs b/src/storage/hummock_test/src/hummock_storage_tests.rs index 7f3d35f16b80b..fc0fd6ae97b4f 100644 --- a/src/storage/hummock_test/src/hummock_storage_tests.rs +++ b/src/storage/hummock_test/src/hummock_storage_tests.rs @@ -31,6 +31,7 @@ use risingwave_hummock_sdk::key::{ gen_key_from_bytes, prefixed_range_with_vnode, FullKey, TableKey, UserKey, TABLE_PREFIX_LEN, }; use risingwave_hummock_sdk::sstable_info::SstableInfo; +use risingwave_hummock_sdk::table_stats::TableStats; use risingwave_hummock_sdk::table_watermark::{ TableWatermarksIndex, VnodeWatermark, WatermarkDirection, }; @@ -2510,8 +2511,20 @@ async fn test_commit_multi_epoch() { new_table_watermarks: Default::default(), sst_to_context: context_id_map(&[sst.object_id]), sstables:
vec![LocalSstableInfo { + table_stats: sst + .table_ids + .iter() + .map(|&table_id| { + ( + table_id, + TableStats { + total_compressed_size: 10, + ..Default::default() + }, + ) + }) + .collect(), sst_info: sst, - table_stats: Default::default(), }], new_table_fragment_info, change_log_delta: Default::default(), diff --git a/src/storage/hummock_test/src/state_store_tests.rs b/src/storage/hummock_test/src/state_store_tests.rs index 35f3d08a9ed8a..67da2150735af 100644 --- a/src/storage/hummock_test/src/state_store_tests.rs +++ b/src/storage/hummock_test/src/state_store_tests.rs @@ -24,7 +24,6 @@ use futures::{pin_mut, StreamExt}; use itertools::Itertools; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{TableId, TableOption}; -use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::{test_epoch, EpochExt, MAX_EPOCH}; use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, TableKeyRange}; @@ -1565,7 +1564,7 @@ async fn test_iter_log() { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; @@ -1580,7 +1579,7 @@ async fn test_iter_log() { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; // flush for about 10 times per epoch diff --git a/src/storage/hummock_test/src/sync_point_tests.rs b/src/storage/hummock_test/src/sync_point_tests.rs index f5ee41783813d..008c667ccedf5 100644 --- a/src/storage/hummock_test/src/sync_point_tests.rs +++ b/src/storage/hummock_test/src/sync_point_tests.rs @@ -242,7 +242,7 @@ async fn test_syncpoints_get_in_delete_range_boundary() { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - TableId::from(existing_table_id), + &[existing_table_id], ) .await; let (compact_ctx, filter_key_extractor_manager) = diff --git a/src/storage/hummock_trace/src/opts.rs b/src/storage/hummock_trace/src/opts.rs index 5d480cca96b58..ff8b43c15c458 100644 --- a/src/storage/hummock_trace/src/opts.rs +++ b/src/storage/hummock_trace/src/opts.rs @@ -109,7 +109,7 @@ pub struct TracedReadOptions { pub retention_seconds: Option, pub table_id: TracedTableId, pub read_version_from_backup: bool, - pub read_version_from_time_travel: bool, + pub read_committed: bool, } impl TracedReadOptions { @@ -125,7 +125,7 @@ impl TracedReadOptions { retention_seconds: None, table_id: TracedTableId { table_id }, read_version_from_backup: false, - read_version_from_time_travel: false, + read_committed: false, } } } diff --git a/src/storage/src/hummock/event_handler/hummock_event_handler.rs b/src/storage/src/hummock/event_handler/hummock_event_handler.rs index f2aa2ea7fd88d..1c8abc78ddffc 100644 --- a/src/storage/src/hummock/event_handler/hummock_event_handler.rs +++ b/src/storage/src/hummock/event_handler/hummock_event_handler.rs @@ -50,6 +50,7 @@ use crate::hummock::event_handler::{ ReadOnlyRwLockRef, }; use crate::hummock::local_version::pinned_version::PinnedVersion; +use crate::hummock::local_version::recent_versions::RecentVersions; use crate::hummock::store::version::{ HummockReadVersion, StagingData, StagingSstableInfo, VersionUpdate, }; @@ -197,7 +198,7 @@ pub struct HummockEventHandler { local_read_version_mapping: HashMap, version_update_notifier_tx: Arc>, - 
pinned_version: Arc>, + recent_versions: Arc>, write_conflict_detector: Option>, uploader: HummockUploader, @@ -355,7 +356,10 @@ impl HummockEventHandler { hummock_event_rx, version_update_rx, version_update_notifier_tx, - pinned_version: Arc::new(ArcSwap::from_pointee(pinned_version)), + recent_versions: Arc::new(ArcSwap::from_pointee(RecentVersions::new( + pinned_version, + storage_opts.max_cached_recent_versions_number, + ))), write_conflict_detector, read_version_mapping, local_read_version_mapping: Default::default(), @@ -371,8 +375,8 @@ impl HummockEventHandler { self.version_update_notifier_tx.clone() } - pub fn pinned_version(&self) -> Arc> { - self.pinned_version.clone() + pub fn recent_versions(&self) -> Arc> { + self.recent_versions.clone() } pub fn read_version_mapping(&self) -> ReadOnlyReadVersionMapping { @@ -529,17 +533,18 @@ impl HummockEventHandler { .await .expect("should not be empty"); let prev_version_id = latest_version_ref.id(); - let new_version = Self::resolve_version_update_info( + if let Some(new_version) = Self::resolve_version_update_info( latest_version_ref.clone(), version_update, None, - ); - info!( - ?prev_version_id, - new_version_id = ?new_version.id(), - "recv new version" - ); - latest_version = Some(new_version); + ) { + info!( + ?prev_version_id, + new_version_id = ?new_version.id(), + "recv new version" + ); + latest_version = Some(new_version); + } } self.apply_version_update( @@ -582,21 +587,21 @@ impl HummockEventHandler { .unwrap_or_else(|| self.uploader.hummock_version().clone()); let mut sst_delta_infos = vec![]; - let new_pinned_version = Self::resolve_version_update_info( + if let Some(new_pinned_version) = Self::resolve_version_update_info( pinned_version.clone(), version_payload, Some(&mut sst_delta_infos), - ); - - self.refiller - .start_cache_refill(sst_delta_infos, pinned_version, new_pinned_version); + ) { + self.refiller + .start_cache_refill(sst_delta_infos, pinned_version, new_pinned_version); + } } fn resolve_version_update_info( pinned_version: PinnedVersion, version_payload: HummockVersionUpdate, mut sst_delta_infos: Option<&mut Vec>, - ) -> PinnedVersion { + ) -> Option { let newly_pinned_version = match version_payload { HummockVersionUpdate::VersionDeltas(version_deltas) => { let mut version_to_apply = pinned_version.version().clone(); @@ -629,8 +634,9 @@ impl HummockEventHandler { .metrics .event_handler_on_apply_version_update .start_timer(); - self.pinned_version - .store(Arc::new(new_pinned_version.clone())); + self.recent_versions.rcu(|prev_recent_versions| { + prev_recent_versions.with_new_version(new_pinned_version.clone()) + }); { self.for_each_read_version( @@ -663,7 +669,10 @@ impl HummockEventHandler { // TODO: should we change the logic when supporting partial ckpt? 
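// The handler now keeps an `Arc<ArcSwap<RecentVersions>>` rather than a single pinned version:
// updates go through `ArcSwap::rcu`, which may re-run `RecentVersions::with_new_version` when
// another update races in, while readers stay on lock-free `load()`s. Because
// `resolve_version_update_info` now returns `Option<PinnedVersion>`, a payload whose version id
// does not advance (`new_pin_version` returns `None` in that case, see the `pinned_version.rs`
// hunk below) starts no cache refill and pushes nothing into `RecentVersions`.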
if let Some(sstable_object_id_manager) = &self.sstable_object_id_manager { sstable_object_id_manager.remove_watermark_object_id(TrackerId::Epoch( - self.pinned_version.load().visible_table_committed_epoch(), + self.recent_versions + .load() + .latest_version() + .visible_table_committed_epoch(), )); } @@ -789,13 +798,13 @@ impl HummockEventHandler { is_replicated, vnodes, } => { - let pinned_version = self.pinned_version.load(); + let pinned_version = self.recent_versions.load().latest_version().clone(); let instance_id = self.generate_instance_id(); let basic_read_version = Arc::new(RwLock::new( HummockReadVersion::new_with_replication_option( table_id, instance_id, - (**pinned_version).clone(), + pinned_version, is_replicated, vnodes, ), @@ -992,7 +1001,7 @@ mod tests { ); let event_tx = event_handler.event_sender(); - let latest_version = event_handler.pinned_version.clone(); + let latest_version = event_handler.recent_versions.clone(); let latest_version_update_tx = event_handler.version_update_notifier_tx.clone(); let send_clear = |version_id| { @@ -1018,12 +1027,15 @@ mod tests { let (old_version, new_version, refill_finish_tx) = refill_task_rx.recv().await.unwrap(); assert_eq!(old_version.version(), initial_version.version()); assert_eq!(new_version.version(), &version1); - assert_eq!(latest_version.load().version(), initial_version.version()); + assert_eq!( + latest_version.load().latest_version().version(), + initial_version.version() + ); let mut changed = latest_version_update_tx.subscribe(); refill_finish_tx.send(()).unwrap(); changed.changed().await.unwrap(); - assert_eq!(latest_version.load().version(), &version1); + assert_eq!(latest_version.load().latest_version().version(), &version1); } // test recovery with pending refill task @@ -1050,11 +1062,11 @@ mod tests { refill_task_rx.recv().await.unwrap(); assert_eq!(old_version3.version(), &version2); assert_eq!(new_version3.version(), &version3); - assert_eq!(latest_version.load().version(), &version1); + assert_eq!(latest_version.load().latest_version().version(), &version1); let rx = send_clear(version3.id); rx.await.unwrap(); - assert_eq!(latest_version.load().version(), &version3); + assert_eq!(latest_version.load().latest_version().version(), &version3); } async fn assert_pending(fut: &mut (impl Future + Unpin)) { @@ -1081,7 +1093,7 @@ mod tests { ))) .unwrap(); rx.await.unwrap(); - assert_eq!(latest_version.load().version(), &version5); + assert_eq!(latest_version.load().latest_version().version(), &version5); } } diff --git a/src/storage/src/hummock/event_handler/uploader/mod.rs b/src/storage/src/hummock/event_handler/uploader/mod.rs index 4494049d93b0b..90e6a9306930a 100644 --- a/src/storage/src/hummock/event_handler/uploader/mod.rs +++ b/src/storage/src/hummock/event_handler/uploader/mod.rs @@ -1643,7 +1643,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert_eq!(epoch1, uploader.max_committed_epoch()); } @@ -1672,7 +1673,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert!(uploader.data().syncing_data.is_empty()); assert_eq!(epoch1, uploader.max_committed_epoch()); @@ -1706,7 +1708,8 @@ pub(crate) mod tests { let 
new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert!(uploader.data().syncing_data.is_empty()); assert_eq!(epoch1, uploader.max_committed_epoch()); @@ -1730,11 +1733,21 @@ pub(crate) mod tests { let epoch4 = epoch3.next_epoch(); let epoch5 = epoch4.next_epoch(); let epoch6 = epoch5.next_epoch(); - let version1 = initial_pinned_version.new_pin_version(test_hummock_version(epoch1)); - let version2 = initial_pinned_version.new_pin_version(test_hummock_version(epoch2)); - let version3 = initial_pinned_version.new_pin_version(test_hummock_version(epoch3)); - let version4 = initial_pinned_version.new_pin_version(test_hummock_version(epoch4)); - let version5 = initial_pinned_version.new_pin_version(test_hummock_version(epoch5)); + let version1 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); + let version2 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch2)) + .unwrap(); + let version3 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch3)) + .unwrap(); + let version4 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch4)) + .unwrap(); + let version5 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch5)) + .unwrap(); uploader.start_epochs_for_test([epoch6]); uploader.init_instance(TEST_LOCAL_INSTANCE_ID, TEST_TABLE_ID, epoch6); diff --git a/src/storage/src/hummock/hummock_meta_client.rs b/src/storage/src/hummock/hummock_meta_client.rs index 4445a74884d5a..d123558acc50b 100644 --- a/src/storage/src/hummock/hummock_meta_client.rs +++ b/src/storage/src/hummock/hummock_meta_client.rs @@ -130,7 +130,11 @@ impl HummockMetaClient for MonitoredHummockMetaClient { self.meta_client.subscribe_compaction_event().await } - async fn get_version_by_epoch(&self, epoch: HummockEpoch) -> Result { - self.meta_client.get_version_by_epoch(epoch).await + async fn get_version_by_epoch( + &self, + epoch: HummockEpoch, + table_id: u32, + ) -> Result { + self.meta_client.get_version_by_epoch(epoch, table_id).await } } diff --git a/src/storage/src/hummock/iterator/change_log.rs b/src/storage/src/hummock/iterator/change_log.rs index 6fc99f29a80f3..ae8061c37b07d 100644 --- a/src/storage/src/hummock/iterator/change_log.rs +++ b/src/storage/src/hummock/iterator/change_log.rs @@ -527,8 +527,9 @@ mod tests { use bytes::Bytes; use itertools::Itertools; + use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; - use risingwave_common::hash::table_distribution::TableDistribution; + use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::test_epoch; use risingwave_hummock_sdk::key::{TableKey, UserKey}; use risingwave_hummock_sdk::EpochWithGap; @@ -699,7 +700,7 @@ mod tests { }, table_option: Default::default(), is_replicated: false, - vnodes: TableDistribution::all_vnodes(), + vnodes: Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into(), }) .await; let logs = gen_test_data(epoch_count, 10000, 0.05, 0.2); diff --git a/src/storage/src/hummock/local_version/mod.rs b/src/storage/src/hummock/local_version/mod.rs index 578e123c6574e..4a45c8dc9075c 100644 --- a/src/storage/src/hummock/local_version/mod.rs +++ b/src/storage/src/hummock/local_version/mod.rs @@ -13,3 +13,4 @@ // limitations under the License. 
pub mod pinned_version; +pub mod recent_versions; diff --git a/src/storage/src/hummock/local_version/pinned_version.rs b/src/storage/src/hummock/local_version/pinned_version.rs index 5ef53edcd26ef..afaafdf7cbe8a 100644 --- a/src/storage/src/hummock/local_version/pinned_version.rs +++ b/src/storage/src/hummock/local_version/pinned_version.rs @@ -92,22 +92,25 @@ impl PinnedVersion { } } - pub fn new_pin_version(&self, version: HummockVersion) -> Self { + pub fn new_pin_version(&self, version: HummockVersion) -> Option { assert!( version.id >= self.version.id, "pinning a older version {}. Current is {}", version.id, self.version.id ); + if version.id == self.version.id { + return None; + } let version_id = version.id; - PinnedVersion { + Some(PinnedVersion { version: Arc::new(version), guard: Arc::new(PinnedVersionGuard::new( version_id, self.guard.pinned_version_manager_tx.clone(), )), - } + }) } pub fn id(&self) -> HummockVersionId { diff --git a/src/storage/src/hummock/local_version/recent_versions.rs b/src/storage/src/hummock/local_version/recent_versions.rs new file mode 100644 index 0000000000000..8d3f1a015ad6a --- /dev/null +++ b/src/storage/src/hummock/local_version/recent_versions.rs @@ -0,0 +1,321 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
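The new `recent_versions` module below keeps the latest pinned version together with a bounded window of earlier versions that carried newly committed data (the cap comes from `max_cached_recent_versions_number`, which the `opts.rs` hunk later in this patch defaults to 60). A read at an older epoch can then be answered from this in-memory window before falling back to the time-travel path. The lookup rule is essentially a binary search for the newest cached version whose committed epoch does not exceed the requested epoch; here is a self-contained sketch of that rule, using plain `(version_id, committed_epoch)` tuples instead of `PinnedVersion` (names in the sketch are illustrative only, not part of the patch):

/// Entries are ordered oldest-first, mirroring "earlier version at the front".
fn safe_version_id(cached: &[(u64, u64)], epoch: u64) -> Option<u64> {
    // Index of the first cached version whose committed epoch exceeds `epoch`.
    let idx = cached.partition_point(|&(_, committed_epoch)| committed_epoch <= epoch);
    if idx == 0 {
        // Even the oldest cached version is too new for this epoch: the caller
        // must fetch a time-travel version instead.
        None
    } else {
        Some(cached[idx - 1].0)
    }
}

fn main() {
    let cached: &[(u64, u64)] = &[(1, 100), (3, 200), (4, 300)];
    assert_eq!(safe_version_id(cached, 250), Some(3)); // newest version committed at or before 250
    assert_eq!(safe_version_id(cached, 50), None); // older than everything cached: time travel
}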
+ +use std::cmp::Ordering; + +use risingwave_common::catalog::TableId; +use risingwave_hummock_sdk::HummockEpoch; + +use crate::hummock::local_version::pinned_version::PinnedVersion; + +pub struct RecentVersions { + latest_version: PinnedVersion, + is_latest_committed: bool, + recent_versions: Vec, // earlier version at the front + max_version_num: usize, +} + +impl RecentVersions { + pub fn new(version: PinnedVersion, max_version_num: usize) -> Self { + assert!(max_version_num > 0); + Self { + latest_version: version, + is_latest_committed: true, // The first version is always treated as committed epochs + recent_versions: Vec::new(), + max_version_num, + } + } + + fn has_table_committed(&self, new_version: &PinnedVersion) -> bool { + let mut has_table_committed = false; + for (table_id, info) in new_version.version().state_table_info.info() { + if let Some(prev_info) = self + .latest_version + .version() + .state_table_info + .info() + .get(table_id) + { + match info.committed_epoch.cmp(&prev_info.committed_epoch) { + Ordering::Less => { + unreachable!( + "table {} has regress committed epoch {}, prev committed epoch {}", + table_id, info.committed_epoch, prev_info.committed_epoch + ); + } + Ordering::Equal => {} + Ordering::Greater => { + has_table_committed = true; + } + } + } else { + has_table_committed = true; + } + } + has_table_committed + } + + #[must_use] + pub fn with_new_version(&self, version: PinnedVersion) -> Self { + assert!(version.version().id > self.latest_version.version().id); + let is_committed = self.has_table_committed(&version); + let recent_versions = if self.is_latest_committed { + let prev_recent_versions = if self.recent_versions.len() >= self.max_version_num { + assert_eq!(self.recent_versions.len(), self.max_version_num); + &self.recent_versions[1..] + } else { + &self.recent_versions[..] + }; + let mut recent_versions = Vec::with_capacity(prev_recent_versions.len() + 1); + recent_versions.extend(prev_recent_versions.iter().cloned()); + recent_versions.push(self.latest_version.clone()); + recent_versions + } else { + self.recent_versions.clone() + }; + Self { + latest_version: version, + is_latest_committed: is_committed, + recent_versions, + max_version_num: self.max_version_num, + } + } + + pub fn latest_version(&self) -> &PinnedVersion { + &self.latest_version + } + + /// Return the latest version that is safe to read `epoch` on `table_id`. 
+ /// + /// `safe to read` means that the `committed_epoch` of the `table_id` in the version won't be greater than the given `epoch` + pub fn get_safe_version( + &self, + table_id: TableId, + epoch: HummockEpoch, + ) -> Option { + if let Some(info) = self + .latest_version + .version() + .state_table_info + .info() + .get(&table_id) + { + if info.committed_epoch <= epoch { + Some(self.latest_version.clone()) + } else { + self.get_safe_version_from_recent(table_id, epoch) + } + } else { + None + } + } + + fn get_safe_version_from_recent( + &self, + table_id: TableId, + epoch: HummockEpoch, + ) -> Option { + if cfg!(debug_assertions) { + assert!( + epoch + < self + .latest_version + .version() + .state_table_info + .info() + .get(&table_id) + .expect("should exist") + .committed_epoch + ); + } + let result = self.recent_versions.binary_search_by(|version| { + let committed_epoch = version + .version() + .state_table_info + .info() + .get(&table_id) + .map(|info| info.committed_epoch); + if let Some(committed_epoch) = committed_epoch { + committed_epoch.cmp(&epoch) + } else { + // We have ensured that the table_id exists in the latest version, so if the table_id does not exist in a + // previous version, the table must have not created yet, and therefore has less ordering. + Ordering::Less + } + }); + match result { + Ok(index) => Some(self.recent_versions[index].clone()), + Err(index) => { + // `index` is index of the first version that has `committed_epoch` greater than `epoch` + // or `index` equals `recent_version.len()` when `epoch` is greater than all `committed_epoch` + let version = if index >= self.recent_versions.len() { + assert_eq!(index, self.recent_versions.len()); + self.recent_versions.last().cloned() + } else if index == 0 { + // The earliest version has a higher committed epoch + None + } else { + self.recent_versions.get(index - 1).cloned() + }; + version.and_then(|version| { + if version + .version() + .state_table_info + .info() + .contains_key(&table_id) + { + Some(version) + } else { + // if the table does not exist in the version, return `None` to try get a time travel version + None + } + }) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use risingwave_common::catalog::TableId; + use risingwave_hummock_sdk::version::HummockVersion; + use risingwave_pb::hummock::{PbHummockVersion, StateTableInfo}; + use tokio::sync::mpsc::unbounded_channel; + + use crate::hummock::local_version::pinned_version::PinnedVersion; + use crate::hummock::local_version::recent_versions::RecentVersions; + + const TEST_TABLE_ID1: TableId = TableId::new(233); + const TEST_TABLE_ID2: TableId = TableId::new(234); + + fn gen_pin_version( + version_id: u64, + table_committed_epoch: impl IntoIterator, + ) -> PinnedVersion { + PinnedVersion::new( + HummockVersion::from_rpc_protobuf(&PbHummockVersion { + id: version_id, + state_table_info: HashMap::from_iter(table_committed_epoch.into_iter().map( + |(table_id, committed_epoch)| { + ( + table_id.table_id, + StateTableInfo { + committed_epoch, + safe_epoch: 0, + compaction_group_id: 0, + }, + ) + }, + )), + ..Default::default() + }), + unbounded_channel().0, + ) + } + + fn assert_query_equal( + recent_version: &RecentVersions, + expected: &[(TableId, u64, Option<&PinnedVersion>)], + ) { + for (table_id, epoch, expected_version) in expected + .iter() + .cloned() + .chain([(TEST_TABLE_ID1, 0, None), (TEST_TABLE_ID2, 0, None)]) + { + let version = recent_version.get_safe_version(table_id, epoch); + assert_eq!( + 
version.as_ref().map(|version| version.id()), + expected_version.map(|version| version.id()) + ); + } + } + + #[test] + fn test_basic() { + let epoch1 = 233; + let epoch0 = epoch1 - 1; + let epoch2 = epoch1 + 1; + let epoch3 = epoch2 + 1; + let epoch4 = epoch3 + 1; + let version1 = gen_pin_version(1, [(TEST_TABLE_ID1, epoch1)]); + // with at most 2 historical versions + let recent_versions = RecentVersions::new(version1.clone(), 2); + assert!(recent_versions.recent_versions.is_empty()); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version1)), + ], + ); + + let recent_versions = + recent_versions.with_new_version(gen_pin_version(2, [(TEST_TABLE_ID1, epoch1)])); + assert_eq!(recent_versions.recent_versions.len(), 1); + assert!(!recent_versions.is_latest_committed); + + let version3 = gen_pin_version(3, [(TEST_TABLE_ID1, epoch2)]); + let recent_versions = recent_versions.with_new_version(version3.clone()); + assert_eq!(recent_versions.recent_versions.len(), 1); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version3)), + (TEST_TABLE_ID1, epoch3, Some(&version3)), + ], + ); + + let version4 = gen_pin_version(4, [(TEST_TABLE_ID2, epoch1), (TEST_TABLE_ID1, epoch2)]); + let recent_versions = recent_versions.with_new_version(version4.clone()); + assert_eq!(recent_versions.recent_versions.len(), 2); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version4)), + (TEST_TABLE_ID1, epoch3, Some(&version4)), + (TEST_TABLE_ID2, epoch0, None), + (TEST_TABLE_ID2, epoch1, Some(&version4)), + (TEST_TABLE_ID2, epoch2, Some(&version4)), + ], + ); + + let version5 = gen_pin_version(5, [(TEST_TABLE_ID2, epoch1), (TEST_TABLE_ID1, epoch3)]); + let recent_versions = recent_versions.with_new_version(version5.clone()); + assert_eq!(recent_versions.recent_versions.len(), 2); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, None), + (TEST_TABLE_ID1, epoch2, Some(&version4)), + (TEST_TABLE_ID1, epoch3, Some(&version5)), + (TEST_TABLE_ID1, epoch4, Some(&version5)), + (TEST_TABLE_ID2, epoch0, None), + (TEST_TABLE_ID2, epoch1, Some(&version5)), + (TEST_TABLE_ID2, epoch2, Some(&version5)), + ], + ); + } +} diff --git a/src/storage/src/hummock/mod.rs b/src/storage/src/hummock/mod.rs index 14ac9532c8cb3..f10b6deee503e 100644 --- a/src/storage/src/hummock/mod.rs +++ b/src/storage/src/hummock/mod.rs @@ -172,8 +172,7 @@ pub fn get_from_batch( read_options: &ReadOptions, local_stats: &mut StoreLocalStatistic, ) -> Option<(HummockValue, EpochWithGap)> { - imm.get(table_key, read_epoch, read_options).map(|v| { + imm.get(table_key, read_epoch, read_options).inspect(|_| { local_stats.get_shared_buffer_hit_counts += 1; - v }) } diff --git a/src/storage/src/hummock/sstable/bloom.rs b/src/storage/src/hummock/sstable/bloom.rs index f2ca47ba00e12..b38a4c10ada30 100644 --- a/src/storage/src/hummock/sstable/bloom.rs +++ b/src/storage/src/hummock/sstable/bloom.rs @@ -102,7 +102,7 @@ impl BloomFilterReader { true } else { let nbits = self.data.bit_len(); - let delta 
= (h >> 17) | (h << 15); + let delta = h.rotate_left(15); for _ in 0..self.k { let bit_pos = h % (nbits as u32); if !self.data.get_bit(bit_pos as usize) { @@ -171,7 +171,7 @@ impl FilterBuilder for BloomFilterBuilder { filter.resize(nbytes, 0); for h in &self.key_hash_entries { let mut h = *h; - let delta = (h >> 17) | (h << 15); + let delta = h.rotate_left(15); for _ in 0..k { let bit_pos = (h as usize) % nbits; filter.set_bit(bit_pos, true); diff --git a/src/storage/src/hummock/sstable_store.rs b/src/storage/src/hummock/sstable_store.rs index b9f29c5740e4b..d1367b92a9ce8 100644 --- a/src/storage/src/hummock/sstable_store.rs +++ b/src/storage/src/hummock/sstable_store.rs @@ -26,7 +26,9 @@ use foyer::{ use futures::{future, StreamExt}; use itertools::Itertools; use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::{HummockSstableObjectId, OBJECT_SUFFIX}; +use risingwave_hummock_sdk::{ + HummockSstableObjectId, HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH, OBJECT_SUFFIX, +}; use risingwave_hummock_trace::TracedCachePolicy; use risingwave_object_store::object::{ ObjectError, ObjectMetadataIter, ObjectResult, ObjectStoreRef, ObjectStreamingUploader, @@ -519,10 +521,21 @@ impl SstableStore { let obj_prefix = self .store .get_object_prefix(object_id, self.use_new_object_prefix_strategy); - format!( - "{}/{}{}.{}", - self.path, obj_prefix, object_id, OBJECT_SUFFIX - ) + let mut path = String::with_capacity( + self.path.len() + + "/".len() + + obj_prefix.len() + + HUMMOCK_SSTABLE_OBJECT_ID_MAX_DECIMAL_LENGTH + + ".".len() + + OBJECT_SUFFIX.len(), + ); + path.push_str(&self.path); + path.push('/'); + path.push_str(&obj_prefix); + path.push_str(&object_id.to_string()); + path.push('.'); + path.push_str(OBJECT_SUFFIX); + path } pub fn get_object_id_from_path(path: &str) -> HummockSstableObjectId { diff --git a/src/storage/src/hummock/store/hummock_storage.rs b/src/storage/src/hummock/store/hummock_storage.rs index b64752fca7fd6..b4924a5dca60f 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -14,7 +14,7 @@ use std::collections::HashSet; use std::future::Future; -use std::ops::{Bound, Deref}; +use std::ops::Bound; use std::sync::Arc; use arc_swap::ArcSwap; @@ -50,9 +50,10 @@ use crate::hummock::event_handler::{ }; use crate::hummock::iterator::change_log::ChangeLogIterator; use crate::hummock::local_version::pinned_version::{start_pinned_version_worker, PinnedVersion}; +use crate::hummock::local_version::recent_versions::RecentVersions; use crate::hummock::observer_manager::HummockObserverNode; use crate::hummock::time_travel_version_cache::SimpleTimeTravelVersionCache; -use crate::hummock::utils::{validate_safe_epoch, wait_for_epoch}; +use crate::hummock::utils::wait_for_epoch; use crate::hummock::write_limiter::{WriteLimiter, WriteLimiterRef}; use crate::hummock::{ HummockEpoch, HummockError, HummockResult, HummockStorageIterator, HummockStorageRevIterator, @@ -97,7 +98,7 @@ pub struct HummockStorage { version_update_notifier_tx: Arc>, - pinned_version: Arc>, + recent_versions: Arc>, hummock_version_reader: HummockVersionReader, @@ -223,7 +224,7 @@ impl HummockStorage { version_update_notifier_tx: hummock_event_handler.version_update_notifier_tx(), hummock_event_sender: event_tx.clone(), _version_update_sender: version_update_tx, - pinned_version: hummock_event_handler.pinned_version(), + recent_versions: hummock_event_handler.recent_versions(), hummock_version_reader: HummockVersionReader::new( 
sstable_store, state_store_metrics.clone(), @@ -260,15 +261,9 @@ impl HummockStorage { ) -> StorageResult> { let key_range = (Bound::Included(key.clone()), Bound::Included(key.clone())); - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? - }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; if is_empty_key_range(&key_range) { return Ok(None); @@ -285,15 +280,9 @@ impl HummockStorage { epoch: u64, read_options: ReadOptions, ) -> StorageResult { - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? - }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; self.hummock_version_reader .iter(key_range, epoch, read_options, read_version_tuple) @@ -306,36 +295,28 @@ impl HummockStorage { epoch: u64, read_options: ReadOptions, ) -> StorageResult { - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? 
- }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; self.hummock_version_reader .rev_iter(key_range, epoch, read_options, read_version_tuple, None) .await } - async fn build_read_version_by_time_travel( + async fn get_time_travel_version( &self, epoch: u64, table_id: TableId, - key_range: TableKeyRange, - ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + ) -> StorageResult { let fetch = async { let pb_version = self .hummock_meta_client - .get_version_by_epoch(epoch) + .get_version_by_epoch(epoch, table_id.table_id()) .await .inspect_err(|e| tracing::error!("{}", e.to_report_string())) .map_err(|e| HummockError::meta_error(e.to_report_string()))?; let version = HummockVersion::from_rpc_protobuf(&pb_version); - validate_safe_epoch(&version, table_id, epoch)?; let (tx, _rx) = unbounded_channel(); Ok(PinnedVersion::new(version, tx)) }; @@ -343,9 +324,24 @@ impl HummockStorage { .simple_time_travel_version_cache .get_or_insert(epoch, fetch) .await?; - Ok(get_committed_read_version_tuple( - version, table_id, key_range, epoch, - )) + Ok(version) + } + + async fn build_read_version_tuple( + &self, + epoch: u64, + key_range: TableKeyRange, + read_options: &ReadOptions, + ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + if read_options.read_version_from_backup { + self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) + .await + } else if read_options.read_committed { + self.build_read_version_tuple_from_committed(epoch, read_options.table_id, key_range) + .await + } else { + self.build_read_version_tuple_from_all(epoch, read_options.table_id, key_range) + } } async fn build_read_version_tuple_from_backup( @@ -359,16 +355,12 @@ impl HummockStorage { .try_get_hummock_version(table_id, epoch) .await { - Ok(Some(backup_version)) => { - validate_safe_epoch(backup_version.version(), table_id, epoch)?; - - Ok(get_committed_read_version_tuple( - backup_version, - table_id, - key_range, - epoch, - )) - } + Ok(Some(backup_version)) => Ok(get_committed_read_version_tuple( + backup_version, + table_id, + key_range, + epoch, + )), Ok(None) => Err(HummockError::read_backup_error(format!( "backup include epoch {} not found", epoch @@ -378,27 +370,47 @@ impl HummockStorage { } } - fn build_read_version_tuple( + async fn build_read_version_tuple_from_committed( &self, epoch: u64, table_id: TableId, key_range: TableKeyRange, ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { - let pinned_version = self.pinned_version.load(); - validate_safe_epoch(pinned_version.version(), table_id, epoch)?; - let table_committed_epoch = pinned_version + let version = match self + .recent_versions + .load() + .get_safe_version(table_id, epoch) + { + Some(version) => version, + None => self.get_time_travel_version(epoch, table_id).await?, + }; + Ok(get_committed_read_version_tuple( + version, table_id, key_range, epoch, + )) + } + + fn build_read_version_tuple_from_all( + &self, + epoch: u64, + table_id: TableId, + key_range: TableKeyRange, + ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + let pinned_version = self.recent_versions.load().latest_version().clone(); + let info = pinned_version .version() .state_table_info .info() - .get(&table_id) - .map(|info| info.committed_epoch); + .get(&table_id); // check epoch if lower mce - let ret = if let Some(table_committed_epoch) = table_committed_epoch - && epoch <= table_committed_epoch + let ret = if let Some(info) = info + && epoch <= info.committed_epoch { 
+ if epoch < info.safe_epoch { + return Err(HummockError::expired_epoch(table_id, info.safe_epoch, epoch).into()); + } // read committed_version directly without build snapshot - get_committed_read_version_tuple((**pinned_version).clone(), table_id, key_range, epoch) + get_committed_read_version_tuple(pinned_version, table_id, key_range, epoch) } else { let vnode = vnode(&key_range); let mut matched_replicated_read_version_cnt = 0; @@ -431,6 +443,7 @@ impl HummockStorage { // When the system has just started and no state has been created, the memory state // may be empty if read_version_vec.is_empty() { + let table_committed_epoch = info.map(|info| info.committed_epoch); if matched_replicated_read_version_cnt > 0 { tracing::warn!( "Read(table_id={} vnode={} epoch={}) is not allowed on replicated read version ({} found). Fall back to committed version (epoch={:?})", @@ -449,12 +462,7 @@ impl HummockStorage { table_committed_epoch ); } - get_committed_read_version_tuple( - (**pinned_version).clone(), - table_id, - key_range, - epoch, - ) + get_committed_read_version_tuple(pinned_version, table_id, key_range, epoch) } else { if read_version_vec.len() != 1 { let read_version_vnodes = read_version_vec @@ -538,7 +546,7 @@ impl HummockStorage { } pub fn get_pinned_version(&self) -> PinnedVersion { - self.pinned_version.load().deref().deref().clone() + self.recent_versions.load().latest_version().clone() } pub fn backup_reader(&self) -> BackupReaderRef { @@ -604,7 +612,7 @@ impl StateStoreRead for HummockStorage { key_range: TableKeyRange, options: ReadLogOptions, ) -> StorageResult { - let version = (**self.pinned_version.load()).clone(); + let version = self.recent_versions.load().latest_version().clone(); let iter = self .hummock_version_reader .iter_log(version, epoch_range, key_range, options) @@ -655,8 +663,9 @@ impl HummockStorage { epoch: u64, ) -> StorageResult { let table_ids = self - .pinned_version + .recent_versions .load() + .latest_version() .version() .state_table_info .info() @@ -675,7 +684,7 @@ impl HummockStorage { .send(HummockVersionUpdate::PinnedVersion(Box::new(version))) .unwrap(); loop { - if self.pinned_version.load().id() >= version_id { + if self.recent_versions.load().latest_version().id() >= version_id { break; } @@ -686,7 +695,7 @@ impl HummockStorage { pub async fn wait_version(&self, version: HummockVersion) { use tokio::task::yield_now; loop { - if self.pinned_version.load().id() >= version.id { + if self.recent_versions.load().latest_version().id() >= version.id { break; } @@ -736,7 +745,7 @@ impl HummockStorage { pub async fn wait_version_update(&self, old_id: HummockVersionId) -> HummockVersionId { use tokio::task::yield_now; loop { - let cur_id = self.pinned_version.load().id(); + let cur_id = self.recent_versions.load().latest_version().id(); if cur_id > old_id { return cur_id; } diff --git a/src/storage/src/hummock/utils.rs b/src/storage/src/hummock/utils.rs index 3f2d1f989f529..c2f6cbafed79b 100644 --- a/src/storage/src/hummock/utils.rs +++ b/src/storage/src/hummock/utils.rs @@ -30,11 +30,10 @@ use risingwave_hummock_sdk::key::{ bound_table_key_range, EmptySliceRef, FullKey, TableKey, UserKey, }; use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::version::HummockVersion; use risingwave_hummock_sdk::{can_concat, HummockEpoch}; use tokio::sync::oneshot::{channel, Receiver, Sender}; -use super::{HummockError, HummockResult, SstableStoreRef}; +use super::{HummockError, SstableStoreRef}; use crate::error::StorageResult; use 
crate::hummock::CachePolicy; use crate::mem_table::{KeyOp, MemTableError}; @@ -72,24 +71,6 @@ where !too_left && !too_right } -pub fn validate_safe_epoch( - version: &HummockVersion, - table_id: TableId, - epoch: u64, -) -> HummockResult<()> { - if let Some(info) = version.state_table_info.info().get(&table_id) - && epoch < info.safe_epoch - { - return Err(HummockError::expired_epoch( - table_id, - info.safe_epoch, - epoch, - )); - } - - Ok(()) -} - pub fn filter_single_sst(info: &SstableInfo, table_id: TableId, table_key_range: &R) -> bool where R: RangeBounds>, diff --git a/src/storage/src/lib.rs b/src/storage/src/lib.rs index e11d3e1cee1ca..779062767c7ae 100644 --- a/src/storage/src/lib.rs +++ b/src/storage/src/lib.rs @@ -18,7 +18,6 @@ #![feature(extract_if)] #![feature(coroutines)] #![feature(hash_extract_if)] -#![feature(lint_reasons)] #![feature(proc_macro_hygiene)] #![feature(stmt_expr_attributes)] #![feature(strict_provenance)] diff --git a/src/storage/src/opts.rs b/src/storage/src/opts.rs index f6d6f31fb3a4f..a3a787f55c97d 100644 --- a/src/storage/src/opts.rs +++ b/src/storage/src/opts.rs @@ -63,6 +63,8 @@ pub struct StorageOpts { /// max memory usage for large query. pub prefetch_buffer_capacity_mb: usize, + pub max_cached_recent_versions_number: usize, + pub max_prefetch_block_number: usize, pub disable_remote_compactor: bool, @@ -170,6 +172,10 @@ impl From<(&RwConfig, &SystemParamsReader, &StorageMemoryConfig)> for StorageOpt meta_cache_shard_num: s.meta_cache_shard_num, meta_cache_eviction_config: s.meta_cache_eviction_config.clone(), prefetch_buffer_capacity_mb: s.prefetch_buffer_capacity_mb, + max_cached_recent_versions_number: c + .storage + .max_cached_recent_versions_number + .unwrap_or(60), max_prefetch_block_number: c.storage.max_prefetch_block_number, disable_remote_compactor: c.storage.disable_remote_compactor, share_buffer_upload_concurrency: c.storage.share_buffer_upload_concurrency, diff --git a/src/storage/src/store.rs b/src/storage/src/store.rs index 91f79231f6939..ab80f712570ca 100644 --- a/src/storage/src/store.rs +++ b/src/storage/src/store.rs @@ -502,7 +502,7 @@ pub struct ReadOptions { /// Read from historical hummock version of meta snapshot backup. /// It should only be used by `StorageTable` for batch query. 
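// The rename just below (`read_version_from_time_travel` -> `read_committed`) widens the old
// time-travel-only flag: in the `storage_table.rs` hunk of this patch both
// `HummockReadEpoch::TimeTravel(_)` and `HummockReadEpoch::Committed(_)` now set it, and on the
// storage side `build_read_version_tuple_from_committed` first asks the cached `RecentVersions`
// for a safe version, only falling back to fetching a time-travel version from meta when no
// cached version can serve the requested epoch.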
pub read_version_from_backup: bool, - pub read_version_from_time_travel: bool, + pub read_committed: bool, } impl From for ReadOptions { @@ -515,7 +515,7 @@ impl From for ReadOptions { retention_seconds: value.retention_seconds, table_id: value.table_id.into(), read_version_from_backup: value.read_version_from_backup, - read_version_from_time_travel: value.read_version_from_time_travel, + read_committed: value.read_committed, } } } @@ -530,7 +530,7 @@ impl From for TracedReadOptions { retention_seconds: value.retention_seconds, table_id: value.table_id.into(), read_version_from_backup: value.read_version_from_backup, - read_version_from_time_travel: value.read_version_from_time_travel, + read_committed: value.read_committed, } } } diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index 7a0ad76cce4a5..8c5f432f46c57 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -361,7 +361,10 @@ impl StorageTableInner { ) -> StorageResult> { let epoch = wait_epoch.get_epoch(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); - let read_time_travel = matches!(wait_epoch, HummockReadEpoch::TimeTravel(_)); + let read_committed = matches!( + wait_epoch, + HummockReadEpoch::TimeTravel(_) | HummockReadEpoch::Committed(_) + ); self.store.try_wait_epoch(wait_epoch).await?; let serialized_pk = serialize_pk_with_vnode( &pk, @@ -382,7 +385,7 @@ impl StorageTableInner { retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, read_version_from_backup: read_backup, - read_version_from_time_travel: read_time_travel, + read_committed, cache_policy: CachePolicy::Fill(CacheContext::Default), ..Default::default() }; @@ -487,14 +490,17 @@ impl StorageTableInner { let iterators: Vec<_> = try_join_all(table_key_ranges.map(|table_key_range| { let prefix_hint = prefix_hint.clone(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); - let read_time_travel = matches!(wait_epoch, HummockReadEpoch::TimeTravel(_)); + let read_committed = matches!( + wait_epoch, + HummockReadEpoch::TimeTravel(_) | HummockReadEpoch::Committed(_) + ); async move { let read_options = ReadOptions { prefix_hint, retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, read_version_from_backup: read_backup, - read_version_from_time_travel: read_time_travel, + read_committed, prefetch_options, cache_policy, ..Default::default() diff --git a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs index 5497b989a0873..c84db97002b02 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs @@ -16,7 +16,7 @@ use std::future::Future; use std::ops::Bound; use std::ops::Bound::{Excluded, Included, Unbounded}; use std::pin::Pin; -use std::time::{Duration, Instant}; +use std::time::Duration; use anyhow::anyhow; use await_tree::InstrumentAwait; @@ -53,18 +53,28 @@ use crate::common::log_store_impl::kv_log_store::serde::{ }; use crate::common::log_store_impl::kv_log_store::KvLogStoreMetrics; -type RewindBackoffPolicy = impl Iterator; pub(crate) const REWIND_BASE_DELAY: Duration = Duration::from_secs(1); pub(crate) const REWIND_BACKOFF_FACTOR: u64 = 2; pub(crate) const REWIND_MAX_DELAY: Duration = Duration::from_secs(180); -fn initial_rewind_backoff_policy() -> RewindBackoffPolicy { - 
tokio_retry::strategy::ExponentialBackoff::from_millis(REWIND_BASE_DELAY.as_millis() as _) - .factor(REWIND_BACKOFF_FACTOR) - .max_delay(REWIND_MAX_DELAY) - .map(tokio_retry::strategy::jitter) +mod rewind_backoff_policy { + use std::time::Duration; + + use crate::common::log_store_impl::kv_log_store::{ + REWIND_BACKOFF_FACTOR, REWIND_BASE_DELAY, REWIND_MAX_DELAY, + }; + + pub(super) type RewindBackoffPolicy = impl Iterator; + pub(super) fn initial_rewind_backoff_policy() -> RewindBackoffPolicy { + tokio_retry::strategy::ExponentialBackoff::from_millis(REWIND_BASE_DELAY.as_millis() as _) + .factor(REWIND_BACKOFF_FACTOR) + .max_delay(REWIND_MAX_DELAY) + .map(tokio_retry::strategy::jitter) + } } +use rewind_backoff_policy::*; + struct RewindDelay { last_rewind_truncate_offset: Option, backoff_policy: RewindBackoffPolicy, @@ -218,58 +228,71 @@ impl bool> AutoRebuildStateStoreReadIter } } -type TimeoutAutoRebuildIter = - AutoRebuildStateStoreReadIter bool + Send>; +mod timeout_auto_rebuild { + use std::time::{Duration, Instant}; -async fn iter_with_timeout_rebuild( - state_store: S, - range: TableKeyRange, - epoch: HummockEpoch, - options: ReadOptions, - timeout: Duration, -) -> StorageResult> { - const CHECK_TIMEOUT_PERIOD: usize = 100; - // use a struct here to avoid accidental copy instead of move on primitive usize - struct Count(usize); - let mut check_count = Count(0); - let mut total_count = Count(0); - let mut curr_iter_item_count = Count(0); - let mut start_time = Instant::now(); - let initial_start_time = start_time; - AutoRebuildStateStoreReadIter::new( - state_store, - move || { - check_count.0 += 1; - curr_iter_item_count.0 += 1; - total_count.0 += 1; - if check_count.0 == CHECK_TIMEOUT_PERIOD { - check_count.0 = 0; - if start_time.elapsed() > timeout { - let prev_iter_item_count = curr_iter_item_count.0; - curr_iter_item_count.0 = 0; - start_time = Instant::now(); - info!( - table_id = options.table_id.table_id, - iter_exist_time_secs = initial_start_time.elapsed().as_secs(), - prev_iter_item_count, - total_iter_item_count = total_count.0, - "kv log store iter is rebuilt" - ); - true + use risingwave_hummock_sdk::key::TableKeyRange; + use risingwave_hummock_sdk::HummockEpoch; + use risingwave_storage::error::StorageResult; + use risingwave_storage::store::{ReadOptions, StateStoreRead}; + + use crate::common::log_store_impl::kv_log_store::reader::AutoRebuildStateStoreReadIter; + + pub(super) type TimeoutAutoRebuildIter = + AutoRebuildStateStoreReadIter bool + Send>; + + pub(super) async fn iter_with_timeout_rebuild( + state_store: S, + range: TableKeyRange, + epoch: HummockEpoch, + options: ReadOptions, + timeout: Duration, + ) -> StorageResult> { + const CHECK_TIMEOUT_PERIOD: usize = 100; + // use a struct here to avoid accidental copy instead of move on primitive usize + struct Count(usize); + let mut check_count = Count(0); + let mut total_count = Count(0); + let mut curr_iter_item_count = Count(0); + let mut start_time = Instant::now(); + let initial_start_time = start_time; + AutoRebuildStateStoreReadIter::new( + state_store, + move || { + check_count.0 += 1; + curr_iter_item_count.0 += 1; + total_count.0 += 1; + if check_count.0 == CHECK_TIMEOUT_PERIOD { + check_count.0 = 0; + if start_time.elapsed() > timeout { + let prev_iter_item_count = curr_iter_item_count.0; + curr_iter_item_count.0 = 0; + start_time = Instant::now(); + info!( + table_id = options.table_id.table_id, + iter_exist_time_secs = initial_start_time.elapsed().as_secs(), + prev_iter_item_count, + 
total_iter_item_count = total_count.0, + "kv log store iter is rebuilt" + ); + true + } else { + false + } } else { false } - } else { - false - } - }, - range, - epoch, - options, - ) - .await + }, + range, + epoch, + options, + ) + .await + } } +use timeout_auto_rebuild::*; + impl bool + Send> StateStoreIter for AutoRebuildStateStoreReadIter { diff --git a/src/stream/src/common/log_store_impl/kv_log_store/serde.rs b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs index 92a3caf4cd2e3..17ab103d758b4 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/serde.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs @@ -25,7 +25,7 @@ use itertools::Itertools; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::ColumnDesc; -use risingwave_common::hash::VirtualNode; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::types::{DataType, ScalarImpl}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -42,7 +42,7 @@ use risingwave_storage::error::StorageResult; use risingwave_storage::row_serde::row_serde_util::{serialize_pk, serialize_pk_with_vnode}; use risingwave_storage::row_serde::value_serde::ValueRowSerdeNew; use risingwave_storage::store::{StateStoreIterExt, StateStoreReadIter}; -use risingwave_storage::table::{compute_vnode, TableDistribution, SINGLETON_VNODE}; +use risingwave_storage::table::{compute_vnode, SINGLETON_VNODE}; use rw_futures_util::select_all; use crate::common::log_store_impl::kv_log_store::{ @@ -201,8 +201,7 @@ impl LogStoreRowSerde { let vnodes = match vnodes { Some(vnodes) => vnodes, - - None => TableDistribution::singleton_vnode_bitmap(), + None => Bitmap::singleton().into(), }; // epoch and seq_id. 
The seq_id of barrier is set null, and therefore the second order type @@ -216,7 +215,7 @@ impl LogStoreRowSerde { ); let dist_key_indices = if dist_key_indices.is_empty() { - if &vnodes != TableDistribution::singleton_vnode_bitmap_ref() { + if !vnodes.is_singleton() { warn!( ?vnodes, "singleton log store gets non-singleton vnode bitmap" @@ -946,7 +945,7 @@ mod tests { use risingwave_storage::store::{ FromStreamStateStoreIter, StateStoreIterItem, StateStoreReadIter, }; - use risingwave_storage::table::DEFAULT_VNODE; + use risingwave_storage::table::SINGLETON_VNODE; use tokio::sync::oneshot; use tokio::sync::oneshot::Sender; @@ -1024,7 +1023,7 @@ mod tests { seq_id += 1; } - let (key, encoded_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, false); + let (key, encoded_barrier) = serde.serialize_barrier(epoch, SINGLETON_VNODE, false); let key = remove_vnode_prefix(&key.0); match serde.deserialize(&encoded_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { @@ -1062,7 +1061,8 @@ mod tests { seq_id += 1; } - let (key, encoded_checkpoint_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, true); + let (key, encoded_checkpoint_barrier) = + serde.serialize_barrier(epoch, SINGLETON_VNODE, true); let key = remove_vnode_prefix(&key.0); match serde.deserialize(&encoded_checkpoint_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { @@ -1200,7 +1200,7 @@ mod tests { ) { let (ops, rows) = gen_test_data(base); let first_barrier = { - let (key, value) = serde.serialize_barrier(EPOCH0, DEFAULT_VNODE, true); + let (key, value) = serde.serialize_barrier(EPOCH0, SINGLETON_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH0), value)) }; let stream = stream::once(async move { first_barrier }); @@ -1210,7 +1210,7 @@ mod tests { let stream = stream.chain(stream::once({ let serde = serde.clone(); async move { - let (key, value) = serde.serialize_barrier(EPOCH1, DEFAULT_VNODE, false); + let (key, value) = serde.serialize_barrier(EPOCH1, SINGLETON_VNODE, false); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH1), value)) } })); @@ -1218,7 +1218,7 @@ mod tests { gen_row_stream(serde.clone(), ops.clone(), rows.clone(), EPOCH2, seq_id); let stream = stream.chain(row_stream).chain(stream::once({ async move { - let (key, value) = serde.serialize_barrier(EPOCH2, DEFAULT_VNODE, true); + let (key, value) = serde.serialize_barrier(EPOCH2, SINGLETON_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH2), value)) } })); diff --git a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs index 5fc10cd0cc58a..3114c22e63323 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs @@ -143,7 +143,7 @@ pub(crate) fn gen_multi_vnode_stream_chunks( .collect_vec(); let (ops, rows) = gen_sized_test_data(base, max_count); for (op, row) in zip_eq(ops, rows) { - let vnode = VirtualNode::compute_row(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); + let vnode = VirtualNode::compute_row_for_test(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); let (ops, builder) = &mut data_builder[vnode.to_index() % MOD_COUNT]; ops.push(op); assert!(builder.append_one_row(row).is_none()); @@ -177,9 +177,9 @@ pub(crate) fn gen_test_log_store_table(pk_info: &'static KvLogStorePkInfo) -> Pb pub(crate) fn calculate_vnode_bitmap<'a>( test_data: impl Iterator)>, ) -> Bitmap { - let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); - for 
vnode in - test_data.map(|(_, row)| VirtualNode::compute_row(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT_FOR_TEST); + for vnode in test_data + .map(|(_, row)| VirtualNode::compute_row_for_test(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) { builder.set(vnode.to_index(), true); } diff --git a/src/stream/src/common/table/test_state_table.rs b/src/stream/src/common/table/test_state_table.rs index 098548c21ac93..dde0d8a581406 100644 --- a/src/stream/src/common/table/test_state_table.rs +++ b/src/stream/src/common/table/test_state_table.rs @@ -27,7 +27,7 @@ use risingwave_common::util::value_encoding::BasicSerde; use risingwave_hummock_test::test_utils::prepare_hummock_test_env; use risingwave_storage::hummock::HummockStorage; use risingwave_storage::store::PrefetchOptions; -use risingwave_storage::table::DEFAULT_VNODE; +use risingwave_storage::table::SINGLETON_VNODE; use crate::common::table::state_table::{ ReplicatedStateTable, StateTable, WatermarkCacheStateTable, @@ -445,7 +445,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::Included(OwnedRow::new(vec![Some(4_i32.into())])), ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -470,7 +470,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::::Unbounded, ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -1976,11 +1976,11 @@ async fn test_replicated_state_table_replication() { std::ops::Bound::Included(OwnedRow::new(vec![Some(2_i32.into())])), ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -2039,7 +2039,7 @@ async fn test_replicated_state_table_replication() { ); let iter = state_table - .iter_with_vnode(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); @@ -2048,7 +2048,7 @@ async fn test_replicated_state_table_replication() { std::ops::Bound::Unbounded, ); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -2079,7 +2079,7 @@ async fn test_replicated_state_table_replication() { let range_bounds: (Bound, Bound) = (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded); let replicated_iter = replicated_state_table - .iter_with_vnode_and_output_indices(DEFAULT_VNODE, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(SINGLETON_VNODE, &range_bounds, Default::default()) .await .unwrap(); pin_mut!(replicated_iter); diff --git a/src/stream/src/executor/agg_common.rs b/src/stream/src/executor/agg_common.rs index 2cb3cad8fb2d8..c185222298d80 100644 --- a/src/stream/src/executor/agg_common.rs +++ b/src/stream/src/executor/agg_common.rs @@ -46,7 +46,9 @@ pub struct AggExecutorArgs { 
pub trait AggExecutorExtraArgs {} -pub struct SimpleAggExecutorExtraArgs {} +pub struct SimpleAggExecutorExtraArgs { + pub must_output_per_barrier: bool, +} impl AggExecutorExtraArgs for SimpleAggExecutorExtraArgs {} /// Extra arguments needed to construct an `HashAggExecutor`. diff --git a/src/stream/src/executor/asof_join.rs b/src/stream/src/executor/asof_join.rs new file mode 100644 index 0000000000000..cb8a141481f28 --- /dev/null +++ b/src/stream/src/executor/asof_join.rs @@ -0,0 +1,1377 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +use std::collections::{BTreeMap, HashSet}; +use std::ops::Bound; +use std::time::Duration; + +use either::Either; +use itertools::Itertools; +use multimap::MultiMap; +use risingwave_common::array::Op; +use risingwave_common::hash::{HashKey, NullBitmap}; +use risingwave_common::util::epoch::EpochPair; +use risingwave_common::util::iter_util::ZipEqDebug; +use tokio::time::Instant; + +use self::builder::JoinChunkBuilder; +use super::barrier_align::*; +use super::join::hash_join::*; +use super::join::*; +use super::watermark::*; +use crate::executor::join::builder::JoinStreamChunkBuilder; +use crate::executor::prelude::*; + +/// Evict the cache every n rows. +const EVICT_EVERY_N_ROWS: u32 = 16; + +fn is_subset(vec1: Vec, vec2: Vec) -> bool { + HashSet::::from_iter(vec1).is_subset(&vec2.into_iter().collect()) +} + +pub struct JoinParams { + /// Indices of the join keys + pub join_key_indices: Vec, + /// Indices of the input pk after dedup + pub deduped_pk_indices: Vec, +} + +impl JoinParams { + pub fn new(join_key_indices: Vec, deduped_pk_indices: Vec) -> Self { + Self { + join_key_indices, + deduped_pk_indices, + } + } +} + +struct JoinSide { + /// Store all data from a one side stream + ht: JoinHashMap, + /// Indices of the join key columns + join_key_indices: Vec, + /// The data type of all columns without degree. + all_data_types: Vec, + /// The start position for the side in output new columns + start_pos: usize, + /// The mapping from input indices of a side to output columes. + i2o_mapping: Vec<(usize, usize)>, + i2o_mapping_indexed: MultiMap, + /// Whether degree table is needed for this side. + need_degree_table: bool, +} + +impl std::fmt::Debug for JoinSide { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("JoinSide") + .field("join_key_indices", &self.join_key_indices) + .field("col_types", &self.all_data_types) + .field("start_pos", &self.start_pos) + .field("i2o_mapping", &self.i2o_mapping) + .field("need_degree_table", &self.need_degree_table) + .finish() + } +} + +impl JoinSide { + // WARNING: Please do not call this until we implement it. 
+ fn is_dirty(&self) -> bool { + unimplemented!() + } + + #[expect(dead_code)] + fn clear_cache(&mut self) { + assert!( + !self.is_dirty(), + "cannot clear cache while states of hash join are dirty" + ); + + // TODO: not working with rearranged chain + // self.ht.clear(); + } + + pub fn init(&mut self, epoch: EpochPair) { + self.ht.init(epoch); + } +} + +/// `AsOfJoinExecutor` takes two input streams and runs equal hash join on them. +/// The output columns are the concatenation of left and right columns. +pub struct AsOfJoinExecutor { + ctx: ActorContextRef, + info: ExecutorInfo, + + /// Left input executor + input_l: Option, + /// Right input executor + input_r: Option, + /// The data types of the formed new columns + actual_output_data_types: Vec, + /// The parameters of the left join executor + side_l: JoinSide, + /// The parameters of the right join executor + side_r: JoinSide, + + metrics: Arc, + /// The maximum size of the chunk produced by executor at a time + chunk_size: usize, + /// Count the messages received, clear to 0 when counted to `EVICT_EVERY_N_MESSAGES` + cnt_rows_received: u32, + + /// watermark column index -> `BufferedWatermarks` + watermark_buffers: BTreeMap>, + + high_join_amplification_threshold: usize, + /// `AsOf` join description + asof_desc: AsOfDesc, +} + +impl std::fmt::Debug + for AsOfJoinExecutor +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AsOfJoinExecutor") + .field("join_type", &T) + .field("input_left", &self.input_l.as_ref().unwrap().identity()) + .field("input_right", &self.input_r.as_ref().unwrap().identity()) + .field("side_l", &self.side_l) + .field("side_r", &self.side_r) + .field("pk_indices", &self.info.pk_indices) + .field("schema", &self.info.schema) + .field("actual_output_data_types", &self.actual_output_data_types) + .finish() + } +} + +impl Execute + for AsOfJoinExecutor +{ + fn execute(self: Box) -> BoxedMessageStream { + self.into_stream().boxed() + } +} + +struct EqJoinArgs<'a, K: HashKey, S: StateStore> { + ctx: &'a ActorContextRef, + side_l: &'a mut JoinSide, + side_r: &'a mut JoinSide, + asof_desc: &'a AsOfDesc, + actual_output_data_types: &'a [DataType], + // inequality_watermarks: &'a Watermark, + chunk: StreamChunk, + chunk_size: usize, + cnt_rows_received: &'a mut u32, + high_join_amplification_threshold: usize, +} + +impl AsOfJoinExecutor { + #[allow(clippy::too_many_arguments)] + pub fn new( + ctx: ActorContextRef, + info: ExecutorInfo, + input_l: Executor, + input_r: Executor, + params_l: JoinParams, + params_r: JoinParams, + null_safe: Vec, + output_indices: Vec, + state_table_l: StateTable, + degree_state_table_l: StateTable, + state_table_r: StateTable, + degree_state_table_r: StateTable, + watermark_epoch: AtomicU64Ref, + metrics: Arc, + chunk_size: usize, + high_join_amplification_threshold: usize, + asof_desc: AsOfDesc, + ) -> Self { + let side_l_column_n = input_l.schema().len(); + + let schema_fields = [ + input_l.schema().fields.clone(), + input_r.schema().fields.clone(), + ] + .concat(); + + let original_output_data_types = schema_fields + .iter() + .map(|field| field.data_type()) + .collect_vec(); + let actual_output_data_types = output_indices + .iter() + .map(|&idx| original_output_data_types[idx].clone()) + .collect_vec(); + + // Data types of of hash join state. 
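// Note on the state setup below: the AsOf join reuses the hash-join state machinery, but
// `need_degree_table_l` / `need_degree_table_r` are hard-coded to `false` (no degree tables are
// maintained), and the AsOf inequality column of each side is handed to its `JoinHashMap`
// through `inequal_key_idx_l` / `inequal_key_idx_r`, taken from `asof_desc.left_idx` and
// `asof_desc.right_idx`.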
+ let state_all_data_types_l = input_l.schema().data_types(); + let state_all_data_types_r = input_r.schema().data_types(); + + let state_pk_indices_l = input_l.pk_indices().to_vec(); + let state_pk_indices_r = input_r.pk_indices().to_vec(); + + let state_order_key_indices_l = state_table_l.pk_indices(); + let state_order_key_indices_r = state_table_r.pk_indices(); + + let state_join_key_indices_l = params_l.join_key_indices; + let state_join_key_indices_r = params_r.join_key_indices; + + let degree_join_key_indices_l = (0..state_join_key_indices_l.len()).collect_vec(); + let degree_join_key_indices_r = (0..state_join_key_indices_r.len()).collect_vec(); + + let degree_pk_indices_l = (state_join_key_indices_l.len() + ..state_join_key_indices_l.len() + params_l.deduped_pk_indices.len()) + .collect_vec(); + let degree_pk_indices_r = (state_join_key_indices_r.len() + ..state_join_key_indices_r.len() + params_r.deduped_pk_indices.len()) + .collect_vec(); + + // If pk is contained in join key. + let pk_contained_in_jk_l = + is_subset(state_pk_indices_l.clone(), state_join_key_indices_l.clone()); + let pk_contained_in_jk_r = + is_subset(state_pk_indices_r.clone(), state_join_key_indices_r.clone()); + + let join_key_data_types_l = state_join_key_indices_l + .iter() + .map(|idx| state_all_data_types_l[*idx].clone()) + .collect_vec(); + + let join_key_data_types_r = state_join_key_indices_r + .iter() + .map(|idx| state_all_data_types_r[*idx].clone()) + .collect_vec(); + + assert_eq!(join_key_data_types_l, join_key_data_types_r); + + let degree_all_data_types_l = state_order_key_indices_l + .iter() + .map(|idx| state_all_data_types_l[*idx].clone()) + .collect_vec(); + let degree_all_data_types_r = state_order_key_indices_r + .iter() + .map(|idx| state_all_data_types_r[*idx].clone()) + .collect_vec(); + + let null_matched = K::Bitmap::from_bool_vec(null_safe); + + let need_degree_table_l = false; + let need_degree_table_r = false; + + let (left_to_output, right_to_output) = { + let (left_len, right_len) = if is_left_semi_or_anti(T) { + (state_all_data_types_l.len(), 0usize) + } else if is_right_semi_or_anti(T) { + (0usize, state_all_data_types_r.len()) + } else { + (state_all_data_types_l.len(), state_all_data_types_r.len()) + }; + JoinStreamChunkBuilder::get_i2o_mapping(&output_indices, left_len, right_len) + }; + + let l2o_indexed = MultiMap::from_iter(left_to_output.iter().copied()); + let r2o_indexed = MultiMap::from_iter(right_to_output.iter().copied()); + + // handle inequality watermarks + // https://github.com/risingwavelabs/risingwave/issues/18503 + // let inequality_watermarks = None; + let watermark_buffers = BTreeMap::new(); + + let inequal_key_idx_l = Some(asof_desc.left_idx); + let inequal_key_idx_r = Some(asof_desc.right_idx); + + Self { + ctx: ctx.clone(), + info, + input_l: Some(input_l), + input_r: Some(input_r), + actual_output_data_types, + side_l: JoinSide { + ht: JoinHashMap::new( + watermark_epoch.clone(), + join_key_data_types_l, + state_join_key_indices_l.clone(), + state_all_data_types_l.clone(), + state_table_l, + params_l.deduped_pk_indices, + degree_join_key_indices_l, + degree_all_data_types_l, + degree_state_table_l, + degree_pk_indices_l, + null_matched.clone(), + need_degree_table_l, + pk_contained_in_jk_l, + inequal_key_idx_l, + metrics.clone(), + ctx.id, + ctx.fragment_id, + "left", + ), + join_key_indices: state_join_key_indices_l, + all_data_types: state_all_data_types_l, + i2o_mapping: left_to_output, + i2o_mapping_indexed: l2o_indexed, + start_pos: 0, + 
need_degree_table: need_degree_table_l, + }, + side_r: JoinSide { + ht: JoinHashMap::new( + watermark_epoch, + join_key_data_types_r, + state_join_key_indices_r.clone(), + state_all_data_types_r.clone(), + state_table_r, + params_r.deduped_pk_indices, + degree_join_key_indices_r, + degree_all_data_types_r, + degree_state_table_r, + degree_pk_indices_r, + null_matched, + need_degree_table_r, + pk_contained_in_jk_r, + inequal_key_idx_r, + metrics.clone(), + ctx.id, + ctx.fragment_id, + "right", + ), + join_key_indices: state_join_key_indices_r, + all_data_types: state_all_data_types_r, + start_pos: side_l_column_n, + i2o_mapping: right_to_output, + i2o_mapping_indexed: r2o_indexed, + need_degree_table: need_degree_table_r, + }, + metrics, + chunk_size, + cnt_rows_received: 0, + watermark_buffers, + high_join_amplification_threshold, + asof_desc, + } + } + + #[try_stream(ok = Message, error = StreamExecutorError)] + async fn into_stream(mut self) { + let input_l = self.input_l.take().unwrap(); + let input_r = self.input_r.take().unwrap(); + let aligned_stream = barrier_align( + input_l.execute(), + input_r.execute(), + self.ctx.id, + self.ctx.fragment_id, + self.metrics.clone(), + "Join", + ); + pin_mut!(aligned_stream); + + let barrier = expect_first_barrier_from_aligned_stream(&mut aligned_stream).await?; + self.side_l.init(barrier.epoch); + self.side_r.init(barrier.epoch); + + // The first barrier message should be propagated. + yield Message::Barrier(barrier); + let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.fragment_id.to_string(); + + // initialized some metrics + let join_actor_input_waiting_duration_ns = self + .metrics + .join_actor_input_waiting_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str]); + let left_join_match_duration_ns = self + .metrics + .join_match_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "left"]); + let right_join_match_duration_ns = self + .metrics + .join_match_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "right"]); + + let barrier_join_match_duration_ns = self + .metrics + .join_match_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "barrier"]); + + let left_join_cached_entry_count = self + .metrics + .join_cached_entry_count + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "left"]); + + let right_join_cached_entry_count = self + .metrics + .join_cached_entry_count + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "right"]); + + let mut start_time = Instant::now(); + + while let Some(msg) = aligned_stream + .next() + .instrument_await("hash_join_barrier_align") + .await + { + join_actor_input_waiting_duration_ns.inc_by(start_time.elapsed().as_nanos() as u64); + match msg? { + AlignedMessage::WatermarkLeft(watermark) => { + for watermark_to_emit in self.handle_watermark(SideType::Left, watermark)? { + yield Message::Watermark(watermark_to_emit); + } + } + AlignedMessage::WatermarkRight(watermark) => { + for watermark_to_emit in self.handle_watermark(SideType::Right, watermark)? 
{ + yield Message::Watermark(watermark_to_emit); + } + } + AlignedMessage::Left(chunk) => { + let mut left_time = Duration::from_nanos(0); + let mut left_start_time = Instant::now(); + #[for_await] + for chunk in Self::eq_join_left(EqJoinArgs { + ctx: &self.ctx, + side_l: &mut self.side_l, + side_r: &mut self.side_r, + asof_desc: &self.asof_desc, + actual_output_data_types: &self.actual_output_data_types, + // inequality_watermarks: &self.inequality_watermarks, + chunk, + chunk_size: self.chunk_size, + cnt_rows_received: &mut self.cnt_rows_received, + high_join_amplification_threshold: self.high_join_amplification_threshold, + }) { + left_time += left_start_time.elapsed(); + yield Message::Chunk(chunk?); + left_start_time = Instant::now(); + } + left_time += left_start_time.elapsed(); + left_join_match_duration_ns.inc_by(left_time.as_nanos() as u64); + self.try_flush_data().await?; + } + AlignedMessage::Right(chunk) => { + let mut right_time = Duration::from_nanos(0); + let mut right_start_time = Instant::now(); + #[for_await] + for chunk in Self::eq_join_right(EqJoinArgs { + ctx: &self.ctx, + side_l: &mut self.side_l, + side_r: &mut self.side_r, + asof_desc: &self.asof_desc, + actual_output_data_types: &self.actual_output_data_types, + // inequality_watermarks: &self.inequality_watermarks, + chunk, + chunk_size: self.chunk_size, + cnt_rows_received: &mut self.cnt_rows_received, + high_join_amplification_threshold: self.high_join_amplification_threshold, + }) { + right_time += right_start_time.elapsed(); + yield Message::Chunk(chunk?); + right_start_time = Instant::now(); + } + right_time += right_start_time.elapsed(); + right_join_match_duration_ns.inc_by(right_time.as_nanos() as u64); + self.try_flush_data().await?; + } + AlignedMessage::Barrier(barrier) => { + let barrier_start_time = Instant::now(); + self.flush_data(barrier.epoch).await?; + + // Update the vnode bitmap for state tables of both sides if asked. + if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(self.ctx.id) { + if self.side_l.ht.update_vnode_bitmap(vnode_bitmap.clone()) { + self.watermark_buffers + .values_mut() + .for_each(|buffers| buffers.clear()); + // self.inequality_watermarks.fill(None); + } + self.side_r.ht.update_vnode_bitmap(vnode_bitmap); + } + + // Report metrics of cached join rows/entries + for (join_cached_entry_count, ht) in [ + (&left_join_cached_entry_count, &self.side_l.ht), + (&right_join_cached_entry_count, &self.side_r.ht), + ] { + join_cached_entry_count.set(ht.entry_count() as i64); + } + + barrier_join_match_duration_ns + .inc_by(barrier_start_time.elapsed().as_nanos() as u64); + yield Message::Barrier(barrier); + } + } + start_time = Instant::now(); + } + } + + async fn flush_data(&mut self, epoch: EpochPair) -> StreamExecutorResult<()> { + // All changes to the state has been buffered in the mem-table of the state table. Just + // `commit` them here. + self.side_l.ht.flush(epoch).await?; + self.side_r.ht.flush(epoch).await?; + Ok(()) + } + + async fn try_flush_data(&mut self) -> StreamExecutorResult<()> { + // All changes to the state has been buffered in the mem-table of the state table. Just + // `commit` them here. + self.side_l.ht.try_flush().await?; + self.side_r.ht.try_flush().await?; + Ok(()) + } + + // We need to manually evict the cache. 
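As the comment above says, the LRU caches of both sides are evicted manually, on a fixed row cadence rather than per row. A stripped-down sketch of that counter; the constant mirrors `EVICT_EVERY_N_ROWS` above, and the callback stands in for the hash maps' `evict` calls:

const EVICT_EVERY_N_ROWS: u32 = 16;

/// Count incoming rows and fire one eviction every `EVICT_EVERY_N_ROWS` rows.
fn on_row(cnt_rows_received: &mut u32, mut evict: impl FnMut()) {
    *cnt_rows_received += 1;
    if *cnt_rows_received == EVICT_EVERY_N_ROWS {
        evict();
        *cnt_rows_received = 0;
    }
}

fn main() {
    let mut cnt = 0;
    let mut evictions = 0;
    for _ in 0..64 {
        on_row(&mut cnt, || evictions += 1);
    }
    assert_eq!(evictions, 4); // one eviction per 16 rows
}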
+ fn evict_cache( + side_update: &mut JoinSide, + side_match: &mut JoinSide, + cnt_rows_received: &mut u32, + ) { + *cnt_rows_received += 1; + if *cnt_rows_received == EVICT_EVERY_N_ROWS { + side_update.ht.evict(); + side_match.ht.evict(); + *cnt_rows_received = 0; + } + } + + fn handle_watermark( + &mut self, + side: SideTypePrimitive, + watermark: Watermark, + ) -> StreamExecutorResult> { + let (side_update, side_match) = if side == SideType::Left { + (&mut self.side_l, &mut self.side_r) + } else { + (&mut self.side_r, &mut self.side_l) + }; + + // State cleaning + if side_update.join_key_indices[0] == watermark.col_idx { + side_match.ht.update_watermark(watermark.val.clone()); + } + + // Select watermarks to yield. + let wm_in_jk = side_update + .join_key_indices + .iter() + .positions(|idx| *idx == watermark.col_idx); + let mut watermarks_to_emit = vec![]; + for idx in wm_in_jk { + let buffers = self + .watermark_buffers + .entry(idx) + .or_insert_with(|| BufferedWatermarks::with_ids([SideType::Left, SideType::Right])); + if let Some(selected_watermark) = buffers.handle_watermark(side, watermark.clone()) { + let empty_indices = vec![]; + let output_indices = side_update + .i2o_mapping_indexed + .get_vec(&side_update.join_key_indices[idx]) + .unwrap_or(&empty_indices) + .iter() + .chain( + side_match + .i2o_mapping_indexed + .get_vec(&side_match.join_key_indices[idx]) + .unwrap_or(&empty_indices), + ); + for output_idx in output_indices { + watermarks_to_emit.push(selected_watermark.clone().with_idx(*output_idx)); + } + }; + } + Ok(watermarks_to_emit) + } + + /// the data the hash table and match the coming + /// data chunk with the executor state + async fn hash_eq_match( + key: &K, + ht: &mut JoinHashMap, + ) -> StreamExecutorResult> { + if !key.null_bitmap().is_subset(ht.null_matched()) { + Ok(None) + } else { + ht.take_state(key).await.map(Some) + } + } + + #[try_stream(ok = StreamChunk, error = StreamExecutorError)] + async fn eq_join_left(args: EqJoinArgs<'_, K, S>) { + let EqJoinArgs { + ctx: _, + side_l, + side_r, + asof_desc, + actual_output_data_types, + // inequality_watermarks, + chunk, + chunk_size, + cnt_rows_received, + high_join_amplification_threshold: _, + } = args; + + let (side_update, side_match) = (side_l, side_r); + + let mut join_chunk_builder = + JoinChunkBuilder::::new(JoinStreamChunkBuilder::new( + chunk_size, + actual_output_data_types.to_vec(), + side_update.i2o_mapping.clone(), + side_match.i2o_mapping.clone(), + )); + + let keys = K::build_many(&side_update.join_key_indices, chunk.data_chunk()); + for (r, key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { + let Some((op, row)) = r else { + continue; + }; + Self::evict_cache(side_update, side_match, cnt_rows_received); + + let matched_rows = if !side_update.ht.check_inequal_key_null(&row) { + Self::hash_eq_match(key, &mut side_match.ht).await? 
+ } else { + None + }; + let inequal_key = side_update.ht.serialize_inequal_key_from_row(row); + + if let Some(matched_rows) = matched_rows { + let matched_row_by_inequality = match asof_desc.inequality_type { + AsOfInequalityType::Lt => matched_rows.lower_bound_by_inequality( + Bound::Excluded(&inequal_key), + &side_match.all_data_types, + ), + AsOfInequalityType::Le => matched_rows.lower_bound_by_inequality( + Bound::Included(&inequal_key), + &side_match.all_data_types, + ), + AsOfInequalityType::Gt => matched_rows.upper_bound_by_inequality( + Bound::Excluded(&inequal_key), + &side_match.all_data_types, + ), + AsOfInequalityType::Ge => matched_rows.upper_bound_by_inequality( + Bound::Included(&inequal_key), + &side_match.all_data_types, + ), + }; + match op { + Op::Insert | Op::UpdateInsert => { + if let Some(matched_row_by_inequality) = matched_row_by_inequality { + let matched_row = matched_row_by_inequality?; + + if let Some(chunk) = + join_chunk_builder.with_match_on_insert(&row, &matched_row) + { + yield chunk; + } + } else if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Insert, row) + { + yield chunk; + } + side_update.ht.insert_row(key, row).await?; + } + Op::Delete | Op::UpdateDelete => { + if let Some(matched_row_by_inequality) = matched_row_by_inequality { + let matched_row = matched_row_by_inequality?; + + if let Some(chunk) = + join_chunk_builder.with_match_on_delete(&row, &matched_row) + { + yield chunk; + } + } else if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Delete, row) + { + yield chunk; + } + side_update.ht.delete_row(key, row)?; + } + } + // Insert back the state taken from ht. + side_match.ht.update_state(key, matched_rows); + } else { + // Row which violates null-safe bitmap will never be matched so we need not + // store. + match op { + Op::Insert | Op::UpdateInsert => { + if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Insert, row) + { + yield chunk; + } + } + Op::Delete | Op::UpdateDelete => { + if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Delete, row) + { + yield chunk; + } + } + } + } + } + if let Some(chunk) = join_chunk_builder.take() { + yield chunk; + } + } + + #[try_stream(ok = StreamChunk, error = StreamExecutorError)] + async fn eq_join_right(args: EqJoinArgs<'_, K, S>) { + let EqJoinArgs { + ctx, + side_l, + side_r, + asof_desc, + actual_output_data_types, + // inequality_watermarks, + chunk, + chunk_size, + cnt_rows_received, + high_join_amplification_threshold, + } = args; + + let (side_update, side_match) = (side_r, side_l); + + let mut join_chunk_builder = JoinStreamChunkBuilder::new( + chunk_size, + actual_output_data_types.to_vec(), + side_update.i2o_mapping.clone(), + side_match.i2o_mapping.clone(), + ); + + let join_matched_rows_metrics = ctx + .streaming_metrics + .join_matched_join_keys + .with_guarded_label_values(&[ + &ctx.id.to_string(), + &ctx.fragment_id.to_string(), + &side_update.ht.table_id().to_string(), + ]); + + let keys = K::build_many(&side_update.join_key_indices, chunk.data_chunk()); + for (r, key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { + let Some((op, row)) = r else { + continue; + }; + let mut join_matched_rows_cnt = 0; + + Self::evict_cache(side_update, side_match, cnt_rows_received); + + let matched_rows = if !side_update.ht.check_inequal_key_null(&row) { + Self::hash_eq_match(key, &mut side_match.ht).await? 
+ } else { + None + }; + let inequal_key = side_update.ht.serialize_inequal_key_from_row(row); + + if let Some(matched_rows) = matched_rows { + let update_rows = Self::hash_eq_match(key, &mut side_update.ht).await?.expect("None is not expected because we have checked null in key when getting matched_rows"); + let right_inequality_index = update_rows.inequality_index(); + let (row_to_delete_r, row_to_insert_r) = + if let Some(pks) = right_inequality_index.get(&inequal_key) { + assert!(!pks.is_empty()); + let row_pk = side_match.ht.serialize_pk_from_row(row); + match op { + Op::Insert | Op::UpdateInsert => { + // If there are multiple rows match the inequality key in the right table, we use one with smallest pk. + let smallest_pk = pks.first_key_sorted().unwrap(); + if smallest_pk > &row_pk { + // smallest_pk is in the cache index, so it must exist in the cache. + if let Some(to_delete_row) = update_rows + .get_by_indexed_pk(smallest_pk, &side_update.all_data_types) + { + ( + Some(Either::Left(to_delete_row?.row)), + Some(Either::Right(row)), + ) + } else { + // Something wrong happened. Ignore this row in non strict consistency mode. + (None, None) + } + } else { + // No affected row in the right table. + (None, None) + } + } + Op::Delete | Op::UpdateDelete => { + let smallest_pk = pks.first_key_sorted().unwrap(); + if smallest_pk == &row_pk { + if let Some(second_smallest_pk) = pks.second_key_sorted() { + if let Some(to_insert_row) = update_rows.get_by_indexed_pk( + second_smallest_pk, + &side_update.all_data_types, + ) { + ( + Some(Either::Right(row)), + Some(Either::Left(to_insert_row?.row)), + ) + } else { + // Something wrong happened. Ignore this row in non strict consistency mode. + (None, None) + } + } else { + (Some(Either::Right(row)), None) + } + } else { + // No affected row in the right table. + (None, None) + } + } + } + } else { + match op { + // Decide the row_to_delete later + Op::Insert | Op::UpdateInsert => (None, Some(Either::Right(row))), + // Decide the row_to_insert later + Op::Delete | Op::UpdateDelete => (Some(Either::Right(row)), None), + } + }; + + // 4 cases for row_to_delete_r and row_to_insert_r: + // 1. Some(_), Some(_): delete row_to_delete_r and insert row_to_insert_r + // 2. None, Some(_) : row_to_delete to be decided by the nearest inequality key + // 3. Some(_), None : row_to_insert to be decided by the nearest inequality key + // 4. None, None : do nothing + if row_to_delete_r.is_none() && row_to_insert_r.is_none() { + // no row to delete or insert. + } else { + let prev_inequality_key = + right_inequality_index.upper_bound_key(Bound::Excluded(&inequal_key)); + let next_inequality_key = + right_inequality_index.lower_bound_key(Bound::Excluded(&inequal_key)); + let affected_row_r = match asof_desc.inequality_type { + AsOfInequalityType::Lt | AsOfInequalityType::Le => next_inequality_key + .and_then(|k| { + update_rows.get_first_by_inequality(k, &side_update.all_data_types) + }), + AsOfInequalityType::Gt | AsOfInequalityType::Ge => prev_inequality_key + .and_then(|k| { + update_rows.get_first_by_inequality(k, &side_update.all_data_types) + }), + } + .transpose()? 
+ .map(|r| Either::Left(r.row)); + + let (row_to_delete_r, row_to_insert_r) = + match (&row_to_delete_r, &row_to_insert_r) { + (Some(_), Some(_)) => (row_to_delete_r, row_to_insert_r), + (None, Some(_)) => (affected_row_r, row_to_insert_r), + (Some(_), None) => (row_to_delete_r, affected_row_r), + (None, None) => unreachable!(), + }; + let range = match asof_desc.inequality_type { + AsOfInequalityType::Lt => ( + prev_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Included), + Bound::Excluded(&inequal_key), + ), + AsOfInequalityType::Le => ( + prev_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Excluded), + Bound::Included(&inequal_key), + ), + AsOfInequalityType::Gt => ( + Bound::Excluded(&inequal_key), + next_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Included), + ), + AsOfInequalityType::Ge => ( + Bound::Included(&inequal_key), + next_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Excluded), + ), + }; + + let rows_l = + matched_rows.range_by_inequality(range, &side_match.all_data_types); + for row_l in rows_l { + join_matched_rows_cnt += 1; + let row_l = row_l?.row; + if let Some(row_to_delete_r) = &row_to_delete_r { + if let Some(chunk) = + join_chunk_builder.append_row(Op::Delete, row_to_delete_r, &row_l) + { + yield chunk; + } + } else if is_as_of_left_outer(T) { + if let Some(chunk) = + join_chunk_builder.append_row_matched(Op::Delete, &row_l) + { + yield chunk; + } + } + if let Some(row_to_insert_r) = &row_to_insert_r { + if let Some(chunk) = + join_chunk_builder.append_row(Op::Insert, row_to_insert_r, &row_l) + { + yield chunk; + } + } else if is_as_of_left_outer(T) { + if let Some(chunk) = + join_chunk_builder.append_row_matched(Op::Insert, &row_l) + { + yield chunk; + } + } + } + } + // Insert back the state taken from ht. + side_match.ht.update_state(key, matched_rows); + side_update.ht.update_state(key, update_rows); + + match op { + Op::Insert | Op::UpdateInsert => { + side_update.ht.insert_row(key, row).await?; + } + Op::Delete | Op::UpdateDelete => { + side_update.ht.delete_row(key, row)?; + } + } + } else { + // Row which violates null-safe bitmap will never be matched so we need not + // store. + // Noop here because we only support left outer AsOf join. 
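The right-side path above only revisits left rows whose inequality value lies between the updated key and its nearest neighbour in the right-side inequality index; all other left rows keep their current match. A simplified sketch of that neighbour lookup, with a plain `BTreeMap` in place of the cached index:

use std::collections::BTreeMap;
use std::ops::Bound;

/// Nearest keys strictly below and strictly above `key` in the ordered index.
fn neighbours(index: &BTreeMap<i64, &'static str>, key: i64) -> (Option<i64>, Option<i64>) {
    let prev = index
        .range((Bound::Unbounded, Bound::Excluded(key)))
        .next_back()
        .map(|(k, _)| *k);
    let next = index
        .range((Bound::Excluded(key), Bound::Unbounded))
        .next()
        .map(|(k, _)| *k);
    (prev, next)
}

fn main() {
    let index = BTreeMap::from([(10, "a"), (20, "b"), (40, "c")]);
    // A right-side change at 25 can only affect left rows between 20 and 40;
    // whether each bound is open or closed depends on Lt/Le/Gt/Ge, as above.
    assert_eq!(neighbours(&index, 25), (Some(20), Some(40)));
}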
+ } + join_matched_rows_metrics.observe(join_matched_rows_cnt as _); + if join_matched_rows_cnt > high_join_amplification_threshold { + let join_key_data_types = side_update.ht.join_key_data_types(); + let key = key.deserialize(join_key_data_types)?; + tracing::warn!(target: "high_join_amplification", + matched_rows_len = join_matched_rows_cnt, + update_table_id = side_update.ht.table_id(), + match_table_id = side_match.ht.table_id(), + join_key = ?key, + actor_id = ctx.id, + fragment_id = ctx.fragment_id, + "large rows matched for join key when AsOf join updating right side", + ); + } + } + if let Some(chunk) = join_chunk_builder.take() { + yield chunk; + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicU64; + + use risingwave_common::array::*; + use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, TableId}; + use risingwave_common::hash::Key64; + use risingwave_common::util::epoch::test_epoch; + use risingwave_common::util::sort_util::OrderType; + use risingwave_storage::memory::MemoryStateStore; + + use super::*; + use crate::executor::test_utils::{MessageSender, MockSource, StreamExecutorTestExt}; + + async fn create_in_memory_state_table( + mem_state: MemoryStateStore, + data_types: &[DataType], + order_types: &[OrderType], + pk_indices: &[usize], + table_id: u32, + ) -> (StateTable, StateTable) { + let column_descs = data_types + .iter() + .enumerate() + .map(|(id, data_type)| ColumnDesc::unnamed(ColumnId::new(id as i32), data_type.clone())) + .collect_vec(); + let state_table = StateTable::new_without_distribution( + mem_state.clone(), + TableId::new(table_id), + column_descs, + order_types.to_vec(), + pk_indices.to_vec(), + ) + .await; + + // Create degree table + let mut degree_table_column_descs = vec![]; + pk_indices.iter().enumerate().for_each(|(pk_id, idx)| { + degree_table_column_descs.push(ColumnDesc::unnamed( + ColumnId::new(pk_id as i32), + data_types[*idx].clone(), + )) + }); + degree_table_column_descs.push(ColumnDesc::unnamed( + ColumnId::new(pk_indices.len() as i32), + DataType::Int64, + )); + let degree_state_table = StateTable::new_without_distribution( + mem_state, + TableId::new(table_id + 1), + degree_table_column_descs, + order_types.to_vec(), + pk_indices.to_vec(), + ) + .await; + (state_table, degree_state_table) + } + + async fn create_executor( + asof_desc: AsOfDesc, + ) -> (MessageSender, MessageSender, BoxedMessageStream) { + let schema = Schema { + fields: vec![ + Field::unnamed(DataType::Int64), // join key + Field::unnamed(DataType::Int64), + Field::unnamed(DataType::Int64), + ], + }; + let (tx_l, source_l) = MockSource::channel(); + let source_l = source_l.into_executor(schema.clone(), vec![1]); + let (tx_r, source_r) = MockSource::channel(); + let source_r = source_r.into_executor(schema, vec![1]); + let params_l = JoinParams::new(vec![0], vec![1]); + let params_r = JoinParams::new(vec![0], vec![1]); + + let mem_state = MemoryStateStore::new(); + + let (state_l, degree_state_l) = create_in_memory_state_table( + mem_state.clone(), + &[DataType::Int64, DataType::Int64, DataType::Int64], + &[ + OrderType::ascending(), + OrderType::ascending(), + OrderType::ascending(), + ], + &[0, asof_desc.left_idx, 1], + 0, + ) + .await; + + let (state_r, degree_state_r) = create_in_memory_state_table( + mem_state, + &[DataType::Int64, DataType::Int64, DataType::Int64], + &[ + OrderType::ascending(), + OrderType::ascending(), + OrderType::ascending(), + ], + &[0, asof_desc.right_idx, 1], + 2, + ) + .await; + + let schema: Schema = 
[source_l.schema().fields(), source_r.schema().fields()] + .concat() + .into_iter() + .collect(); + let schema_len = schema.len(); + let info = ExecutorInfo { + schema, + pk_indices: vec![1], + identity: "HashJoinExecutor".to_string(), + }; + + let executor = AsOfJoinExecutor::::new( + ActorContext::for_test(123), + info, + source_l, + source_r, + params_l, + params_r, + vec![false], + (0..schema_len).collect_vec(), + state_l, + degree_state_l, + state_r, + degree_state_r, + Arc::new(AtomicU64::new(0)), + Arc::new(StreamingMetrics::unused()), + 1024, + 2048, + asof_desc, + ); + (tx_l, tx_r, executor.boxed().execute()) + } + + #[tokio::test] + async fn test_as_of_inner_join() -> StreamExecutorResult<()> { + let asof_desc = AsOfDesc { + left_idx: 0, + right_idx: 2, + inequality_type: AsOfInequalityType::Lt, + }; + + let chunk_l1 = StreamChunk::from_pretty( + " I I I + + 1 4 7 + + 2 5 8 + + 3 6 9", + ); + let chunk_l2 = StreamChunk::from_pretty( + " I I I + + 3 8 1 + - 3 8 1", + ); + let chunk_r1 = StreamChunk::from_pretty( + " I I I + + 2 1 7 + + 2 2 1 + + 2 3 4 + + 2 4 2 + + 6 1 9 + + 6 2 9", + ); + let chunk_r2 = StreamChunk::from_pretty( + " I I I + - 2 3 4", + ); + let chunk_r3 = StreamChunk::from_pretty( + " I I I + + 2 3 3", + ); + let chunk_l3 = StreamChunk::from_pretty( + " I I I + - 2 5 8", + ); + let chunk_l4 = StreamChunk::from_pretty( + " I I I + + 6 3 1 + + 6 4 1", + ); + let chunk_r4 = StreamChunk::from_pretty( + " I I I + - 6 1 9", + ); + + let (mut tx_l, mut tx_r, mut hash_join) = + create_executor::<{ AsOfJoinType::Inner }>(asof_desc).await; + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(1), false); + tx_r.push_barrier(test_epoch(1), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 1st left chunk + tx_l.push_chunk(chunk_l1); + hash_join.next_unwrap_pending(); + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(2), false); + tx_r.push_barrier(test_epoch(2), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 2nd left chunk + tx_l.push_chunk(chunk_l2); + hash_join.next_unwrap_pending(); + + // push the 1st right chunk + tx_r.push_chunk(chunk_r1); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 2 5 8 2 1 7 + - 2 5 8 2 1 7 + + 2 5 8 2 3 4" + ) + ); + + // push the 2nd right chunk + tx_r.push_chunk(chunk_r2); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 3 4 + + 2 5 8 2 1 7" + ) + ); + + // push the 3rd right chunk + tx_r.push_chunk(chunk_r3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 1 7 + + 2 5 8 2 3 3" + ) + ); + + // push the 3rd left chunk + tx_l.push_chunk(chunk_l3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 3 3" + ) + ); + + // push the 4th left chunk + tx_l.push_chunk(chunk_l4); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 6 3 1 6 1 9 + + 6 4 1 6 1 9" + ) + ); + + // push the 4th right chunk + tx_r.push_chunk(chunk_r4); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 6 3 1 6 1 9 + + 6 3 1 6 2 9 + - 6 4 1 6 1 9 + + 6 4 1 6 2 9" + ) + ); + + Ok(()) + } + + #[tokio::test] + async fn 
test_as_of_left_outer_join() -> StreamExecutorResult<()> { + let asof_desc = AsOfDesc { + left_idx: 1, + right_idx: 2, + inequality_type: AsOfInequalityType::Ge, + }; + + let chunk_l1 = StreamChunk::from_pretty( + " I I I + + 1 4 7 + + 2 5 8 + + 3 6 9", + ); + let chunk_l2 = StreamChunk::from_pretty( + " I I I + + 3 8 1 + - 3 8 1", + ); + let chunk_r1 = StreamChunk::from_pretty( + " I I I + + 2 3 4 + + 2 2 5 + + 2 1 5 + + 6 1 8 + + 6 2 9", + ); + let chunk_r2 = StreamChunk::from_pretty( + " I I I + - 2 3 4 + - 2 1 5 + - 2 2 5", + ); + let chunk_l3 = StreamChunk::from_pretty( + " I I I + + 6 8 9", + ); + let chunk_r3 = StreamChunk::from_pretty( + " I I I + - 6 1 8", + ); + + let (mut tx_l, mut tx_r, mut hash_join) = + create_executor::<{ AsOfJoinType::LeftOuter }>(asof_desc).await; + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(1), false); + tx_r.push_barrier(test_epoch(1), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 1st left chunk + tx_l.push_chunk(chunk_l1); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 1 4 7 . . . + + 2 5 8 . . . + + 3 6 9 . . ." + ) + ); + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(2), false); + tx_r.push_barrier(test_epoch(2), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 2nd left chunk + tx_l.push_chunk(chunk_l2); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 3 8 1 . . . + - 3 8 1 . . ." + ) + ); + + // push the 1st right chunk + tx_r.push_chunk(chunk_r1); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 . . . + + 2 5 8 2 3 4 + - 2 5 8 2 3 4 + + 2 5 8 2 2 5 + - 2 5 8 2 2 5 + + 2 5 8 2 1 5" + ) + ); + + // push the 2nd right chunk + tx_r.push_chunk(chunk_r2); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 1 5 + + 2 5 8 2 2 5 + - 2 5 8 2 2 5 + + 2 5 8 . . ." + ) + ); + + // push the 3rd left chunk + tx_l.push_chunk(chunk_l3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 6 8 9 6 1 8" + ) + ); + + // push the 3rd right chunk + tx_r.push_chunk(chunk_r3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 6 8 9 6 1 8 + + 6 8 9 . . ." + ) + ); + Ok(()) + } +} diff --git a/src/stream/src/executor/backfill/arrangement_backfill.rs b/src/stream/src/executor/backfill/arrangement_backfill.rs index e3979496731b5..540ffe1a020fc 100644 --- a/src/stream/src/executor/backfill/arrangement_backfill.rs +++ b/src/stream/src/executor/backfill/arrangement_backfill.rs @@ -34,7 +34,7 @@ use crate::executor::backfill::utils::{ update_pos_by_vnode, BackfillProgressPerVnode, BackfillRateLimiter, BackfillState, }; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; type Builders = HashMap; @@ -56,7 +56,7 @@ pub struct ArrangementBackfillExecutor { /// The column indices need to be forwarded to the downstream from the upstream and table scan. 
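For the backfill hunks that follow: `output_indices` is simply a projection from the upstream/scan schema onto the columns the downstream expects. A tiny illustration of that mapping, with datums reduced to `i64`:

/// Project a row onto the requested output columns, in output order.
fn project(row: &[i64], output_indices: &[usize]) -> Vec<i64> {
    output_indices.iter().map(|&i| row[i]).collect()
}

fn main() {
    let row = [7, 8, 9];
    // Forward only columns 2 and 0, in that order.
    assert_eq!(project(&row, &[2, 0]), vec![9, 7]);
}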
output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, @@ -79,7 +79,7 @@ where upstream: Executor, state_table: StateTable, output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, metrics: Arc, chunk_size: usize, rate_limit: Option, diff --git a/src/stream/src/executor/backfill/cdc/cdc_backfill.rs b/src/stream/src/executor/backfill/cdc/cdc_backfill.rs index bfffa066fc265..066dc86ba551c 100644 --- a/src/stream/src/executor/backfill/cdc/cdc_backfill.rs +++ b/src/stream/src/executor/backfill/cdc/cdc_backfill.rs @@ -43,7 +43,7 @@ use crate::executor::backfill::CdcScanOptions; use crate::executor::monitor::CdcBackfillMetrics; use crate::executor::prelude::*; use crate::executor::UpdateMutation; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// `split_id`, `is_finished`, `row_count`, `cdc_offset` all occupy 1 column each. const METADATA_STATE_LEN: usize = 4; @@ -68,7 +68,7 @@ pub struct CdcBackfillExecutor { // TODO: introduce a CdcBackfillProgress to report finish to Meta // This object is just a stub right now - progress: Option, + progress: Option, metrics: CdcBackfillMetrics, @@ -86,7 +86,7 @@ impl CdcBackfillExecutor { upstream: Executor, output_indices: Vec, output_columns: Vec, - progress: Option, + progress: Option, metrics: Arc, state_table: StateTable, rate_limit_rps: Option, diff --git a/src/stream/src/executor/backfill/no_shuffle_backfill.rs b/src/stream/src/executor/backfill/no_shuffle_backfill.rs index 761aedfa55ee3..d8de07375d721 100644 --- a/src/stream/src/executor/backfill/no_shuffle_backfill.rs +++ b/src/stream/src/executor/backfill/no_shuffle_backfill.rs @@ -30,7 +30,7 @@ use crate::executor::backfill::utils::{ METADATA_STATE_LEN, }; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | /// We can decode that into `BackfillState` on recovery. @@ -76,7 +76,7 @@ pub struct BackfillExecutor { output_indices: Vec, /// PTAL at the docstring for `CreateMviewProgress` to understand how we compute it. 
- progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, @@ -100,7 +100,7 @@ where upstream: Executor, state_table: Option>, output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, metrics: Arc, chunk_size: usize, rate_limit: Option, diff --git a/src/stream/src/executor/backfill/snapshot_backfill.rs b/src/stream/src/executor/backfill/snapshot_backfill.rs index 35adc33b81c4f..593a13df9cbcd 100644 --- a/src/stream/src/executor/backfill/snapshot_backfill.rs +++ b/src/stream/src/executor/backfill/snapshot_backfill.rs @@ -32,16 +32,18 @@ use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_storage::table::ChangeLogRow; use risingwave_storage::StateStore; use tokio::select; +use tokio::sync::mpsc; use tokio::sync::mpsc::UnboundedReceiver; use crate::executor::backfill::utils::{create_builder, mapping_chunk}; use crate::executor::monitor::StreamingMetrics; use crate::executor::prelude::{try_stream, StreamExt}; use crate::executor::{ - expect_first_barrier, ActorContextRef, BackfillExecutor, Barrier, BoxedMessageStream, Execute, - Executor, Message, Mutation, StreamExecutorError, StreamExecutorResult, + expect_first_barrier, ActorContextRef, BackfillExecutor, Barrier, BoxedMessageStream, + DispatcherBarrier, DispatcherMessage, Execute, Executor, Message, Mutation, + StreamExecutorError, StreamExecutorResult, }; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; pub struct SnapshotBackfillExecutor { /// Upstream table @@ -53,7 +55,7 @@ pub struct SnapshotBackfillExecutor { /// The column indices need to be forwarded to the downstream from the upstream and table scan. output_indices: Vec, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, chunk_size: usize, rate_limit: Option, @@ -71,7 +73,7 @@ impl SnapshotBackfillExecutor { upstream: Executor, output_indices: Vec, actor_ctx: ActorContextRef, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, chunk_size: usize, rate_limit: Option, barrier_rx: UnboundedReceiver, @@ -99,7 +101,7 @@ impl SnapshotBackfillExecutor { #[try_stream(ok = Message, error = StreamExecutorError)] async fn execute_inner(mut self) { debug!("snapshot backfill executor start"); - let mut upstream = self.upstream.execute(); + let mut upstream = erase_upstream_mutation(self.upstream.execute()); let upstream_table_id = self.upstream_table.table_id(); let first_barrier = expect_first_barrier(&mut upstream).await?; debug!(epoch = ?first_barrier.epoch, "get first upstream barrier"); @@ -109,7 +111,7 @@ impl SnapshotBackfillExecutor { { if should_backfill { - let subscriber_ids = first_barrier + let subscriber_ids = first_recv_barrier .added_subscriber_on_mv_table(upstream_table_id) .collect_vec(); let snapshot_backfill_table_fragment_id = match subscriber_ids.as_slice() { @@ -183,12 +185,15 @@ impl SnapshotBackfillExecutor { let recv_barrier = self.barrier_rx.recv().await.expect("should exist"); assert_eq!(first_barrier.epoch, recv_barrier.epoch); - yield Message::Barrier(first_barrier); + yield Message::Barrier(recv_barrier); } + let mut upstream_buffer = + upstream_buffer.start_consuming_log_store(&mut self.barrier_rx); + let mut barrier_epoch = first_barrier_epoch; - let initial_pending_barrier = upstream_buffer.barrier.len(); + let initial_pending_barrier = upstream_buffer.state.barrier_count(); info!( ?barrier_epoch, table_id = self.upstream_table.table_id().table_id, @@ -207,8 +212,6 @@ impl 
SnapshotBackfillExecutor { // Phase 2: consume upstream log store while let Some(barrier) = upstream_buffer.take_buffered_barrier().await? { - let recv_barrier = receive_next_barrier(&mut self.barrier_rx).await?; - assert_eq!(barrier.epoch, recv_barrier.epoch); assert_eq!(barrier_epoch.curr, barrier.epoch.prev); barrier_epoch = barrier.epoch; @@ -254,16 +257,20 @@ impl SnapshotBackfillExecutor { ); let first_recv_barrier = receive_next_barrier(&mut self.barrier_rx).await?; assert_eq!(first_barrier.epoch, first_recv_barrier.epoch); - yield Message::Barrier(first_barrier); + yield Message::Barrier(first_recv_barrier); } } // Phase 3: consume upstream while let Some(msg) = upstream.try_next().await? { - if let Message::Barrier(barrier) = &msg { - let recv_barrier = receive_next_barrier(&mut self.barrier_rx).await?; - assert_eq!(barrier.epoch, recv_barrier.epoch); - } - yield msg; + yield match msg { + DispatcherMessage::Chunk(chunk) => Message::Chunk(chunk), + DispatcherMessage::Watermark(watermark) => Message::Watermark(watermark), + DispatcherMessage::Barrier(barrier) => { + let recv_barrier = receive_next_barrier(&mut self.barrier_rx).await?; + assert_eq!(barrier.epoch, recv_barrier.epoch); + Message::Barrier(recv_barrier) + } + }; } } } @@ -324,101 +331,236 @@ async fn read_change_log( } } -struct UpstreamBuffer<'a> { - upstream: &'a mut BoxedMessageStream, - // newer barrier at the front - barrier: VecDeque, - consume_upstream_row_count: LabelGuardedIntCounter<3>, +trait UpstreamBufferState { + // The future must be cancellation-safe + async fn is_finished(&mut self) -> StreamExecutorResult; + fn on_upstream_barrier(&mut self, upstream_barrier: DispatcherBarrier); +} + +struct StateOfConsumingSnapshot { + pending_barriers: Vec, +} + +impl UpstreamBufferState for StateOfConsumingSnapshot { + async fn is_finished(&mut self) -> StreamExecutorResult { + // never finish when consuming snapshot + Ok(false) + } + + fn on_upstream_barrier(&mut self, upstream_barrier: DispatcherBarrier) { + self.pending_barriers.push(upstream_barrier) + } +} + +struct StateOfConsumingLogStore<'a> { + barrier_rx: &'a mut mpsc::UnboundedReceiver, + /// Barriers received from upstream but not yet received the barrier from local barrier worker + /// newer barrier at the front + upstream_pending_barriers: VecDeque, + /// Barriers received from both upstream and local barrier worker + /// newer barrier at the front + barriers: VecDeque, is_finished: bool, + current_subscriber_id: u32, + upstream_table_id: TableId, +} + +impl<'a> StateOfConsumingLogStore<'a> { + fn barrier_count(&self) -> usize { + self.upstream_pending_barriers.len() + self.barriers.len() + } + + async fn handle_one_pending_barrier(&mut self) -> StreamExecutorResult { + assert!(!self.is_finished); + let barrier = receive_next_barrier(self.barrier_rx).await?; + assert_eq!( + self.upstream_pending_barriers + .pop_back() + .expect("non-empty") + .epoch, + barrier.epoch + ); + if is_finish_barrier(&barrier, self.current_subscriber_id, self.upstream_table_id) { + self.is_finished = true; + } + Ok(barrier) + } +} + +impl<'a> UpstreamBufferState for StateOfConsumingLogStore<'a> { + async fn is_finished(&mut self) -> StreamExecutorResult { + while !self.upstream_pending_barriers.is_empty() { + let barrier = self.handle_one_pending_barrier().await?; + self.barriers.push_front(barrier); + } + if self.is_finished { + assert!(self.upstream_pending_barriers.is_empty()); + } + Ok(self.is_finished) + } + + fn on_upstream_barrier(&mut self, upstream_barrier: 
DispatcherBarrier) { + self.upstream_pending_barriers.push_front(upstream_barrier); + } +} + +mod erase_upstream_mutation { + use futures::TryStreamExt; + + use crate::executor::prelude::Stream; + use crate::executor::{BoxedMessageStream, DispatcherMessageStreamItem}; + + pub(super) fn erase_upstream_mutation(upstream: BoxedMessageStream) -> UpstreamStream { + upstream.map_ok(|msg| { + msg.map_mutation(|mutation| { + if let Some(mutation) = mutation { + // TODO: assert none mutation after we explicitly erase mutation + warn!( + ?mutation, + "receive non-empty mutation from upstream. ignored" + ); + }; + }) + }) + } + + pub(super) type UpstreamStream = impl Stream + Unpin; +} + +use erase_upstream_mutation::*; + +struct UpstreamBuffer<'a, S> { + upstream: &'a mut UpstreamStream, + state: S, + consume_upstream_row_count: LabelGuardedIntCounter<3>, upstream_table_id: TableId, current_subscriber_id: u32, } -impl<'a> UpstreamBuffer<'a> { +impl<'a> UpstreamBuffer<'a, StateOfConsumingSnapshot> { fn new( - upstream: &'a mut BoxedMessageStream, + upstream: &'a mut UpstreamStream, upstream_table_id: TableId, current_subscriber_id: u32, consume_upstream_row_count: LabelGuardedIntCounter<3>, ) -> Self { Self { upstream, - barrier: Default::default(), + state: StateOfConsumingSnapshot { + pending_barriers: vec![], + }, consume_upstream_row_count, - is_finished: false, upstream_table_id, current_subscriber_id, } } + fn start_consuming_log_store<'s>( + self, + barrier_rx: &'s mut UnboundedReceiver, + ) -> UpstreamBuffer<'a, StateOfConsumingLogStore<'s>> { + let StateOfConsumingSnapshot { pending_barriers } = self.state; + let mut upstream_pending_barriers = VecDeque::with_capacity(pending_barriers.len()); + for pending_barrier in pending_barriers { + upstream_pending_barriers.push_front(pending_barrier); + } + UpstreamBuffer { + upstream: self.upstream, + state: StateOfConsumingLogStore { + barrier_rx, + upstream_pending_barriers, + barriers: Default::default(), + is_finished: false, + current_subscriber_id: self.current_subscriber_id, + upstream_table_id: self.upstream_table_id, + }, + consume_upstream_row_count: self.consume_upstream_row_count, + upstream_table_id: self.upstream_table_id, + current_subscriber_id: self.current_subscriber_id, + } + } +} + +impl<'a, S: UpstreamBufferState> UpstreamBuffer<'a, S> { async fn concurrently_consume_upstream(&mut self) -> StreamExecutorError { - while !self.is_finished { - let result = self.consume_until_next_barrier().await; - let barrier = match result { - Ok(barrier) => barrier, - Err(e) => { - return e; - } - }; - self.barrier.push_front(barrier); + if let Err(e) = try { + while !self.state.is_finished().await? { + self.consume_until_next_barrier().await?; + } + } { + return e; } pending().await } - async fn consume_until_next_barrier(&mut self) -> StreamExecutorResult { - assert!(!self.is_finished); + /// Consume the upstream until seeing the next barrier. + /// `pending_barriers` must be non-empty after this method returns. + async fn consume_until_next_barrier(&mut self) -> StreamExecutorResult<()> { loop { - let msg: Message = self + let msg: DispatcherMessage = self .upstream .try_next() .await? 
.ok_or_else(|| anyhow!("end of upstream"))?; match msg { - Message::Chunk(chunk) => { + DispatcherMessage::Chunk(chunk) => { self.consume_upstream_row_count .inc_by(chunk.cardinality() as _); } - Message::Barrier(barrier) => { - self.is_finished = self.is_finish_barrier(&barrier); - break Ok(barrier); + DispatcherMessage::Barrier(barrier) => { + self.state.on_upstream_barrier(barrier); + break Ok(()); } - Message::Watermark(_) => {} + DispatcherMessage::Watermark(_) => {} } } } +} +impl<'a, 's> UpstreamBuffer<'a, StateOfConsumingLogStore<'s>> { async fn take_buffered_barrier(&mut self) -> StreamExecutorResult> { - Ok(if let Some(barrier) = self.barrier.pop_back() { + Ok(if let Some(barrier) = self.state.barriers.pop_back() { Some(barrier) - } else if self.is_finished { + } else if !self.state.upstream_pending_barriers.is_empty() { + let barrier = self.state.handle_one_pending_barrier().await?; + Some(barrier) + } else if self.state.is_finished { None } else { - Some(self.consume_until_next_barrier().await?) + self.consume_until_next_barrier().await?; + let barrier = self.state.handle_one_pending_barrier().await?; + Some(barrier) }) } +} - fn is_finish_barrier(&self, barrier: &Barrier) -> bool { - if let Some(Mutation::DropSubscriptions { - subscriptions_to_drop, - }) = barrier.mutation.as_deref() - { - let is_finished = subscriptions_to_drop - .iter() - .any(|(subscriber_id, _)| *subscriber_id == self.current_subscriber_id); - if is_finished { - assert!(subscriptions_to_drop.iter().any( - |(subscriber_id, subscribed_upstream_table_id)| { - *subscriber_id == self.current_subscriber_id - && self.upstream_table_id == *subscribed_upstream_table_id - } - )) - } - is_finished - } else { - false +fn is_finish_barrier( + barrier: &Barrier, + current_subscriber_id: u32, + upstream_table_id: TableId, +) -> bool { + if let Some(Mutation::DropSubscriptions { + subscriptions_to_drop, + }) = barrier.mutation.as_deref() + { + let is_finished = subscriptions_to_drop + .iter() + .any(|(subscriber_id, _)| *subscriber_id == current_subscriber_id); + if is_finished { + assert!(subscriptions_to_drop.iter().any( + |(subscriber_id, subscribed_upstream_table_id)| { + *subscriber_id == current_subscriber_id + && upstream_table_id == *subscribed_upstream_table_id + } + )) } + is_finished + } else { + false } +} +impl<'a, S: UpstreamBufferState> UpstreamBuffer<'a, S> { /// Run a future while concurrently polling the upstream so that the upstream /// won't be back-pressured. async fn run_future>( @@ -475,7 +617,7 @@ async fn make_consume_snapshot_stream<'a, S: StateStore>( rate_limit: Option, barrier_rx: &'a mut UnboundedReceiver, output_indices: &'a [usize], - mut progress: CreateMviewProgress, + mut progress: CreateMviewProgressReporter, first_recv_barrier: Barrier, ) { let mut barrier_epoch = first_recv_barrier.epoch; diff --git a/src/stream/src/executor/chain.rs b/src/stream/src/executor/chain.rs index 6f198ff2b7e12..ca06319e11bfb 100644 --- a/src/stream/src/executor/chain.rs +++ b/src/stream/src/executor/chain.rs @@ -13,7 +13,7 @@ // limitations under the License. use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// [`ChainExecutor`] is an executor that enables synchronization between the existing stream and /// newly appended executors. 
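Stepping back to the `UpstreamBuffer` changes above: while the snapshot (and later the log store) is being consumed, barriers arriving from upstream are parked, and each one is later paired with the barrier delivered by the local barrier worker, oldest first. The sketch below shows only that parking/draining shape; the type and method names are invented for illustration, and the real code carries full `DispatcherBarrier`/`Barrier` values plus a finished flag:

use std::collections::VecDeque;

/// Park upstream barriers (newest at the front) and drain them oldest-first,
/// checking each against the epoch received from the barrier worker.
struct PendingBarriers {
    upstream_pending: VecDeque<u64>,
}

impl PendingBarriers {
    fn park(&mut self, epoch: u64) {
        self.upstream_pending.push_front(epoch);
    }

    fn drain_one(&mut self, recv_epoch: u64) -> Option<u64> {
        let epoch = self.upstream_pending.pop_back()?;
        assert_eq!(epoch, recv_epoch);
        Some(epoch)
    }
}

fn main() {
    let mut buf = PendingBarriers { upstream_pending: VecDeque::new() };
    buf.park(1);
    buf.park(2);
    assert_eq!(buf.drain_one(1), Some(1)); // oldest first
    assert_eq!(buf.drain_one(2), Some(2));
    assert_eq!(buf.drain_one(3), None);
}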
Currently, [`ChainExecutor`] is mainly used to implement MV on MV @@ -24,7 +24,7 @@ pub struct ChainExecutor { upstream: Executor, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, @@ -36,7 +36,7 @@ impl ChainExecutor { pub fn new( snapshot: Executor, upstream: Executor, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, upstream_only: bool, ) -> Self { Self { @@ -115,12 +115,12 @@ mod test { use super::ChainExecutor; use crate::executor::test_utils::MockSource; use crate::executor::{AddMutation, Barrier, Execute, Message, Mutation, PkIndices}; - use crate::task::{CreateMviewProgress, LocalBarrierManager}; + use crate::task::{CreateMviewProgressReporter, LocalBarrierManager}; #[tokio::test] async fn test_basic() { let barrier_manager = LocalBarrierManager::for_test(); - let progress = CreateMviewProgress::for_test(barrier_manager); + let progress = CreateMviewProgressReporter::for_test(barrier_manager); let actor_id = progress.actor_id(); let schema = Schema::new(vec![Field::unnamed(DataType::Int64)]); diff --git a/src/stream/src/executor/dispatch.rs b/src/stream/src/executor/dispatch.rs index 82d11db49513b..bb1db4662b0d7 100644 --- a/src/stream/src/executor/dispatch.rs +++ b/src/stream/src/executor/dispatch.rs @@ -755,7 +755,8 @@ impl Dispatcher for HashDataDispatcher { let num_outputs = self.outputs.len(); // get hash value of every line by its key - let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys); + let vnode_count = self.hash_mapping.len(); + let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys, vnode_count); tracing::debug!(target: "events::stream::dispatch::hash", "\n{}\n keys {:?} => {:?}", chunk.to_pretty(), self.keys, vnodes); @@ -1102,8 +1103,8 @@ mod tests { } async fn test_hash_dispatcher_complex_inner() { - // This test only works when VirtualNode::COUNT is 256. - static_assertions::const_assert_eq!(VirtualNode::COUNT, 256); + // This test only works when vnode count is 256. + assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); let num_outputs = 2; // actor id ranges from 1 to 2 let key_indices = &[0, 2]; @@ -1118,9 +1119,9 @@ mod tests { }) .collect::>(); let mut hash_mapping = (1..num_outputs + 1) - .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT / num_outputs]) + .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT_FOR_TEST / num_outputs]) .collect_vec(); - hash_mapping.resize(VirtualNode::COUNT, num_outputs as u32); + hash_mapping.resize(VirtualNode::COUNT_FOR_TEST, num_outputs as u32); let mut hash_dispatcher = HashDataDispatcher::new( outputs, key_indices.to_vec(), @@ -1225,6 +1226,32 @@ mod tests { ) .unwrap(); + let dispatcher_updates = maplit::hashmap! 
{ + actor_id => vec![PbDispatcherUpdate { + actor_id, + dispatcher_id: broadcast_dispatcher_id, + added_downstream_actor_id: vec![new], + removed_downstream_actor_id: vec![old], + hash_mapping: Default::default(), + }] + }; + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: dispatcher_updates, + merges: Default::default(), + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let executor = Box::new(DispatchExecutor::new( input, vec![broadcast_dispatcher, simple_dispatcher], @@ -1253,27 +1280,6 @@ mod tests { .await .unwrap(); - // 4. Send a configuration change barrier for broadcast dispatcher. - let dispatcher_updates = maplit::hashmap! { - actor_id => vec![PbDispatcherUpdate { - actor_id, - dispatcher_id: broadcast_dispatcher_id, - added_downstream_actor_id: vec![new], - removed_downstream_actor_id: vec![old], - hash_mapping: Default::default(), - }] - }; - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: dispatcher_updates, - merges: Default::default(), - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - barrier_test_env.inject_barrier(&b1, [actor_id]); tx.send(Message::Barrier(b1.clone().into_dispatcher())) .await .unwrap(); @@ -1359,6 +1365,9 @@ mod tests { #[tokio::test] async fn test_hash_dispatcher() { + // This test only works when vnode count is 256. 
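The dispatcher hunk above now derives the vnode count from the length of the hash mapping instead of a compile-time constant. Schematically, a row hashes to a virtual node modulo that count, and the mapping picks the downstream actor. The sketch reduces hashing to a ready-made `u64` and only shows the modulo-and-lookup step, not `VirtualNode::compute_chunk` itself:

/// Map a row hash to a downstream actor via a vnode-to-actor mapping whose
/// length defines the vnode count.
fn dispatch(row_hash: u64, hash_mapping: &[u32]) -> u32 {
    let vnode_count = hash_mapping.len();
    let vnode = (row_hash as usize) % vnode_count;
    hash_mapping[vnode]
}

fn main() {
    // Two downstream actors, each owning half of a 256-entry mapping.
    let mut mapping = vec![1u32; 128];
    mapping.extend(vec![2u32; 128]);
    assert_eq!(dispatch(5, &mapping), 1);
    assert_eq!(dispatch(200, &mapping), 2);
}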
+ assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); + let num_outputs = 5; // actor id ranges from 1 to 5 let cardinality = 10; let dimension = 4; @@ -1374,9 +1383,9 @@ mod tests { }) .collect::>(); let mut hash_mapping = (1..num_outputs + 1) - .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT / num_outputs]) + .flat_map(|id| vec![id as ActorId; VirtualNode::COUNT_FOR_TEST / num_outputs]) .collect_vec(); - hash_mapping.resize(VirtualNode::COUNT, num_outputs as u32); + hash_mapping.resize(VirtualNode::COUNT_FOR_TEST, num_outputs as u32); let mut hash_dispatcher = HashDataDispatcher::new( outputs, key_indices.to_vec(), @@ -1410,7 +1419,7 @@ mod tests { hasher.update(&bytes); } let output_idx = - hash_mapping[hasher.finish() as usize % VirtualNode::COUNT] as usize - 1; + hash_mapping[hasher.finish() as usize % VirtualNode::COUNT_FOR_TEST] as usize - 1; for (builder, val) in builders.iter_mut().zip_eq_fast(one_row.iter()) { builder.append(Some(*val)); } diff --git a/src/stream/src/executor/error.rs b/src/stream/src/executor/error.rs index fa625d8bb8cec..66070ba81e90c 100644 --- a/src/stream/src/executor/error.rs +++ b/src/stream/src/executor/error.rs @@ -67,7 +67,12 @@ pub enum ErrorKind { ), #[error("Sink error: sink_id={1}, error: {0}")] - SinkError(SinkError, u32), + SinkError( + #[source] + #[backtrace] + SinkError, + u32, + ), #[error(transparent)] RpcError( @@ -90,7 +95,11 @@ pub enum ErrorKind { AlignBarrier(Box, Box), #[error("Connector error: {0}")] - ConnectorError(BoxedError), + ConnectorError( + #[source] + #[backtrace] + BoxedError, + ), #[error(transparent)] DmlError( diff --git a/src/stream/src/executor/exchange/input.rs b/src/stream/src/executor/exchange/input.rs index e00a0da45979a..7ecac2c625e69 100644 --- a/src/stream/src/executor/exchange/input.rs +++ b/src/stream/src/executor/exchange/input.rs @@ -15,16 +15,13 @@ use std::pin::Pin; use std::task::{Context, Poll}; -use anyhow::{anyhow, Context as _}; -use futures::pin_mut; -use futures_async_stream::try_stream; +use anyhow::anyhow; +use local_input::LocalInputStreamInner; use pin_project::pin_project; use risingwave_common::util::addr::{is_local_address, HostAddr}; -use risingwave_pb::task_service::{permits, GetStreamResponse}; use risingwave_rpc_client::ComputeClientPool; use tokio::sync::mpsc; -use super::error::ExchangeChannelClosed; use super::permit::Receiver; use crate::executor::prelude::*; use crate::executor::{DispatcherBarrier, DispatcherMessage}; @@ -64,7 +61,6 @@ pub struct LocalInput { actor_id: ActorId, } -type LocalInputStreamInner = impl MessageStream; async fn process_msg<'a>( msg: DispatcherMessage, @@ -110,7 +106,7 @@ impl LocalInput { local_barrier_manager: LocalBarrierManager, ) -> Self { Self { - inner: Self::run( + inner: local_input::run( channel, upstream_actor_id, self_actor_id, @@ -119,9 +115,36 @@ impl LocalInput { actor_id: upstream_actor_id, } } +} + +mod local_input { + use await_tree::InstrumentAwait; + + use crate::executor::exchange::error::ExchangeChannelClosed; + use crate::executor::exchange::input::process_msg; + use crate::executor::exchange::permit::Receiver; + use crate::executor::prelude::try_stream; + use crate::executor::{Message, StreamExecutorError}; + use crate::task::{ActorId, LocalBarrierManager}; + + pub(super) type LocalInputStreamInner = impl crate::executor::MessageStream; + + pub(super) fn run( + channel: Receiver, + upstream_actor_id: ActorId, + self_actor_id: ActorId, + local_barrier_manager: LocalBarrierManager, + ) -> LocalInputStreamInner { + run_inner( + channel, + 
upstream_actor_id, + self_actor_id, + local_barrier_manager, + ) + } #[try_stream(ok = Message, error = StreamExecutorError)] - async fn run( + async fn run_inner( mut channel: Receiver, upstream_actor_id: ActorId, self_actor_id: ActorId, @@ -166,7 +189,8 @@ pub struct RemoteInput { actor_id: ActorId, } -type RemoteInputStreamInner = impl MessageStream; + +use remote_input::RemoteInputStreamInner; impl RemoteInput { /// Create a remote input from compute client and related info. Should provide the corresponding @@ -184,7 +208,7 @@ impl RemoteInput { Self { actor_id, - inner: Self::run( + inner: remote_input::run( local_barrier_manager, client_pool, upstream_addr, @@ -195,9 +219,48 @@ impl RemoteInput { ), } } +} + +mod remote_input { + use std::sync::Arc; + + use anyhow::Context; + use await_tree::InstrumentAwait; + use risingwave_common::util::addr::HostAddr; + use risingwave_pb::task_service::{permits, GetStreamResponse}; + use risingwave_rpc_client::ComputeClientPool; + + use crate::executor::exchange::error::ExchangeChannelClosed; + use crate::executor::exchange::input::process_msg; + use crate::executor::monitor::StreamingMetrics; + use crate::executor::prelude::{pin_mut, try_stream, StreamExt}; + use crate::executor::{DispatcherMessage, Message, StreamExecutorError}; + use crate::task::{LocalBarrierManager, UpDownActorIds, UpDownFragmentIds}; + + pub(super) type RemoteInputStreamInner = impl crate::executor::MessageStream; + + pub(super) fn run( + local_barrier_manager: LocalBarrierManager, + client_pool: ComputeClientPool, + upstream_addr: HostAddr, + up_down_ids: UpDownActorIds, + up_down_frag: UpDownFragmentIds, + metrics: Arc, + batched_permits_limit: usize, + ) -> RemoteInputStreamInner { + run_inner( + local_barrier_manager, + client_pool, + upstream_addr, + up_down_ids, + up_down_frag, + metrics, + batched_permits_limit, + ) + } #[try_stream(ok = Message, error = StreamExecutorError)] - async fn run( + async fn run_inner( local_barrier_manager: LocalBarrierManager, client_pool: ComputeClientPool, upstream_addr: HostAddr, diff --git a/src/stream/src/executor/hash_join.rs b/src/stream/src/executor/hash_join.rs index e1a1b177bcfcc..e23c17724be02 100644 --- a/src/stream/src/executor/hash_join.rs +++ b/src/stream/src/executor/hash_join.rs @@ -396,6 +396,7 @@ impl HashJoinExecutor HashJoinExecutor> = vec![]; // input and output channels of the local aggregation actors let mut inputs = vec![]; @@ -113,7 +115,7 @@ async fn test_merger_sum_aggr() { let (tx, rx) = channel_for_test(); let (actor, channel) = make_actor(rx); outputs.push(channel); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); inputs.push(Box::new(LocalOutput::new(233, tx)) as BoxedOutput); } @@ -154,7 +156,7 @@ async fn test_merger_sum_aggr() { .local_barrier_manager .clone(), ); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); let actor_ctx = ActorContext::for_test(gen_next_actor_id()); @@ -192,6 +194,7 @@ async fn test_merger_sum_aggr() { 2, // row_count_index vec![], 2, + false, ) .await; @@ -224,11 +227,21 @@ async fn test_merger_sum_aggr() { .local_barrier_manager .clone(), ); - handles.push(tokio::spawn(actor.run())); + actor_futures.push(actor.run().boxed()); let mut epoch = test_epoch(1); let b1 = Barrier::new_test_barrier(epoch); barrier_test_env.inject_barrier(&b1, actors.clone()); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let handles = actor_futures + .into_iter() + 
.map(|actor_future| tokio::spawn(actor_future)) + .collect_vec(); + input .send(Message::Barrier(b1.into_dispatcher())) .await diff --git a/src/stream/src/executor/join/hash_join.rs b/src/stream/src/executor/join/hash_join.rs index 10e9e26f784cd..33f93ceb3c682 100644 --- a/src/stream/src/executor/join/hash_join.rs +++ b/src/stream/src/executor/join/hash_join.rs @@ -14,13 +14,14 @@ use std::alloc::Global; use std::cmp::Ordering; -use std::ops::{Bound, Deref, DerefMut}; +use std::ops::{Bound, Deref, DerefMut, RangeBounds}; use std::sync::Arc; -use anyhow::Context; +use anyhow::{anyhow, Context}; use futures::future::{join, try_join}; use futures::{pin_mut, stream, StreamExt}; use futures_async_stream::for_await; +use join_row_set::JoinRowSet; use local_stats_alloc::{SharedStatsAlloc, StatsAlloc}; use risingwave_common::bitmap::Bitmap; use risingwave_common::hash::{HashKey, PrecomputedBuildHasher}; @@ -34,6 +35,7 @@ use risingwave_common::util::sort_util::OrderType; use risingwave_common_estimate_size::EstimateSize; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::StateStore; +use thiserror_ext::AsReport; use super::row::{DegreeType, EncodedJoinRow}; use crate::cache::ManagedLruCache; @@ -47,6 +49,7 @@ use crate::task::{ActorId, AtomicU64Ref, FragmentId}; /// Memcomparable encoding. type PkType = Vec; +type InequalKeyType = Vec; pub type StateValueType = EncodedJoinRow; pub type HashValueType = Box; @@ -154,6 +157,21 @@ impl JoinHashMapMetrics { } } +/// Inequality key description for `AsOf` join. +struct InequalityKeyDesc { + idx: usize, + serializer: OrderedRowSerde, +} + +impl InequalityKeyDesc { + /// Serialize the inequality key from a row. + pub fn serialize_inequal_key_from_row(&self, row: impl Row) -> InequalKeyType { + let indices = vec![self.idx]; + let inequality_key = row.project(&indices); + inequality_key.memcmp_serialize(&self.serializer) + } +} + pub struct JoinHashMap { /// Store the join states. inner: JoinHashMapInner, @@ -182,6 +200,8 @@ pub struct JoinHashMap { need_degree_table: bool, /// Pk is part of the join key. pk_contained_in_jk: bool, + /// Inequality key description for `AsOf` join. 
+ inequality_key_desc: Option, /// Metrics of the hash map metrics: JoinHashMapMetrics, } @@ -230,6 +250,7 @@ impl JoinHashMap { null_matched: K::Bitmap, need_degree_table: bool, pk_contained_in_jk: bool, + inequality_key_idx: Option, metrics: Arc, actor_id: ActorId, fragment_id: FragmentId, @@ -246,6 +267,14 @@ impl JoinHashMap { vec![OrderType::ascending(); state_pk_indices.len()], ); + let inequality_key_desc = inequality_key_idx.map(|idx| { + let serializer = OrderedRowSerde::new( + vec![state_all_data_types[idx].clone()], + vec![OrderType::ascending()], + ); + InequalityKeyDesc { idx, serializer } + }); + let join_table_id = state_table.table_id(); let state = TableInner { pk_indices: state_pk_indices, @@ -286,6 +315,7 @@ impl JoinHashMap { degree_state, need_degree_table, pk_contained_in_jk, + inequality_key_desc, metrics: JoinHashMapMetrics::new(&metrics, actor_id, fragment_id, side, join_table_id), } } @@ -427,11 +457,16 @@ impl JoinHashMap { let degree_i64 = degree_row .datum_at(degree_row.len() - 1) .expect("degree should not be NULL"); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(row.row())); entry_state .insert( pk, JoinRow::new(row.row(), degree_i64.into_int64() as u64) .encode(), + inequality_key, ) .with_context(|| self.state.error_context(row.row()))?; } @@ -459,6 +494,10 @@ impl JoinHashMap { .as_ref() .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(row.row())); let degree_i64 = degree_row .datum_at(degree_row.len() - 1) .expect("degree should not be NULL"); @@ -466,6 +505,7 @@ impl JoinHashMap { .insert( pk, JoinRow::new(row.row(), degree_i64.into_int64() as u64).encode(), + inequality_key, ) .with_context(|| self.state.error_context(row.row()))?; } @@ -486,8 +526,12 @@ impl JoinHashMap { .as_ref() .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(row.row())); entry_state - .insert(pk, JoinRow::new(row.row(), 0).encode()) + .insert(pk, JoinRow::new(row.row(), 0).encode(), inequality_key) .with_context(|| self.state.error_context(row.row()))?; } }; @@ -511,9 +555,12 @@ impl JoinHashMap { /// Insert a join row #[allow(clippy::unused_async)] pub async fn insert(&mut self, key: &K, value: JoinRow) -> StreamExecutorResult<()> { - let pk = (&value.row) - .project(&self.state.pk_indices) - .memcmp_serialize(&self.pk_serializer); + let pk = self.serialize_pk_from_row(&value.row); + + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value.row)); // TODO(yuhao): avoid this `contains`. // https://github.com/risingwavelabs/risingwave/issues/9233 @@ -521,14 +568,14 @@ impl JoinHashMap { // Update cache let mut entry = self.inner.get_mut(key).unwrap(); entry - .insert(pk, value.encode()) + .insert(pk, value.encode(), inequality_key) .with_context(|| self.state.error_context(&value.row))?; } else if self.pk_contained_in_jk { // Refill cache when the join key exist in neither cache or storage. 
self.metrics.insert_cache_miss_count += 1; let mut state = JoinEntryState::default(); state - .insert(pk, value.encode()) + .insert(pk, value.encode(), inequality_key) .with_context(|| self.state.error_context(&value.row))?; self.update_state(key, state.into()); } @@ -545,24 +592,25 @@ impl JoinHashMap { #[allow(clippy::unused_async)] pub async fn insert_row(&mut self, key: &K, value: impl Row) -> StreamExecutorResult<()> { let join_row = JoinRow::new(&value, 0); - let pk = (&value) - .project(&self.state.pk_indices) - .memcmp_serialize(&self.pk_serializer); - + let pk = self.serialize_pk_from_row(&value); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value)); // TODO(yuhao): avoid this `contains`. // https://github.com/risingwavelabs/risingwave/issues/9233 if self.inner.contains(key) { // Update cache let mut entry = self.inner.get_mut(key).unwrap(); entry - .insert(pk, join_row.encode()) + .insert(pk, join_row.encode(), inequality_key) .with_context(|| self.state.error_context(&value))?; } else if self.pk_contained_in_jk { // Refill cache when the join key exist in neither cache or storage. self.metrics.insert_cache_miss_count += 1; let mut state = JoinEntryState::default(); state - .insert(pk, join_row.encode()) + .insert(pk, join_row.encode(), inequality_key) .with_context(|| self.state.error_context(&value))?; self.update_state(key, state.into()); } @@ -578,8 +626,12 @@ impl JoinHashMap { let pk = (&value.row) .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value.row)); entry - .remove(pk) + .remove(pk, inequality_key.as_ref()) .with_context(|| self.state.error_context(&value.row))?; } @@ -597,8 +649,13 @@ impl JoinHashMap { let pk = (&value) .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value)); entry - .remove(pk) + .remove(pk, inequality_key.as_ref()) .with_context(|| self.state.error_context(&value))?; } @@ -680,6 +737,29 @@ impl JoinHashMap { pub fn join_key_data_types(&self) -> &[DataType] { &self.join_key_data_types } + + /// Return true if the inequality key is null. + /// # Panics + /// Panics if the inequality key is not set. + pub fn check_inequal_key_null(&self, row: &impl Row) -> bool { + let desc = self.inequality_key_desc.as_ref().unwrap(); + row.datum_at(desc.idx).is_none() + } + + /// Serialize the inequality key from a row. + /// # Panics + /// Panics if the inequality key is not set. + pub fn serialize_inequal_key_from_row(&self, row: impl Row) -> InequalKeyType { + self.inequality_key_desc + .as_ref() + .unwrap() + .serialize_inequal_key_from_row(&row) + } + + pub fn serialize_pk_from_row(&self, row: impl Row) -> PkType { + row.project(&self.state.pk_indices) + .memcmp_serialize(&self.pk_serializer) + } } use risingwave_common_estimate_size::KvSize; @@ -695,7 +775,9 @@ use super::*; #[derive(Default)] pub struct JoinEntryState { /// The full copy of the state. - cached: join_row_set::JoinRowSet, + cached: JoinRowSet, + /// Index used for AS OF join. The key is inequal column value. The value is the primary key in `cached`. 
+ inequality_index: JoinRowSet>, kv_heap_size: KvSize, } @@ -710,9 +792,11 @@ impl EstimateSize for JoinEntryState { #[derive(Error, Debug)] pub enum JoinEntryError { #[error("double inserting a join state entry")] - OccupiedError, + Occupied, #[error("removing a join state entry but it is not in the cache")] - RemoveError, + Remove, + #[error("retrieving a pk from the inequality index but it is not in the cache")] + InequalIndex, } impl JoinEntryState { @@ -721,11 +805,15 @@ impl JoinEntryState { &mut self, key: PkType, value: StateValueType, + inequality_key: Option, ) -> Result<&mut StateValueType, JoinEntryError> { let mut removed = false; if !enable_strict_consistency() { // strict consistency is off, let's remove existing (if any) first if let Some(old_value) = self.cached.remove(&key) { + if let Some(inequality_key) = inequality_key.as_ref() { + self.remove_pk_from_inequality_index(&key, inequality_key); + } self.kv_heap_size.sub(&key, &old_value); removed = true; } @@ -733,6 +821,9 @@ impl JoinEntryState { self.kv_heap_size.add(&key, &value); + if let Some(inequality_key) = inequality_key { + self.insert_pk_to_inequality_index(key.clone(), inequality_key); + } let ret = self.cached.try_insert(key.clone(), value); if !enable_strict_consistency() { @@ -743,22 +834,77 @@ impl JoinEntryState { } } - ret.map_err(|_| JoinEntryError::OccupiedError) + ret.map_err(|_| JoinEntryError::Occupied) } /// Delete from the cache. - pub fn remove(&mut self, pk: PkType) -> Result<(), JoinEntryError> { + pub fn remove( + &mut self, + pk: PkType, + inequality_key: Option<&InequalKeyType>, + ) -> Result<(), JoinEntryError> { if let Some(value) = self.cached.remove(&pk) { self.kv_heap_size.sub(&pk, &value); + if let Some(inequality_key) = inequality_key { + self.remove_pk_from_inequality_index(&pk, inequality_key); + } Ok(()) } else if enable_strict_consistency() { - Err(JoinEntryError::RemoveError) + Err(JoinEntryError::Remove) } else { consistency_error!(?pk, "removing a join state entry but it's not in the cache"); Ok(()) } } + fn remove_pk_from_inequality_index(&mut self, pk: &PkType, inequality_key: &InequalKeyType) { + if let Some(pk_set) = self.inequality_index.get_mut(inequality_key) { + if pk_set.remove(pk).is_none() { + if enable_strict_consistency() { + panic!("removing a pk that it not in the inequality index"); + } else { + consistency_error!(?pk, "removing a pk that it not in the inequality index"); + }; + } else { + self.kv_heap_size.sub(pk, &()); + } + if pk_set.is_empty() { + self.inequality_index.remove(inequality_key); + } + } + } + + fn insert_pk_to_inequality_index(&mut self, pk: PkType, inequality_key: InequalKeyType) { + if let Some(pk_set) = self.inequality_index.get_mut(&inequality_key) { + let pk_size = pk.estimated_size(); + if pk_set.try_insert(pk, ()).is_err() { + if enable_strict_consistency() { + panic!("inserting a pk that it already in the inequality index"); + } else { + consistency_error!("inserting a pk that it already in the inequality index"); + }; + } else { + self.kv_heap_size.add_size(pk_size); + } + } else { + let mut pk_set = JoinRowSet::default(); + pk_set.try_insert(pk, ()).unwrap(); + self.inequality_index + .try_insert(inequality_key, pk_set) + .unwrap(); + } + } + + pub fn get( + &self, + pk: &PkType, + data_types: &[DataType], + ) -> Option>> { + self.cached + .get(pk) + .map(|encoded| encoded.decode(data_types)) + } + /// Note: the first item in the tuple is the mutable reference to the value in this entry, while /// the second item is the decoded value. 
To mutate the degree, one **must not** forget to apply /// the changes to the first item. @@ -782,6 +928,92 @@ impl JoinEntryState { pub fn len(&self) -> usize { self.cached.len() } + + /// Range scan the cache using the inequality index. + pub fn range_by_inequality<'a, R>( + &'a self, + range: R, + data_types: &'a [DataType], + ) -> impl Iterator>> + 'a + where + R: RangeBounds + 'a, + { + self.inequality_index.range(range).flat_map(|(_, pk_set)| { + pk_set + .keys() + .flat_map(|pk| self.get_by_indexed_pk(pk, data_types)) + }) + } + + /// Get the records whose inequality key upper bound satisfy the given bound. + pub fn upper_bound_by_inequality<'a>( + &'a self, + bound: Bound<&InequalKeyType>, + data_types: &'a [DataType], + ) -> Option>> { + if let Some((_, pk_set)) = self.inequality_index.upper_bound(bound) { + if let Some(pk) = pk_set.first_key_sorted() { + self.get_by_indexed_pk(pk, data_types) + } else { + panic!("pk set for a index record must has at least one element"); + } + } else { + None + } + } + + pub fn get_by_indexed_pk( + &self, + pk: &PkType, + data_types: &[DataType], + ) -> Option>> +where { + if let Some(value) = self.cached.get(pk) { + Some(value.decode(data_types)) + } else if enable_strict_consistency() { + Some(Err(anyhow!(JoinEntryError::InequalIndex).into())) + } else { + consistency_error!(?pk, "{}", JoinEntryError::InequalIndex.as_report()); + None + } + } + + /// Get the records whose inequality key lower bound satisfy the given bound. + pub fn lower_bound_by_inequality<'a>( + &'a self, + bound: Bound<&InequalKeyType>, + data_types: &'a [DataType], + ) -> Option>> { + if let Some((_, pk_set)) = self.inequality_index.lower_bound(bound) { + if let Some(pk) = pk_set.first_key_sorted() { + self.get_by_indexed_pk(pk, data_types) + } else { + panic!("pk set for a index record must has at least one element"); + } + } else { + None + } + } + + pub fn get_first_by_inequality<'a>( + &'a self, + inequality_key: &InequalKeyType, + data_types: &'a [DataType], + ) -> Option>> { + if let Some(pk_set) = self.inequality_index.get(inequality_key) { + if let Some(pk) = pk_set.first_key_sorted() { + self.get_by_indexed_pk(pk, data_types) + } else { + panic!("pk set for a index record must has at least one element"); + } + } else { + None + } + } + + pub fn inequality_index(&self) -> &JoinRowSet> { + &self.inequality_index + } } #[cfg(test)] @@ -795,16 +1027,36 @@ mod tests { fn insert_chunk( managed_state: &mut JoinEntryState, pk_indices: &[usize], + col_types: &[DataType], + inequality_key_idx: Option, data_chunk: &DataChunk, ) { + let pk_col_type = pk_indices + .iter() + .map(|idx| col_types[*idx].clone()) + .collect_vec(); + let pk_serializer = + OrderedRowSerde::new(pk_col_type, vec![OrderType::ascending(); pk_indices.len()]); + let inequality_key_type = inequality_key_idx.map(|idx| col_types[idx].clone()); + let inequality_key_serializer = inequality_key_type + .map(|data_type| OrderedRowSerde::new(vec![data_type], vec![OrderType::ascending()])); for row_ref in data_chunk.rows() { let row: OwnedRow = row_ref.into_owned_row(); let value_indices = (0..row.len() - 1).collect_vec(); let pk = pk_indices.iter().map(|idx| row[*idx].clone()).collect_vec(); // Pk is only a `i64` here, so encoding method does not matter. 
- let pk = OwnedRow::new(pk).project(&value_indices).value_serialize(); + let pk = OwnedRow::new(pk) + .project(&value_indices) + .memcmp_serialize(&pk_serializer); + let inequality_key = inequality_key_idx.map(|idx| { + (&row) + .project(&[idx]) + .memcmp_serialize(inequality_key_serializer.as_ref().unwrap()) + }); let join_row = JoinRow { row, degree: 0 }; - managed_state.insert(pk, join_row.encode()).unwrap(); + managed_state + .insert(pk, join_row.encode(), inequality_key) + .unwrap(); } } @@ -826,7 +1078,7 @@ mod tests { } #[tokio::test] - async fn test_managed_all_or_none_state() { + async fn test_managed_join_state() { let mut managed_state = JoinEntryState::default(); let col_types = vec![DataType::Int64, DataType::Int64]; let pk_indices = [0]; @@ -841,7 +1093,13 @@ mod tests { ); // `Vec` in state - insert_chunk(&mut managed_state, &pk_indices, &data_chunk1); + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + None, + &data_chunk1, + ); check(&mut managed_state, &col_types, &col1, &col2); // `BtreeMap` in state @@ -852,7 +1110,76 @@ mod tests { 5 8 4 9", ); - insert_chunk(&mut managed_state, &pk_indices, &data_chunk2); + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + None, + &data_chunk2, + ); check(&mut managed_state, &col_types, &col1, &col2); } + + #[tokio::test] + async fn test_managed_join_state_w_inequality_index() { + let mut managed_state = JoinEntryState::default(); + let col_types = vec![DataType::Int64, DataType::Int64]; + let pk_indices = [0]; + let inequality_key_idx = Some(1); + let inequality_key_serializer = + OrderedRowSerde::new(vec![DataType::Int64], vec![OrderType::ascending()]); + + let col1 = [3, 2, 1]; + let col2 = [4, 5, 5]; + let data_chunk1 = DataChunk::from_pretty( + "I I + 3 4 + 2 5 + 1 5", + ); + + // `Vec` in state + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + inequality_key_idx, + &data_chunk1, + ); + check(&mut managed_state, &col_types, &col1, &col2); + let bound = OwnedRow::new(vec![Some(ScalarImpl::Int64(5))]) + .memcmp_serialize(&inequality_key_serializer); + let row = managed_state + .upper_bound_by_inequality(Bound::Included(&bound), &col_types) + .unwrap() + .unwrap(); + assert_eq!(row.row[0], Some(ScalarImpl::Int64(1))); + let row = managed_state + .upper_bound_by_inequality(Bound::Excluded(&bound), &col_types) + .unwrap() + .unwrap(); + assert_eq!(row.row[0], Some(ScalarImpl::Int64(3))); + + // `BtreeMap` in state + let col1 = [1, 2, 3, 4, 5]; + let col2 = [5, 5, 4, 4, 8]; + let data_chunk2 = DataChunk::from_pretty( + "I I + 5 8 + 4 4", + ); + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + inequality_key_idx, + &data_chunk2, + ); + check(&mut managed_state, &col_types, &col1, &col2); + + let bound = OwnedRow::new(vec![Some(ScalarImpl::Int64(8))]) + .memcmp_serialize(&inequality_key_serializer); + let row = managed_state.lower_bound_by_inequality(Bound::Excluded(&bound), &col_types); + assert!(row.is_none()); + } } diff --git a/src/stream/src/executor/join/join_row_set.rs b/src/stream/src/executor/join/join_row_set.rs index de6f5ce2f0279..b34e163410eec 100644 --- a/src/stream/src/executor/join/join_row_set.rs +++ b/src/stream/src/executor/join/join_row_set.rs @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
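For readers tracking the `JoinEntryState` changes above: the new `inequality_index` is a two-level map from the memcmp-encoded inequality key to the set of primary keys whose cached rows carry that key, and the `JoinRowSet` additions below supply the bound lookups it needs. A minimal standalone sketch of that shape, using plain `BTreeMap`/`BTreeSet` and invented names instead of the real `JoinRowSet` and encoded row types:

use std::collections::{BTreeMap, BTreeSet};
use std::ops::Bound;

type Pk = Vec<u8>; // stand-in for a memcmp-encoded primary key
type InequalKey = Vec<u8>; // stand-in for the memcmp-encoded inequality column

#[derive(Default)]
struct InequalityIndexSketch {
    // inequality key -> pks of all cached rows having that key
    index: BTreeMap<InequalKey, BTreeSet<Pk>>,
}

impl InequalityIndexSketch {
    fn insert(&mut self, key: InequalKey, pk: Pk) {
        self.index.entry(key).or_default().insert(pk);
    }

    fn remove(&mut self, key: &InequalKey, pk: &Pk) {
        if let Some(pks) = self.index.get_mut(key) {
            pks.remove(pk);
            if pks.is_empty() {
                // drop the key so bound lookups skip exhausted entries
                self.index.remove(key);
            }
        }
    }

    // Largest key satisfying the upper bound, like `upper_bound_by_inequality`.
    fn upper_bound(&self, bound: Bound<&InequalKey>) -> Option<(&InequalKey, &BTreeSet<Pk>)> {
        self.index.range((Bound::Unbounded, bound)).next_back()
    }
}

fn main() {
    let mut idx = InequalityIndexSketch::default();
    idx.insert(b"5".to_vec(), b"pk1".to_vec());
    idx.insert(b"5".to_vec(), b"pk2".to_vec());
    idx.insert(b"4".to_vec(), b"pk3".to_vec());
    // An AsOf lookup "<= 5" finds the entry for key 5; "< 5" falls back to key 4.
    let (k, pks) = idx.upper_bound(Bound::Included(&b"5".to_vec())).unwrap();
    assert_eq!((k.as_slice(), pks.len()), (b"5".as_slice(), 2));
    let (k, _) = idx.upper_bound(Bound::Excluded(&b"5".to_vec())).unwrap();
    assert_eq!(k.as_slice(), b"4");
    idx.remove(&b"4".to_vec(), &b"pk3".to_vec());
    assert!(idx.upper_bound(Bound::Excluded(&b"5".to_vec())).is_none());
}

Keeping the per-key pk sets non-empty (removing the entry once the last pk is gone) is what lets the upper/lower bound lookups skip keys that no longer have any cached rows.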
+use std::borrow::Borrow; use std::collections::btree_map::OccupiedError as BTreeMapOccupiedError; use std::collections::BTreeMap; use std::fmt::Debug; use std::mem; +use std::ops::{Bound, RangeBounds}; use auto_enums::auto_enum; use enum_as_inner::EnumAsInner; @@ -110,6 +112,13 @@ impl JoinRowSet { } } + pub fn is_empty(&self) -> bool { + match self { + Self::BTree(inner) => inner.is_empty(), + Self::Vec(inner) => inner.is_empty(), + } + } + #[auto_enum(Iterator)] pub fn values_mut(&mut self) -> impl Iterator { match self { @@ -117,4 +126,161 @@ impl JoinRowSet { Self::Vec(inner) => inner.iter_mut().map(|(_, v)| v), } } + + #[auto_enum(Iterator)] + pub fn keys(&self) -> impl Iterator { + match self { + Self::BTree(inner) => inner.keys(), + Self::Vec(inner) => inner.iter().map(|(k, _v)| k), + } + } + + #[auto_enum(Iterator)] + pub fn range(&self, range: R) -> impl Iterator + where + T: Ord + ?Sized, + K: Borrow + Ord, + R: RangeBounds, + { + match self { + Self::BTree(inner) => inner.range(range), + Self::Vec(inner) => inner + .iter() + .filter(move |(k, _)| range.contains(k.borrow())) + .map(|(k, v)| (k, v)), + } + } + + pub fn lower_bound_key(&self, bound: Bound<&K>) -> Option<&K> { + self.lower_bound(bound).map(|(k, _v)| k) + } + + pub fn upper_bound_key(&self, bound: Bound<&K>) -> Option<&K> { + self.upper_bound(bound).map(|(k, _v)| k) + } + + pub fn lower_bound(&self, bound: Bound<&K>) -> Option<(&K, &V)> { + match self { + Self::BTree(inner) => inner.lower_bound(bound).next(), + Self::Vec(inner) => inner + .iter() + .filter(|(k, _)| (bound, Bound::Unbounded).contains(k)) + .min_by_key(|(k, _)| k) + .map(|(k, v)| (k, v)), + } + } + + pub fn upper_bound(&self, bound: Bound<&K>) -> Option<(&K, &V)> { + match self { + Self::BTree(inner) => inner.upper_bound(bound).prev(), + Self::Vec(inner) => inner + .iter() + .filter(|(k, _)| (Bound::Unbounded, bound).contains(k)) + .max_by_key(|(k, _)| k) + .map(|(k, v)| (k, v)), + } + } + + pub fn get_mut(&mut self, key: &K) -> Option<&mut V> { + match self { + Self::BTree(inner) => inner.get_mut(key), + Self::Vec(inner) => inner.iter_mut().find(|(k, _)| k == key).map(|(_, v)| v), + } + } + + pub fn get(&self, key: &K) -> Option<&V> { + match self { + Self::BTree(inner) => inner.get(key), + Self::Vec(inner) => inner.iter().find(|(k, _)| k == key).map(|(_, v)| v), + } + } + + /// Returns the key-value pair with smallest key in the map. + pub fn first_key_sorted(&self) -> Option<&K> { + match self { + Self::BTree(inner) => inner.first_key_value().map(|(k, _)| k), + Self::Vec(inner) => inner.iter().map(|(k, _)| k).min(), + } + } + + /// Returns the key-value pair with the second smallest key in the map. 
+ pub fn second_key_sorted(&self) -> Option<&K> { + match self { + Self::BTree(inner) => inner.iter().nth(1).map(|(k, _)| k), + Self::Vec(inner) => { + let mut res = None; + let mut smallest = None; + for (k, _) in inner { + if let Some(smallest_k) = smallest { + if k < smallest_k { + res = Some(smallest_k); + smallest = Some(k); + } else if let Some(res_k) = res { + if k < res_k { + res = Some(k); + } + } else { + res = Some(k); + } + } else { + smallest = Some(k); + } + } + res + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_join_row_set_bounds() { + let mut join_row_set: JoinRowSet = JoinRowSet::default(); + + // Insert elements + assert!(join_row_set.try_insert(1, 10).is_ok()); + assert!(join_row_set.try_insert(2, 20).is_ok()); + assert!(join_row_set.try_insert(3, 30).is_ok()); + + // Check lower bound + assert_eq!(join_row_set.lower_bound_key(Bound::Included(&2)), Some(&2)); + assert_eq!(join_row_set.lower_bound_key(Bound::Excluded(&2)), Some(&3)); + + // Check upper bound + assert_eq!(join_row_set.upper_bound_key(Bound::Included(&2)), Some(&2)); + assert_eq!(join_row_set.upper_bound_key(Bound::Excluded(&2)), Some(&1)); + } + + #[test] + fn test_join_row_set_first_and_second_key_sorted() { + { + let mut join_row_set: JoinRowSet = JoinRowSet::default(); + + // Insert elements + assert!(join_row_set.try_insert(3, 30).is_ok()); + assert!(join_row_set.try_insert(1, 10).is_ok()); + assert!(join_row_set.try_insert(2, 20).is_ok()); + + // Check first key sorted + assert_eq!(join_row_set.first_key_sorted(), Some(&1)); + + // Check second key sorted + assert_eq!(join_row_set.second_key_sorted(), Some(&2)); + } + { + let mut join_row_set: JoinRowSet = JoinRowSet::default(); + + // Insert elements + assert!(join_row_set.try_insert(1, 10).is_ok()); + assert!(join_row_set.try_insert(2, 20).is_ok()); + + // Check first key sorted + assert_eq!(join_row_set.first_key_sorted(), Some(&1)); + + // Check second key sorted + assert_eq!(join_row_set.second_key_sorted(), Some(&2)); + } + } } diff --git a/src/stream/src/executor/join/mod.rs b/src/stream/src/executor/join/mod.rs index b8bd5ff84d95f..ea53a7992f265 100644 --- a/src/stream/src/executor/join/mod.rs +++ b/src/stream/src/executor/join/mod.rs @@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
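The `Vec` arm of `second_key_sorted` above finds the second-smallest key in one pass by tracking the current minimum and the runner-up. The same scan, rendered standalone over plain integer keys (illustrative only, not the map's API):

/// Single-pass second-smallest scan mirroring the `Vec` arm above.
fn second_smallest(keys: &[i32]) -> Option<i32> {
    let mut smallest: Option<i32> = None;
    let mut second: Option<i32> = None;
    for &k in keys {
        match smallest {
            None => smallest = Some(k),
            Some(s) if k < s => {
                // the old minimum is demoted to second place
                second = smallest;
                smallest = Some(k);
            }
            Some(_) => match second {
                None => second = Some(k),
                Some(r) if k < r => second = Some(k),
                Some(_) => {}
            },
        }
    }
    second
}

fn main() {
    assert_eq!(second_smallest(&[3, 1, 2]), Some(2));
    assert_eq!(second_smallest(&[1]), None);
    assert_eq!(second_smallest(&[]), None);
}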
+use risingwave_expr::bail; +use risingwave_pb::plan_common::{AsOfJoinDesc, AsOfJoinInequalityType}; + +use crate::error::StreamResult; + pub mod builder; pub mod hash_join; pub mod join_row_set; @@ -35,6 +40,15 @@ pub mod JoinType { pub const RightAnti: JoinTypePrimitive = 7; } +pub type AsOfJoinTypePrimitive = u8; + +#[allow(non_snake_case, non_upper_case_globals)] +pub mod AsOfJoinType { + use super::AsOfJoinTypePrimitive; + pub const Inner: AsOfJoinTypePrimitive = 0; + pub const LeftOuter: AsOfJoinTypePrimitive = 1; +} + pub type SideTypePrimitive = u8; #[allow(non_snake_case, non_upper_case_globals)] pub mod SideType { @@ -43,6 +57,38 @@ pub mod SideType { pub const Right: SideTypePrimitive = 1; } +pub enum AsOfInequalityType { + Le, + Lt, + Ge, + Gt, +} + +pub struct AsOfDesc { + pub left_idx: usize, + pub right_idx: usize, + pub inequality_type: AsOfInequalityType, +} + +impl AsOfDesc { + pub fn from_protobuf(desc_proto: &AsOfJoinDesc) -> StreamResult { + let typ = match desc_proto.inequality_type() { + AsOfJoinInequalityType::AsOfInequalityTypeLt => AsOfInequalityType::Lt, + AsOfJoinInequalityType::AsOfInequalityTypeLe => AsOfInequalityType::Le, + AsOfJoinInequalityType::AsOfInequalityTypeGt => AsOfInequalityType::Gt, + AsOfJoinInequalityType::AsOfInequalityTypeGe => AsOfInequalityType::Ge, + AsOfJoinInequalityType::AsOfInequalityTypeUnspecified => { + bail!("unspecified AsOf join inequality type") + } + }; + Ok(Self { + left_idx: desc_proto.left_idx as usize, + right_idx: desc_proto.right_idx as usize, + inequality_type: typ, + }) + } +} + pub const fn is_outer_side(join_type: JoinTypePrimitive, side_type: SideTypePrimitive) -> bool { join_type == JoinType::FullOuter || (join_type == JoinType::LeftOuter && side_type == SideType::Left) @@ -106,3 +152,7 @@ pub const fn need_right_degree(join_type: JoinTypePrimitive) -> bool { || join_type == JoinType::RightAnti || join_type == JoinType::RightSemi } + +pub const fn is_as_of_left_outer(join_type: AsOfJoinTypePrimitive) -> bool { + join_type == AsOfJoinType::LeftOuter +} diff --git a/src/stream/src/executor/merge.rs b/src/stream/src/executor/merge.rs index 393b800895151..d45d75604fa57 100644 --- a/src/stream/src/executor/merge.rs +++ b/src/stream/src/executor/merge.rs @@ -531,6 +531,11 @@ mod tests { let b2 = Barrier::with_prev_epoch_for_test(test_epoch(1000), *prev_epoch) .with_mutation(Mutation::Stop(HashSet::default())); barrier_test_env.inject_barrier(&b2, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; for (tx_id, tx) in txs.into_iter().enumerate() { let epochs = epochs.clone(); @@ -634,6 +639,33 @@ mod tests { .try_collect() .unwrap(); + let merge_updates = maplit::hashmap! 
{ + (actor_id, upstream_fragment_id) => MergeUpdate { + actor_id, + upstream_fragment_id, + new_upstream_fragment_id: None, + added_upstream_actor_id: vec![new], + removed_upstream_actor_id: vec![old], + } + }; + + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: Default::default(), + merges: merge_updates, + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let mut merge = MergeExecutor::new( ActorContext::for_test(actor_id), fragment_id, @@ -682,28 +714,6 @@ mod tests { recv!().unwrap().as_chunk().unwrap(); assert_recv_pending!(); - // 4. Send a configuration change barrier. - let merge_updates = maplit::hashmap! { - (actor_id, upstream_fragment_id) => MergeUpdate { - actor_id, - upstream_fragment_id, - new_upstream_fragment_id: None, - added_upstream_actor_id: vec![new], - removed_upstream_actor_id: vec![old], - } - }; - - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: Default::default(), - merges: merge_updates, - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - barrier_test_env.inject_barrier(&b1, [actor_id]); send!( [untouched, old], Message::Barrier(b1.clone().into_dispatcher()) diff --git a/src/stream/src/executor/mod.rs b/src/stream/src/executor/mod.rs index 7b22a48a25ab6..3d1ca35b6d610 100644 --- a/src/stream/src/executor/mod.rs +++ b/src/stream/src/executor/mod.rs @@ -57,6 +57,7 @@ pub mod monitor; pub mod agg_common; pub mod aggregation; +pub mod asof_join; mod backfill; mod barrier_recv; mod batch_query; @@ -133,7 +134,7 @@ pub use filter::FilterExecutor; pub use hash_agg::HashAggExecutor; pub use hash_join::*; pub use hop_window::HopWindowExecutor; -pub use join::JoinType; +pub use join::{AsOfDesc, AsOfJoinType, JoinType}; pub use lookup::*; pub use lookup_union::LookupUnionExecutor; pub use merge::MergeExecutor; @@ -164,13 +165,17 @@ pub use wrapper::WrapperExecutor; use self::barrier_align::AlignedMessageStream; -pub type MessageStreamItem = StreamExecutorResult; +pub type MessageStreamItemInner = StreamExecutorResult>; +pub type MessageStreamItem = MessageStreamItemInner; +pub type DispatcherMessageStreamItem = MessageStreamItemInner<()>; pub type BoxedMessageStream = BoxStream<'static, MessageStreamItem>; pub use risingwave_common::util::epoch::task_local::{curr_epoch, epoch, prev_epoch}; use risingwave_pb::stream_plan::throttle_mutation::RateLimit; -pub trait MessageStream = futures::Stream + Send; +pub trait MessageStreamInner = Stream> + Send; +pub trait MessageStream = Stream + Send; +pub trait DispatcherMessageStream = Stream + Send; /// Static information of an executor. 
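On the message-type changes in this hunk: `MessageInner`/`BarrierInner` are now generic over the mutation payload, so messages that cross the exchange carry `()` (`DispatcherMessage`) while locally handled messages carry the full mutation, and `map_mutation` converts between the two. A reduced illustration with stand-in types (`Msg`, the string mutation, and the epoch field are invented for the example, not the real structs):

// Illustrative reduction of the mutation-type parameter; not the real types.
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum Msg<M> {
    Chunk(u64), // stand-in for a data chunk
    Barrier { epoch: u64, mutation: M },
}

impl<M> Msg<M> {
    fn map_mutation<M2>(self, f: impl FnOnce(M) -> M2) -> Msg<M2> {
        match self {
            Msg::Chunk(c) => Msg::Chunk(c),
            Msg::Barrier { epoch, mutation } => Msg::Barrier { epoch, mutation: f(mutation) },
        }
    }
}

fn main() {
    // A dispatcher-side message carries `()`; the receiving side fills in the mutation it knows.
    let dispatched: Msg<()> = Msg::Barrier { epoch: 42, mutation: () };
    let local: Msg<Option<&'static str>> = dispatched.map_mutation(|()| Some("update-dispatchers"));
    assert_eq!(local, Msg::Barrier { epoch: 42, mutation: Some("update-dispatchers") });
}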
#[derive(Debug, Default, Clone)] @@ -913,6 +918,16 @@ impl BarrierInner { tracing_context: TracingContext::from_protobuf(&prost.tracing_context), }) } + + pub fn map_mutation(self, f: impl FnOnce(M) -> M2) -> BarrierInner { + BarrierInner { + epoch: self.epoch, + mutation: f(self.mutation), + kind: self.kind, + tracing_context: self.tracing_context, + passed_actors: self.passed_actors, + } + } } impl DispatcherBarrier { @@ -1017,6 +1032,16 @@ pub enum MessageInner { Watermark(Watermark), } +impl MessageInner { + pub fn map_mutation(self, f: impl FnOnce(M) -> M2) -> MessageInner { + match self { + MessageInner::Chunk(chunk) => MessageInner::Chunk(chunk), + MessageInner::Barrier(barrier) => MessageInner::Barrier(barrier.map_mutation(f)), + MessageInner::Watermark(watermark) => MessageInner::Watermark(watermark), + } + } +} + pub type Message = MessageInner; pub type DispatcherMessage = MessageInner<()>; @@ -1102,9 +1127,9 @@ pub type PkIndicesRef<'a> = &'a [usize]; pub type PkDataTypes = SmallVec<[DataType; 1]>; /// Expect the first message of the given `stream` as a barrier. -pub async fn expect_first_barrier( - stream: &mut (impl MessageStream + Unpin), -) -> StreamExecutorResult { +pub async fn expect_first_barrier( + stream: &mut (impl MessageStreamInner + Unpin), +) -> StreamExecutorResult> { let message = stream .next() .instrument_await("expect_first_barrier") diff --git a/src/stream/src/executor/nested_loop_temporal_join.rs b/src/stream/src/executor/nested_loop_temporal_join.rs index 0888d8981fc8c..55d21b468a777 100644 --- a/src/stream/src/executor/nested_loop_temporal_join.rs +++ b/src/stream/src/executor/nested_loop_temporal_join.rs @@ -98,8 +98,7 @@ async fn phase1_handle_chunk( } impl NestedLoopTemporalJoinExecutor { - #[allow(clippy::too_many_arguments)] - #[expect(dead_code)] + #[expect(clippy::too_many_arguments)] pub fn new( ctx: ActorContextRef, info: ExecutorInfo, diff --git a/src/stream/src/executor/rearranged_chain.rs b/src/stream/src/executor/rearranged_chain.rs index 37717d270d90e..d70d6c2955c3a 100644 --- a/src/stream/src/executor/rearranged_chain.rs +++ b/src/stream/src/executor/rearranged_chain.rs @@ -17,7 +17,7 @@ use futures::stream; use futures::stream::select_with_strategy; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; /// `ChainExecutor` is an executor that enables synchronization between the existing stream and /// newly appended executors. Currently, `ChainExecutor` is mainly used to implement MV on MV @@ -31,7 +31,7 @@ pub struct RearrangedChainExecutor { upstream: Executor, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, actor_id: ActorId, } @@ -74,7 +74,11 @@ impl RearrangedMessage { } impl RearrangedChainExecutor { - pub fn new(snapshot: Executor, upstream: Executor, progress: CreateMviewProgress) -> Self { + pub fn new( + snapshot: Executor, + upstream: Executor, + progress: CreateMviewProgressReporter, + ) -> Self { Self { snapshot, upstream, diff --git a/src/stream/src/executor/receiver.rs b/src/stream/src/executor/receiver.rs index 6cabb79388333..9a99e59214bd5 100644 --- a/src/stream/src/executor/receiver.rs +++ b/src/stream/src/executor/receiver.rs @@ -231,6 +231,35 @@ mod tests { let (upstream_fragment_id, fragment_id) = (10, 18); + // 4. Send a configuration change barrier. + let merge_updates = maplit::hashmap! 
{ + (actor_id, upstream_fragment_id) => MergeUpdate { + actor_id, + upstream_fragment_id, + new_upstream_fragment_id: None, + added_upstream_actor_id: vec![new], + removed_upstream_actor_id: vec![old], + } + }; + + let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( + UpdateMutation { + dispatchers: Default::default(), + merges: merge_updates, + vnode_bitmaps: Default::default(), + dropped_actors: Default::default(), + actor_splits: Default::default(), + actor_new_dispatchers: Default::default(), + }, + )); + + barrier_test_env.inject_barrier(&b1, [actor_id]); + barrier_test_env + .shared_context + .local_barrier_manager + .flush_all_events() + .await; + let input = new_input( &ctx, metrics.clone(), @@ -297,30 +326,6 @@ mod tests { recv!().unwrap().as_chunk().unwrap(); // We should be able to receive the chunk. assert_recv_pending!(); - // 4. Send a configuration change barrier. - let merge_updates = maplit::hashmap! { - (actor_id, upstream_fragment_id) => MergeUpdate { - actor_id, - upstream_fragment_id, - new_upstream_fragment_id: None, - added_upstream_actor_id: vec![new], - removed_upstream_actor_id: vec![old], - } - }; - - let b1 = Barrier::new_test_barrier(test_epoch(1)).with_mutation(Mutation::Update( - UpdateMutation { - dispatchers: Default::default(), - merges: merge_updates, - vnode_bitmaps: Default::default(), - dropped_actors: Default::default(), - actor_splits: Default::default(), - actor_new_dispatchers: Default::default(), - }, - )); - - barrier_test_env.inject_barrier(&b1, [actor_id]); - send!([new], Message::Barrier(b1.clone().into_dispatcher())); assert_recv_pending!(); // We should not receive the barrier, as new is not the upstream. diff --git a/src/stream/src/executor/row_id_gen.rs b/src/stream/src/executor/row_id_gen.rs index 1fcb85c26f88e..5465a1b54ec2e 100644 --- a/src/stream/src/executor/row_id_gen.rs +++ b/src/stream/src/executor/row_id_gen.rs @@ -134,13 +134,16 @@ mod tests { #[tokio::test] async fn test_row_id_gen_executor() { + // This test only works when vnode count is 256. 
+ assert_eq!(VirtualNode::COUNT_FOR_TEST, 256); + let schema = Schema::new(vec![ Field::unnamed(DataType::Serial), Field::unnamed(DataType::Int64), ]); let pk_indices = vec![0]; let row_id_index = 0; - let row_id_generator = Bitmap::ones(VirtualNode::COUNT); + let row_id_generator = Bitmap::ones(VirtualNode::COUNT_FOR_TEST); let (mut tx, upstream) = MockSource::channel(); let upstream = upstream.into_executor(schema.clone(), pk_indices.clone()); diff --git a/src/stream/src/executor/simple_agg.rs b/src/stream/src/executor/simple_agg.rs index a08049268e5b4..fdecd5b7a4502 100644 --- a/src/stream/src/executor/simple_agg.rs +++ b/src/stream/src/executor/simple_agg.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use risingwave_common::array::stream_record::Record; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::ZipEqFast; use risingwave_expr::aggregate::{build_retractable, AggCall, BoxedAggregateFunction}; @@ -83,6 +84,10 @@ struct ExecutorInner { /// Extreme state cache size extreme_cache_size: usize, + + /// Required by the downstream `RowMergeExecutor`, + /// currently only used by the `approx_percentile`'s two phase plan + must_output_per_barrier: bool, } impl ExecutorInner { @@ -129,6 +134,7 @@ impl SimpleAggExecutor { distinct_dedup_tables: args.distinct_dedup_tables, watermark_epoch: args.watermark_epoch, extreme_cache_size: args.extreme_cache_size, + must_output_per_barrier: args.extra.must_output_per_barrier, }, }) } @@ -201,7 +207,16 @@ impl SimpleAggExecutor { .agg_group .build_change(&this.storages, &this.agg_funcs) .await? - .map(|change| change.to_stream_chunk(&this.info.schema.data_types())); + .and_then(|change| { + if !this.must_output_per_barrier { + if let Record::Update { old_row, new_row } = &change { + if old_row == new_row { + return None; + } + }; + } + Some(change.to_stream_chunk(&this.info.schema.data_types())) + }); // Commit all state tables. futures::future::try_join_all(this.all_state_tables_mut().map(|table| table.commit(epoch))) @@ -343,6 +358,7 @@ mod tests { 0, vec![2], 1, + false, ) .await; let mut simple_agg = simple_agg.execute(); @@ -431,6 +447,7 @@ mod tests { 0, vec![2], 1, + true, ) .await; let mut simple_agg = simple_agg.execute(); @@ -481,4 +498,80 @@ mod tests { Message::Barrier { .. } ); } + + // NOTE(kwannoel): `approx_percentile` + `keyed_merge` depend on this property for correctness. 
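The property the NOTE above refers to: unless `must_output_per_barrier` is set, an `Update` change whose old and new rows are identical is suppressed instead of being emitted downstream. A minimal sketch of that check with a hypothetical, simplified `Record` type (the executor uses `risingwave_common::array::stream_record::Record`):

#[allow(dead_code)]
enum Record<R> {
    Insert { new_row: R },
    Delete { old_row: R },
    Update { old_row: R, new_row: R },
}

fn filter_noop<R: PartialEq>(change: Record<R>, must_output_per_barrier: bool) -> Option<Record<R>> {
    if !must_output_per_barrier {
        if let Record::Update { old_row, new_row } = &change {
            if old_row == new_row {
                // Nothing observable changed in this epoch; emit no chunk.
                return None;
            }
        }
    }
    Some(change)
}

fn main() {
    // An insert retracted within the same epoch leaves the aggregate result unchanged.
    assert!(filter_noop(Record::Update { old_row: 7, new_row: 7 }, false).is_none());
    // The two-phase approx_percentile plan still needs one row per barrier.
    assert!(filter_noop(Record::Update { old_row: 7, new_row: 7 }, true).is_some());
}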
+ #[tokio::test] + async fn test_simple_aggregation_omit_noop_update() { + let store = MemoryStateStore::new(); + let schema = Schema { + fields: vec![ + Field::unnamed(DataType::Int64), + Field::unnamed(DataType::Int64), + // primary key column` + Field::unnamed(DataType::Int64), + ], + }; + let (mut tx, source) = MockSource::channel(); + let source = source.into_executor(schema, vec![2]); + // initial barrier + tx.push_barrier(test_epoch(1), false); + // next barrier + tx.push_barrier(test_epoch(2), false); + tx.push_chunk(StreamChunk::from_pretty( + " I I I + + 100 200 1001 + - 100 200 1001", + )); + tx.push_barrier(test_epoch(3), false); + tx.push_barrier(test_epoch(4), false); + + let agg_calls = vec![ + AggCall::from_pretty("(count:int8)"), + AggCall::from_pretty("(sum:int8 $0:int8)"), + AggCall::from_pretty("(sum:int8 $1:int8)"), + AggCall::from_pretty("(min:int8 $0:int8)"), + ]; + + let simple_agg = new_boxed_simple_agg_executor( + ActorContext::for_test(123), + store, + source, + false, + agg_calls, + 0, + vec![2], + 1, + false, + ) + .await; + let mut simple_agg = simple_agg.execute(); + + // Consume the init barrier + simple_agg.next().await.unwrap().unwrap(); + // Consume stream chunk + let msg = simple_agg.next().await.unwrap().unwrap(); + assert_eq!( + *msg.as_chunk().unwrap(), + StreamChunk::from_pretty( + " I I I I + + 0 . . . " + ) + ); + assert_matches!( + simple_agg.next().await.unwrap().unwrap(), + Message::Barrier { .. } + ); + + // No stream chunk + assert_matches!( + simple_agg.next().await.unwrap().unwrap(), + Message::Barrier { .. } + ); + + // No stream chunk + assert_matches!( + simple_agg.next().await.unwrap().unwrap(), + Message::Barrier { .. } + ); + } } diff --git a/src/stream/src/executor/source/source_backfill_executor.rs b/src/stream/src/executor/source/source_backfill_executor.rs index 9c3336878f952..3f2cd83aca286 100644 --- a/src/stream/src/executor/source/source_backfill_executor.rs +++ b/src/stream/src/executor/source/source_backfill_executor.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; +use std::sync::Once; use std::time::Instant; use anyhow::anyhow; @@ -27,8 +28,10 @@ use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_common::types::JsonbVal; use risingwave_connector::source::reader::desc::{SourceDesc, SourceDescBuilder}; use risingwave_connector::source::{ - BoxChunkSourceStream, SourceContext, SourceCtrlOpts, SplitId, SplitImpl, SplitMetaData, + BackfillInfo, BoxChunkSourceStream, SourceContext, SourceCtrlOpts, SplitId, SplitImpl, + SplitMetaData, }; +use risingwave_hummock_sdk::HummockReadEpoch; use serde::{Deserialize, Serialize}; use thiserror_ext::AsReport; @@ -39,10 +42,12 @@ use crate::common::rate_limit::limited_chunk_size; use crate::executor::prelude::*; use crate::executor::source::source_executor::WAIT_BARRIER_MULTIPLE_TIMES; use crate::executor::UpdateMutation; +use crate::task::CreateMviewProgressReporter; #[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] pub enum BackfillState { /// `None` means not started yet. It's the initial state. + /// XXX: perhaps we can also set to low-watermark instead of `None` Backfilling(Option), /// Backfill is stopped at this offset (inclusive). Source needs to filter out messages before this offset. SourceCachingUp(String), @@ -86,10 +91,14 @@ pub struct SourceBackfillExecutorInner { /// Rate limit in rows/s. rate_limit_rps: Option, + + progress: CreateMviewProgressReporter, } /// Local variables used in the backfill stage. 
/// +/// See for a state diagram about how it works. +/// /// Note: all off the fields should contain all available splits, and we can `unwrap()` safely when `get()`. #[derive(Debug)] struct BackfillStage { @@ -99,8 +108,8 @@ struct BackfillStage { /// Note: the offsets are not updated. Should use `state`'s offset to update before using it (`get_latest_unfinished_splits`). splits: Vec, /// The latest offset from upstream (inclusive). After we reach this offset, we can stop backfilling. - /// TODO: initialize this with high watermark so that we can finish backfilling even when upstream - /// doesn't emit any data. + /// This is initialized with the latest available offset in the connector (if the connector provides the ability to fetch it) + /// so that we can finish backfilling even when upstream doesn't emit any data. target_offsets: HashMap>, } @@ -226,6 +235,7 @@ impl BackfillStage { } impl SourceBackfillExecutorInner { + #[expect(clippy::too_many_arguments)] pub fn new( actor_ctx: ActorContextRef, info: ExecutorInfo, @@ -234,6 +244,7 @@ impl SourceBackfillExecutorInner { system_params: SystemParamsReaderRef, backfill_state_store: BackfillStateTableHandler, rate_limit_rps: Option, + progress: CreateMviewProgressReporter, ) -> Self { let source_split_change_count = metrics .source_split_change_count @@ -243,6 +254,7 @@ impl SourceBackfillExecutorInner { &actor_ctx.id.to_string(), &actor_ctx.fragment_id.to_string(), ]); + Self { actor_ctx, info, @@ -252,6 +264,7 @@ impl SourceBackfillExecutorInner { source_split_change_count, system_params, rate_limit_rps, + progress, } } @@ -259,7 +272,7 @@ impl SourceBackfillExecutorInner { &self, source_desc: &SourceDesc, splits: Vec, - ) -> StreamExecutorResult { + ) -> StreamExecutorResult<(BoxChunkSourceStream, HashMap)> { let column_ids = source_desc .columns .iter() @@ -278,12 +291,22 @@ impl SourceBackfillExecutorInner { source_desc.source.config.clone(), None, ); - let stream = source_desc + + // We will check watermark to decide whether we need to backfill. + // e.g., when there's a Kafka topic-partition without any data, + // we don't need to backfill at all. But if we do not check here, + // the executor can only know it's finished when data coming in. + // For blocking DDL, this would be annoying. + + let (stream, backfill_info) = source_desc .source - .build_stream(Some(splits), column_ids, Arc::new(source_ctx)) + .build_stream_for_backfill(Some(splits), column_ids, Arc::new(source_ctx)) .await .map_err(StreamExecutorError::connector_error)?; - Ok(apply_rate_limit(stream, self.rate_limit_rps).boxed()) + Ok(( + apply_rate_limit(stream, self.rate_limit_rps).boxed(), + backfill_info, + )) } #[try_stream(ok = Message, error = StreamExecutorError)] @@ -332,18 +355,30 @@ impl SourceBackfillExecutorInner { splits: owned_splits, }; backfill_stage.debug_assert_consistent(); - tracing::debug!(?backfill_stage, "source backfill started"); // Return the ownership of `stream_source_core` to the source executor. 
self.stream_source_core = core; - let source_chunk_reader = self + let (source_chunk_reader, backfill_info) = self .build_stream_source_reader( &source_desc, backfill_stage.get_latest_unfinished_splits()?, ) .instrument_await("source_build_reader") .await?; + for (split_id, info) in &backfill_info { + match info { + BackfillInfo::NoDataToBackfill => { + *backfill_stage.states.get_mut(split_id).unwrap() = BackfillState::Finished; + } + BackfillInfo::HasDataToBackfill { latest_offset } => { + // Note: later we will override it with the offset from the source message, and it's possible to become smaller than this value. + *backfill_stage.target_offsets.get_mut(split_id).unwrap() = + Some(latest_offset.clone()); + } + } + } + tracing::debug!(?backfill_stage, "source backfill started"); fn select_strategy(_: &mut ()) -> PollNext { futures::stream::PollNext::Left @@ -381,9 +416,23 @@ impl SourceBackfillExecutorInner { pause_reader!(); } + let state_store = self.backfill_state_store.state_store.state_store().clone(); + static STATE_TABLE_INITIALIZED: Once = Once::new(); + tokio::spawn(async move { + // This is for self.backfill_finished() to be safe. + // We wait for 1st epoch's curr, i.e., the 2nd epoch's prev. + let epoch = barrier.epoch.curr; + tracing::info!("waiting for epoch: {}", epoch); + state_store + .try_wait_epoch(HummockReadEpoch::Committed(epoch)) + .await + .expect("failed to wait epoch"); + STATE_TABLE_INITIALIZED.call_once(|| ()); + tracing::info!("finished waiting for epoch: {}", epoch); + }); yield Message::Barrier(barrier); - if !self.backfill_finished(&backfill_stage.states).await? { + { let source_backfill_row_count = self .metrics .source_backfill_row_count @@ -422,7 +471,7 @@ impl SourceBackfillExecutorInner { self.actor_ctx.fragment_id.to_string(), ]); - let reader = self + let (reader, _backfill_info) = self .build_stream_source_reader( &source_desc, backfill_stage.get_latest_unfinished_splits()?, @@ -504,7 +553,7 @@ impl SourceBackfillExecutorInner { ); // Replace the source reader with a new one of the new state. - let reader = self + let (reader, _backfill_info) = self .build_stream_source_reader( &source_desc, latest_unfinished_splits, @@ -526,10 +575,26 @@ impl SourceBackfillExecutorInner { .commit(barrier.epoch) .await?; - yield Message::Barrier(barrier); - - if self.backfill_finished(&backfill_stage.states).await? { - break 'backfill_loop; + if self.should_report_finished(&backfill_stage.states) { + // TODO: use a specialized progress for source + // Currently, `CreateMviewProgress` is designed for MV backfill, and rw_ddl_progress calculates + // progress based on the number of consumed rows and an estimated total number of rows from hummock. + // For now, we just rely on the same code path, and for source backfill, the progress will always be 99.99%. + tracing::info!("progress finish"); + let epoch = barrier.epoch; + self.progress.finish(epoch, 114514); + // yield barrier after reporting progress + yield Message::Barrier(barrier); + + // After we reported finished, we still don't exit the loop. + // Because we need to handle split migration. + if STATE_TABLE_INITIALIZED.is_completed() + && self.backfill_finished(&backfill_stage.states).await? + { + break 'backfill_loop; + } + } else { + yield Message::Barrier(barrier); } } Message::Chunk(chunk) => { @@ -602,6 +667,15 @@ impl SourceBackfillExecutorInner { } let mut splits: HashSet = backfill_stage.states.keys().cloned().collect(); + // Make sure `Finished` state is persisted. 
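To make the two predicates used around this barrier handling concrete: progress is reported (letting a blocking CREATE return) once every split is `Finished` or `SourceCachingUp`, while the backfill loop only exits once every split is `Finished`; the real `backfill_finished` additionally consults the persisted state of other actors, which the reduced sketch below omits (standalone types, not the executor's):

use std::collections::HashMap;

#[allow(dead_code)]
enum BackfillState {
    Backfilling(Option<String>),
    SourceCachingUp(String),
    Finished,
}

type BackfillStates = HashMap<&'static str, BackfillState>;

// Report finished as soon as no split still needs historical data,
// even if some are still catching up with cached upstream messages.
fn should_report_finished(states: &BackfillStates) -> bool {
    states
        .values()
        .all(|s| matches!(s, BackfillState::Finished | BackfillState::SourceCachingUp(_)))
}

// Only leave the backfill stage when every split has fully finished.
fn backfill_finished(states: &BackfillStates) -> bool {
    states.values().all(|s| matches!(s, BackfillState::Finished))
}

fn main() {
    let states: BackfillStates = HashMap::from([
        ("split-0", BackfillState::Finished),
        ("split-1", BackfillState::SourceCachingUp("offset-42".into())),
    ]);
    assert!(should_report_finished(&states));
    assert!(!backfill_finished(&states)); // split-1 must reach Finished first
}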
+ self.backfill_state_store + .set_states( + splits + .iter() + .map(|s| (s.clone(), BackfillState::Finished)) + .collect(), + ) + .await?; // All splits finished backfilling. Now we only forward the source data. #[for_await] @@ -630,7 +704,7 @@ impl SourceBackfillExecutorInner { self.apply_split_change_forward_stage( actor_splits, &mut splits, - true, + false, ) .await?; } @@ -653,11 +727,34 @@ impl SourceBackfillExecutorInner { } } - /// All splits finished backfilling. + /// When we should call `progress.finish()` to let blocking DDL return. + /// We report as soon as `SourceCachingUp`. Otherwise the DDL might be blocked forever until upstream messages come. + /// + /// Note: split migration (online scaling) is related with progress tracking. + /// - For foreground DDL, scaling is not allowed before progress is finished. + /// - For background DDL, scaling is skipped when progress is not finished, and can be triggered by recreating actors during recovery. + /// + /// See for more details. + fn should_report_finished(&self, states: &BackfillStates) -> bool { + states.values().all(|state| { + matches!( + state, + BackfillState::Finished | BackfillState::SourceCachingUp(_) + ) + }) + } + + /// All splits entered `Finished` state. /// /// We check all splits for the source, including other actors' splits here, before going to the forward stage. - /// Otherwise if we break early, but after rescheduling, an unfinished split is migrated to + /// Otherwise if we `break` early, but after rescheduling, an unfinished split is migrated to /// this actor, we still need to backfill it. + /// + /// Note: at the beginning, the actor will only read the state written by itself. + /// It needs to _wait until it can read all actors' written data_. + /// i.e., wait for the first checkpoint has been available. + /// + /// See for more details. async fn backfill_finished(&self, states: &BackfillStates) -> StreamExecutorResult { Ok(states .values() @@ -726,7 +823,6 @@ impl SourceBackfillExecutorInner { } Some(backfill_state) => { // Migrated split. Backfill if unfinished. - // TODO: disallow online scaling during backfilling. 
target_state.insert(split_id, backfill_state); } } diff --git a/src/stream/src/executor/source/source_backfill_state_table.rs b/src/stream/src/executor/source/source_backfill_state_table.rs index be9abe8490e63..3579aff2ec4fb 100644 --- a/src/stream/src/executor/source/source_backfill_state_table.rs +++ b/src/stream/src/executor/source/source_backfill_state_table.rs @@ -76,6 +76,7 @@ impl BackfillStateTableHandler { }; ret.push(state); } + tracing::trace!("scan SourceBackfill state table: {:?}", ret); Ok(ret) } diff --git a/src/stream/src/executor/stream_reader.rs b/src/stream/src/executor/stream_reader.rs index 30de0804b0ac0..bd22e47c737ad 100644 --- a/src/stream/src/executor/stream_reader.rs +++ b/src/stream/src/executor/stream_reader.rs @@ -16,7 +16,7 @@ use std::pin::Pin; use std::task::Poll; use either::Either; -use futures::stream::{select_with_strategy, BoxStream, PollNext, SelectWithStrategy}; +use futures::stream::BoxStream; use futures::{Stream, StreamExt, TryStreamExt}; use crate::executor::error::StreamExecutorResult; @@ -25,8 +25,34 @@ use crate::executor::Message; type ExecutorMessageStream = BoxStream<'static, StreamExecutorResult>; type StreamReaderData = StreamExecutorResult>; type ReaderArm = BoxStream<'static, StreamReaderData>; -type StreamReaderWithPauseInner = - SelectWithStrategy, ReaderArm, impl FnMut(&mut PollNext) -> PollNext, PollNext>; + +mod stream_reader_with_pause { + use futures::stream::{select_with_strategy, PollNext, SelectWithStrategy}; + + use crate::executor::stream_reader::ReaderArm; + + pub(super) type StreamReaderWithPauseInner = SelectWithStrategy< + ReaderArm, + ReaderArm, + impl FnMut(&mut PollNext) -> PollNext, + PollNext, + >; + + pub(super) fn new_inner( + message_stream: ReaderArm, + data_stream: ReaderArm, + ) -> StreamReaderWithPauseInner { + let strategy = if BIASED { + |_: &mut PollNext| PollNext::Left + } else { + // The poll strategy is not biased: we poll the two streams in a round robin way. + |last: &mut PollNext| last.toggle() + }; + select_with_strategy(message_stream, data_stream, strategy) + } +} + +use stream_reader_with_pause::*; /// [`StreamReaderWithPause`] merges two streams, with one receiving barriers (and maybe other types /// of messages) and the other receiving data only (no barrier). The merged stream can be paused @@ -40,7 +66,7 @@ type StreamReaderWithPauseInner = /// priority over the right-hand one. Otherwise, the two streams will be polled in a round robin /// fashion. pub(super) struct StreamReaderWithPause { - inner: StreamReaderWithPauseInner, + inner: StreamReaderWithPauseInner, /// Whether the source stream is paused. paused: bool, } @@ -54,26 +80,13 @@ impl StreamReaderWithPause { ) -> Self { let message_stream_arm = message_stream.map_ok(Either::Left).boxed(); let data_stream_arm = data_stream.map_ok(Either::Right).boxed(); - let inner = Self::new_inner(message_stream_arm, data_stream_arm); + let inner = new_inner(message_stream_arm, data_stream_arm); Self { inner, paused: false, } } - fn new_inner( - message_stream: ReaderArm, - data_stream: ReaderArm, - ) -> StreamReaderWithPauseInner { - let strategy = if BIASED { - |_: &mut PollNext| PollNext::Left - } else { - // The poll strategy is not biased: we poll the two streams in a round robin way. - |last: &mut PollNext| last.toggle() - }; - select_with_strategy(message_stream, data_stream, strategy) - } - /// Replace the data stream with a new one for given `stream`. Used for split change. 
pub fn replace_data_stream( &mut self, @@ -87,7 +100,7 @@ impl StreamReaderWithPause { // Note: create a new `SelectWithStrategy` instead of replacing the source stream arm here, // to ensure the internal state of the `SelectWithStrategy` is reset. (#6300) - self.inner = Self::new_inner( + self.inner = new_inner( barrier_receiver_arm, data_stream.map_ok(Either::Right).boxed(), ); diff --git a/src/stream/src/executor/test_utils.rs b/src/stream/src/executor/test_utils.rs index db024411ea0ad..4744bae374bfb 100644 --- a/src/stream/src/executor/test_utils.rs +++ b/src/stream/src/executor/test_utils.rs @@ -515,6 +515,7 @@ pub mod agg_executor { row_count_index: usize, pk_indices: PkIndices, executor_id: u64, + must_output_per_barrier: bool, ) -> Executor { let storages = future::join_all(agg_calls.iter().enumerate().map(|(idx, agg_call)| { create_agg_state_storage( @@ -560,7 +561,9 @@ pub mod agg_executor { intermediate_state_table, distinct_dedup_tables: Default::default(), watermark_epoch: Arc::new(AtomicU64::new(0)), - extra: SimpleAggExecutorExtraArgs {}, + extra: SimpleAggExecutorExtraArgs { + must_output_per_barrier, + }, }) .unwrap(); (info, exec).into() diff --git a/src/stream/src/executor/values.rs b/src/stream/src/executor/values.rs index 83da0ff68a7d5..89946d9dc94e6 100644 --- a/src/stream/src/executor/values.rs +++ b/src/stream/src/executor/values.rs @@ -21,7 +21,7 @@ use risingwave_expr::expr::NonStrictExpression; use tokio::sync::mpsc::UnboundedReceiver; use crate::executor::prelude::*; -use crate::task::CreateMviewProgress; +use crate::task::CreateMviewProgressReporter; const DEFAULT_CHUNK_SIZE: usize = 1024; @@ -33,7 +33,7 @@ pub struct ValuesExecutor { schema: Schema, // Receiver of barrier channel. barrier_receiver: UnboundedReceiver, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, rows: vec::IntoIter>, } @@ -43,7 +43,7 @@ impl ValuesExecutor { pub fn new( ctx: ActorContextRef, schema: Schema, - progress: CreateMviewProgress, + progress: CreateMviewProgressReporter, rows: Vec>, barrier_receiver: UnboundedReceiver, ) -> Self { @@ -150,12 +150,12 @@ mod tests { use super::ValuesExecutor; use crate::executor::test_utils::StreamExecutorTestExt; use crate::executor::{ActorContext, AddMutation, Barrier, Execute, Mutation}; - use crate::task::{CreateMviewProgress, LocalBarrierManager}; + use crate::task::{CreateMviewProgressReporter, LocalBarrierManager}; #[tokio::test] async fn test_values() { let barrier_manager = LocalBarrierManager::for_test(); - let progress = CreateMviewProgress::for_test(barrier_manager); + let progress = CreateMviewProgressReporter::for_test(barrier_manager); let actor_id = progress.actor_id(); let (tx, barrier_receiver) = unbounded_channel(); let value = StructValue::new(vec![Some(1.into()), Some(2.into()), Some(3.into())]); diff --git a/src/stream/src/executor/watermark_filter.rs b/src/stream/src/executor/watermark_filter.rs index 8f8b166626d21..01497c37fdab5 100644 --- a/src/stream/src/executor/watermark_filter.rs +++ b/src/stream/src/executor/watermark_filter.rs @@ -13,7 +13,6 @@ // limitations under the License. 
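Both `stream_reader.rs` above and `exchange/input.rs` earlier in this patch apply the same refactor: the `type X = impl Trait` alias is moved into a small private module next to the one function that defines it, so the opaque type has a single, tightly scoped defining site. A standalone illustration of the pattern, assuming a nightly toolchain with `type_alias_impl_trait` (module, stream, and field names are invented, and the exact defining-scope rules have shifted across nightlies):

#![feature(type_alias_impl_trait)]

mod pausable {
    // The opaque alias and the only function allowed to define it live together,
    // so nothing else in the crate can accidentally constrain the hidden type.
    pub(super) type PausableStream = impl Iterator<Item = u32>;

    pub(super) fn new_inner(biased: bool) -> PausableStream {
        let step: u32 = if biased { 1 } else { 2 };
        (0u32..10).map(move |i| i * step)
    }
}

use pausable::PausableStream;

struct ReaderWithPause {
    inner: PausableStream,
    paused: bool,
}

fn main() {
    let reader = ReaderWithPause { inner: pausable::new_inner(true), paused: false };
    assert!(!reader.paused);
    let first: Vec<u32> = reader.inner.take(3).collect();
    assert_eq!(first, vec![0, 1, 2]);
}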
use std::cmp; -use std::ops::Deref; use futures::future::{try_join, try_join_all}; use risingwave_common::hash::VnodeBitmapExt; @@ -27,7 +26,6 @@ use risingwave_expr::Result as ExprResult; use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_pb::expr::expr_node::Type; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use super::filter::FilterExecutor; use crate::executor::prelude::*; @@ -219,10 +217,7 @@ impl WatermarkFilterExecutor { let mut need_update_global_max_watermark = false; // Update the vnode bitmap for state tables of all agg calls if asked. if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(ctx.id) { - let other_vnodes_bitmap = Arc::new( - (!(*vnode_bitmap).clone()) - & TableDistribution::all_vnodes_ref().deref(), - ); + let other_vnodes_bitmap = Arc::new(!(*vnode_bitmap).clone()); let _ = global_watermark_table.update_vnode_bitmap(other_vnodes_bitmap); let (previous_vnode_bitmap, _cache_may_stale) = table.update_vnode_bitmap(vnode_bitmap.clone()); @@ -373,7 +368,9 @@ impl WatermarkFilterExecutor { #[cfg(test)] mod tests { use itertools::Itertools; + use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, TableDesc}; + use risingwave_common::hash::VirtualNode; use risingwave_common::test_prelude::StreamChunkTestExt; use risingwave_common::types::Date; use risingwave_common::util::epoch::test_epoch; @@ -431,7 +428,7 @@ mod tests { let state_table = StateTable::from_table_catalog_inconsistent_op( &table, mem_state.clone(), - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into()), ) .await; @@ -440,7 +437,7 @@ mod tests { let storage_table = StorageTable::new_partial( mem_state, val_indices.iter().map(|i| ColumnId::new(*i as _)).collect(), - Some(TableDistribution::all_vnodes()), + Some(Bitmap::ones(VirtualNode::COUNT_FOR_TEST).into()), &desc, ); (storage_table, state_table) diff --git a/src/stream/src/from_proto/asof_join.rs b/src/stream/src/from_proto/asof_join.rs new file mode 100644 index 0000000000000..3d74ac884b4f0 --- /dev/null +++ b/src/stream/src/from_proto/asof_join.rs @@ -0,0 +1,192 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
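On the watermark-filter change just above: once the owned-vnode bitmap is already sized to the table's vnode count, the vnodes belonging to other actors are simply its bitwise complement, so the extra AND with an all-ones mask adds nothing. Sketched with a plain `Vec<bool>` stand-in (the real code uses `risingwave_common::bitmap::Bitmap` and its `!` operator):

fn other_vnodes(own: &[bool]) -> Vec<bool> {
    own.iter().map(|&owned| !owned).collect()
}

fn main() {
    let own = vec![true, false, true, false]; // this actor owns vnodes 0 and 2
    let other = other_vnodes(&own);
    assert_eq!(other, vec![false, true, false, true]);
    // AND-ing with an all-ones mask of the same length is a no-op:
    let masked: Vec<bool> = other.iter().zip([true; 4]).map(|(o, m)| *o && m).collect();
    assert_eq!(masked, other);
}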
+ +use std::sync::Arc; + +use risingwave_common::hash::{HashKey, HashKeyDispatcher}; +use risingwave_common::types::DataType; +use risingwave_pb::plan_common::AsOfJoinType as JoinTypeProto; +use risingwave_pb::stream_plan::AsOfJoinNode; + +use super::*; +use crate::common::table::state_table::StateTable; +use crate::executor::asof_join::*; +use crate::executor::monitor::StreamingMetrics; +use crate::executor::{ActorContextRef, AsOfDesc, AsOfJoinType, JoinType}; +use crate::task::AtomicU64Ref; + +pub struct AsOfJoinExecutorBuilder; + +impl ExecutorBuilder for AsOfJoinExecutorBuilder { + type Node = AsOfJoinNode; + + async fn new_boxed_executor( + params: ExecutorParams, + node: &Self::Node, + store: impl StateStore, + ) -> StreamResult { + // This assert is to make sure AsOf join can use `JoinChunkBuilder` as Hash join. + assert_eq!(AsOfJoinType::Inner, JoinType::Inner); + assert_eq!(AsOfJoinType::LeftOuter, JoinType::LeftOuter); + let vnodes = Arc::new(params.vnode_bitmap.expect("vnodes not set for AsOf join")); + + let [source_l, source_r]: [_; 2] = params.input.try_into().unwrap(); + + let table_l = node.get_left_table()?; + let degree_table_l = node.get_left_degree_table()?; + + let table_r = node.get_right_table()?; + let degree_table_r = node.get_right_degree_table()?; + + let params_l = JoinParams::new( + node.get_left_key() + .iter() + .map(|key| *key as usize) + .collect_vec(), + node.get_left_deduped_input_pk_indices() + .iter() + .map(|key| *key as usize) + .collect_vec(), + ); + let params_r = JoinParams::new( + node.get_right_key() + .iter() + .map(|key| *key as usize) + .collect_vec(), + node.get_right_deduped_input_pk_indices() + .iter() + .map(|key| *key as usize) + .collect_vec(), + ); + let null_safe = node.get_null_safe().to_vec(); + let output_indices = node + .get_output_indices() + .iter() + .map(|&x| x as usize) + .collect_vec(); + + let join_key_data_types = params_l + .join_key_indices + .iter() + .map(|idx| source_l.schema().fields[*idx].data_type()) + .collect_vec(); + + let state_table_l = + StateTable::from_table_catalog(table_l, store.clone(), Some(vnodes.clone())).await; + let degree_state_table_l = + StateTable::from_table_catalog(degree_table_l, store.clone(), Some(vnodes.clone())) + .await; + + let state_table_r = + StateTable::from_table_catalog(table_r, store.clone(), Some(vnodes.clone())).await; + let degree_state_table_r = + StateTable::from_table_catalog(degree_table_r, store, Some(vnodes)).await; + + let join_type_proto = node.get_join_type()?; + let as_of_desc_proto = node.get_asof_desc()?; + let asof_desc = AsOfDesc::from_protobuf(as_of_desc_proto)?; + + let args = AsOfJoinExecutorDispatcherArgs { + ctx: params.actor_context, + info: params.info.clone(), + source_l, + source_r, + params_l, + params_r, + null_safe, + output_indices, + state_table_l, + degree_state_table_l, + state_table_r, + degree_state_table_r, + lru_manager: params.watermark_epoch, + metrics: params.executor_stats, + join_type_proto, + join_key_data_types, + chunk_size: params.env.config().developer.chunk_size, + high_join_amplification_threshold: params + .env + .config() + .developer + .high_join_amplification_threshold, + asof_desc, + }; + + let exec = args.dispatch()?; + Ok((params.info, exec).into()) + } +} + +struct AsOfJoinExecutorDispatcherArgs { + ctx: ActorContextRef, + info: ExecutorInfo, + source_l: Executor, + source_r: Executor, + params_l: JoinParams, + params_r: JoinParams, + null_safe: Vec, + output_indices: Vec, + state_table_l: StateTable, + 
degree_state_table_l: StateTable, + state_table_r: StateTable, + degree_state_table_r: StateTable, + lru_manager: AtomicU64Ref, + metrics: Arc, + join_type_proto: JoinTypeProto, + join_key_data_types: Vec, + chunk_size: usize, + high_join_amplification_threshold: usize, + asof_desc: AsOfDesc, +} + +impl HashKeyDispatcher for AsOfJoinExecutorDispatcherArgs { + type Output = StreamResult>; + + fn dispatch_impl(self) -> Self::Output { + /// This macro helps to fill the const generic type parameter. + macro_rules! build { + ($join_type:ident) => { + Ok(AsOfJoinExecutor::::new( + self.ctx, + self.info, + self.source_l, + self.source_r, + self.params_l, + self.params_r, + self.null_safe, + self.output_indices, + self.state_table_l, + self.degree_state_table_l, + self.state_table_r, + self.degree_state_table_r, + self.lru_manager, + self.metrics, + self.chunk_size, + self.high_join_amplification_threshold, + self.asof_desc, + ) + .boxed()) + }; + } + match self.join_type_proto { + JoinTypeProto::Unspecified => unreachable!(), + JoinTypeProto::Inner => build!(Inner), + JoinTypeProto::LeftOuter => build!(LeftOuter), + } + } + + fn data_types(&self) -> &[DataType] { + &self.join_key_data_types + } +} diff --git a/src/stream/src/from_proto/mod.rs b/src/stream/src/from_proto/mod.rs index 6f185695eadf7..1f63b6cd5db85 100644 --- a/src/stream/src/from_proto/mod.rs +++ b/src/stream/src/from_proto/mod.rs @@ -16,6 +16,7 @@ mod agg_common; mod append_only_dedup; +mod asof_join; mod barrier_recv; mod batch_query; mod cdc_filter; diff --git a/src/stream/src/from_proto/mview.rs b/src/stream/src/from_proto/mview.rs index 41fc47609fba7..43fc929edf455 100644 --- a/src/stream/src/from_proto/mview.rs +++ b/src/stream/src/from_proto/mview.rs @@ -100,7 +100,7 @@ impl ExecutorBuilder for ArrangeExecutorBuilder { let table = node.get_table()?; // FIXME: Lookup is now implemented without cell-based table API and relies on all vnodes - // being `DEFAULT_VNODE`, so we need to make the Arrange a singleton. + // being `SINGLETON_VNODE`, so we need to make the Arrange a singleton. 
let vnodes = params.vnode_bitmap.map(Arc::new); let conflict_behavior = ConflictBehavior::from_protobuf(&table.handle_pk_conflict_behavior()); diff --git a/src/stream/src/from_proto/simple_agg.rs b/src/stream/src/from_proto/simple_agg.rs index 16809edb8bcaf..689acc7d16a9c 100644 --- a/src/stream/src/from_proto/simple_agg.rs +++ b/src/stream/src/from_proto/simple_agg.rs @@ -54,6 +54,7 @@ impl ExecutorBuilder for SimpleAggExecutorBuilder { let distinct_dedup_tables = build_distinct_dedup_table_from_proto(node.get_distinct_dedup_tables(), store, None) .await; + let must_output_per_barrier = node.get_must_output_per_barrier(); let exec = SimpleAggExecutor::new(AggExecutorArgs { version: node.version(), @@ -70,7 +71,9 @@ impl ExecutorBuilder for SimpleAggExecutorBuilder { intermediate_state_table, distinct_dedup_tables, watermark_epoch: params.watermark_epoch, - extra: SimpleAggExecutorExtraArgs {}, + extra: SimpleAggExecutorExtraArgs { + must_output_per_barrier, + }, })?; Ok((params.info, exec).into()) diff --git a/src/stream/src/from_proto/source_backfill.rs b/src/stream/src/from_proto/source_backfill.rs index ba3ab599af700..65329a26bd40b 100644 --- a/src/stream/src/from_proto/source_backfill.rs +++ b/src/stream/src/from_proto/source_backfill.rs @@ -72,6 +72,9 @@ impl ExecutorBuilder for SourceBackfillExecutorBuilder { source_desc_builder, state_table_handler, ); + let progress = params + .local_barrier_manager + .register_create_mview_progress(params.actor_context.id); let exec = SourceBackfillExecutorInner::new( params.actor_context.clone(), @@ -81,6 +84,7 @@ impl ExecutorBuilder for SourceBackfillExecutorBuilder { params.env.system_params_manager_ref().get_params(), backfill_state_table, node.rate_limit, + progress, ); let [input]: [_; 1] = params.input.try_into().unwrap(); diff --git a/src/stream/src/from_proto/watermark_filter.rs b/src/stream/src/from_proto/watermark_filter.rs index 0081f00cc39e6..4e3147d10853e 100644 --- a/src/stream/src/from_proto/watermark_filter.rs +++ b/src/stream/src/from_proto/watermark_filter.rs @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::ops::Deref; use std::sync::Arc; use risingwave_common::catalog::{ColumnId, TableDesc}; use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::WatermarkFilterNode; use risingwave_storage::table::batch_table::storage_table::StorageTable; -use risingwave_storage::table::TableDistribution; use super::*; use crate::common::table::state_table::StateTable; @@ -57,8 +55,7 @@ impl ExecutorBuilder for WatermarkFilterBuilder { .iter() .map(|i| ColumnId::new(*i as _)) .collect_vec(); - let other_vnodes = - Arc::new((!(*vnodes).clone()) & TableDistribution::all_vnodes_ref().deref()); + let other_vnodes = Arc::new(!(*vnodes).clone()); let global_watermark_table = StorageTable::new_partial(store.clone(), column_ids, Some(other_vnodes), &desc); diff --git a/src/stream/src/lib.rs b/src/stream/src/lib.rs index 876deabc80f98..577b829945620 100644 --- a/src/stream/src/lib.rs +++ b/src/stream/src/lib.rs @@ -17,7 +17,6 @@ #![feature(trait_alias)] #![feature(type_alias_impl_trait)] #![feature(more_qualified_paths)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(hash_extract_if)] #![feature(extract_if)] diff --git a/src/stream/src/task/barrier_manager.rs b/src/stream/src/task/barrier_manager.rs index 88e86a5998758..406459e25c389 100644 --- a/src/stream/src/task/barrier_manager.rs +++ b/src/stream/src/task/barrier_manager.rs @@ -37,8 +37,7 @@ use tonic::{Code, Status}; use self::managed_state::ManagedBarrierState; use crate::error::{IntoUnexpectedExit, StreamError, StreamResult}; use crate::task::{ - ActorHandle, ActorId, AtomicU64Ref, PartialGraphId, SharedContext, StreamEnvironment, - UpDownActorIds, + ActorId, AtomicU64Ref, PartialGraphId, SharedContext, StreamEnvironment, UpDownActorIds, }; mod managed_state; @@ -46,7 +45,7 @@ mod progress; #[cfg(test)] mod tests; -pub use progress::CreateMviewProgress; +pub use progress::CreateMviewProgressReporter; use risingwave_common::catalog::TableId; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::runtime::BackgroundShutdownRuntime; @@ -210,10 +209,6 @@ pub(super) enum LocalActorOperation { handle: ControlStreamHandle, init_request: InitRequest, }, - DropActors { - actors: Vec, - result_sender: oneshot::Sender<()>, - }, TakeReceiver { ids: UpDownActorIds, result_sender: oneshot::Sender>, @@ -228,29 +223,6 @@ pub(super) enum LocalActorOperation { }, } -pub(crate) struct StreamActorManagerState { - /// Each processor runs in a future. Upon receiving a `Terminate` message, they will exit. - /// `handles` store join handles of these futures, and therefore we could wait their - /// termination. - pub(super) handles: HashMap, - - /// Stores all actor information, taken after actor built. - pub(super) actors: HashMap, - - /// Stores all actor tokio runtime monitoring tasks. - pub(super) actor_monitor_tasks: HashMap, -} - -impl StreamActorManagerState { - fn new() -> Self { - Self { - handles: HashMap::new(), - actors: HashMap::new(), - actor_monitor_tasks: HashMap::new(), - } - } -} - pub(crate) struct StreamActorManager { pub(super) env: StreamEnvironment, pub(super) streaming_metrics: Arc, @@ -294,7 +266,7 @@ impl Display for LocalBarrierWorkerDebugInfo<'_> { /// barriers to and collect them from all actors, and finally report the progress. pub(super) struct LocalBarrierWorker { /// Current barrier collection state. - state: ManagedBarrierState, + pub(super) state: ManagedBarrierState, /// Record all unexpected exited actors. 
failure_actors: HashMap, @@ -303,8 +275,6 @@ pub(super) struct LocalBarrierWorker { pub(super) actor_manager: Arc, - pub(super) actor_manager_state: StreamActorManagerState, - pub(super) current_shared_context: Arc, barrier_event_rx: UnboundedReceiver, @@ -328,14 +298,9 @@ impl LocalBarrierWorker { )); Self { failure_actors: HashMap::default(), - state: ManagedBarrierState::new( - actor_manager.env.state_store(), - actor_manager.streaming_metrics.clone(), - actor_manager.await_tree_reg.clone(), - ), + state: ManagedBarrierState::new(actor_manager.clone(), shared_context.clone()), control_stream_handle: ControlStreamHandle::empty(), actor_manager, - actor_manager_state: StreamActorManagerState::new(), current_shared_context: shared_context, barrier_event_rx: event_rx, actor_failure_rx: failure_rx, @@ -345,7 +310,7 @@ impl LocalBarrierWorker { fn to_debug_info(&self) -> LocalBarrierWorkerDebugInfo<'_> { LocalBarrierWorkerDebugInfo { - running_actors: self.actor_manager_state.handles.keys().cloned().collect(), + running_actors: self.state.actor_states.keys().cloned().collect(), managed_barrier_state: self.state.to_debug_info(), has_control_stream_connected: self.control_stream_handle.connected(), } @@ -384,7 +349,7 @@ impl LocalBarrierWorker { }); } LocalActorOperation::Shutdown { result_sender } => { - if !self.actor_manager_state.handles.is_empty() { + if !self.state.actor_states.is_empty() { tracing::warn!( "shutdown with running actors, scaling or migration will be triggered" ); @@ -419,15 +384,9 @@ impl LocalBarrierWorker { Request::InjectBarrier(req) => { let barrier = Barrier::from_protobuf(req.get_barrier().unwrap())?; self.update_actor_info(req.broadcast_info)?; - let actors = req - .actors_to_build - .iter() - .map(|actor| actor.actor.as_ref().unwrap().actor_id) - .collect_vec(); - self.update_actors(req.actors_to_build)?; - self.start_create_actors(&actors)?; self.send_barrier( &barrier, + req.actors_to_build, req.actor_ids_to_collect.into_iter().collect(), req.table_ids_to_sync .into_iter() @@ -484,7 +443,13 @@ impl LocalBarrierWorker { .map_err(|e| (actor_id, e))?; } #[cfg(test)] - LocalBarrierEvent::Flush(sender) => sender.send(()).unwrap(), + LocalBarrierEvent::Flush(sender) => { + use futures::FutureExt; + while let Some(request) = self.control_stream_handle.next_request().now_or_never() { + self.handle_streaming_control_request(request).unwrap(); + } + sender.send(()).unwrap() + } } Ok(()) } @@ -494,13 +459,6 @@ impl LocalBarrierWorker { LocalActorOperation::NewControlStream { .. } | LocalActorOperation::Shutdown { .. } => { unreachable!("event {actor_op} should be handled separately in async context") } - LocalActorOperation::DropActors { - actors, - result_sender, - } => { - self.drop_actors(&actors); - let _ = result_sender.send(()); - } LocalActorOperation::TakeReceiver { ids, result_sender } => { let _ = result_sender.send(self.current_shared_context.take_receiver(ids)); } @@ -596,30 +554,12 @@ impl LocalBarrierWorker { fn send_barrier( &mut self, barrier: &Barrier, + to_build: Vec, to_collect: HashSet, table_ids: HashSet, partial_graph_id: PartialGraphId, actor_ids_to_pre_sync_barrier: HashSet, ) -> StreamResult<()> { - if !cfg!(test) { - // The barrier might be outdated and been injected after recovery in some certain extreme - // scenarios. So some newly creating actors in the barrier are possibly not rebuilt during - // recovery. Check it here and return an error here if some actors are not found to - // avoid collection hang. 
We need some refine in meta side to remove this workaround since - // it will cause another round of unnecessary recovery. - let missing_actor_ids = to_collect - .iter() - .filter(|id| !self.actor_manager_state.handles.contains_key(id)) - .collect_vec(); - if !missing_actor_ids.is_empty() { - tracing::warn!( - "to collect actors not found, they should be cleaned when recovering: {:?}", - missing_actor_ids - ); - return Err(anyhow!("to collect actors not found: {:?}", to_collect).into()); - } - } - if barrier.kind == BarrierKind::Initial { self.actor_manager .watermark_epoch @@ -647,20 +587,12 @@ impl LocalBarrierWorker { self.state.transform_to_issued( barrier, + to_build, to_collect, table_ids, partial_graph_id, actor_ids_to_pre_sync_barrier, )?; - - // Actors to stop should still accept this barrier, but won't get sent to in next times. - if let Some(actors) = barrier.all_stop_actors() { - debug!( - target: "events::stream::barrier::manager", - "remove actors {:?} from senders", - actors - ); - } Ok(()) } diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index 5ccde5004801d..8f4ab2b49ea2e 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -15,7 +15,7 @@ use std::assert_matches::assert_matches; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::{Debug, Display, Formatter}; -use std::future::{poll_fn, Future}; +use std::future::{pending, poll_fn, Future}; use std::mem::replace; use std::sync::Arc; use std::task::{ready, Context, Poll}; @@ -31,17 +31,18 @@ use risingwave_common::must_match; use risingwave_common::util::epoch::EpochPair; use risingwave_hummock_sdk::SyncResult; use risingwave_pb::stream_plan::barrier::BarrierKind; -use risingwave_pb::stream_service::barrier_complete_response::CreateMviewProgress; +use risingwave_pb::stream_service::BuildActorInfo; use risingwave_storage::{dispatch_state_store, StateStore, StateStoreImpl}; use thiserror_ext::AsReport; use tokio::sync::mpsc; +use tokio::task::JoinHandle; use super::progress::BackfillState; use super::{BarrierCompleteResult, SubscribeMutationItem}; use crate::error::{StreamError, StreamResult}; use crate::executor::monitor::StreamingMetrics; use crate::executor::{Barrier, Mutation}; -use crate::task::{await_tree_key, ActorId, PartialGraphId}; +use crate::task::{ActorId, PartialGraphId, SharedContext, StreamActorManager}; struct IssuedState { pub mutation: Option>, @@ -83,12 +84,63 @@ enum ManagedBarrierStateInner { #[derive(Debug)] pub(super) struct BarrierState { - curr_epoch: u64, + barrier: Barrier, inner: ManagedBarrierStateInner, } -type AwaitEpochCompletedFuture = - impl Future)> + 'static; +mod await_epoch_completed_future { + use std::future::Future; + + use futures::future::BoxFuture; + use futures::FutureExt; + use risingwave_hummock_sdk::SyncResult; + use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; + + use crate::error::StreamResult; + use crate::executor::Barrier; + use crate::task::{await_tree_key, BarrierCompleteResult}; + + pub(super) type AwaitEpochCompletedFuture = + impl Future)> + 'static; + + pub(super) fn instrument_complete_barrier_future( + complete_barrier_future: Option>>, + barrier: Barrier, + barrier_await_tree_reg: Option<&await_tree::Registry>, + create_mview_progress: Vec, + ) -> AwaitEpochCompletedFuture { + let prev_epoch = barrier.epoch.prev; + let future = async move { + if let 
Some(future) = complete_barrier_future { + let result = future.await; + result.map(Some) + } else { + Ok(None) + } + } + .map(move |result| { + ( + barrier, + result.map(|sync_result| BarrierCompleteResult { + sync_result, + create_mview_progress, + }), + ) + }); + if let Some(reg) = barrier_await_tree_reg { + reg.register( + await_tree_key::BarrierAwait { prev_epoch }, + format!("SyncEpoch({})", prev_epoch), + ) + .instrument(future) + .left_future() + } else { + future.right_future() + } + } +} + +use await_epoch_completed_future::*; fn sync_epoch( state_store: &S, @@ -192,8 +244,6 @@ impl Display for &'_ PartialGraphManagedBarrierState { } enum InflightActorStatus { - /// The actor is just spawned and not issued any barrier yet - NotStarted, /// The actor has been issued some barriers, but has not collected the first barrier IssuedFirst(Vec), /// The actor has been issued some barriers, and has collected the first barrier @@ -201,12 +251,11 @@ enum InflightActorStatus { } impl InflightActorStatus { - fn max_issued_epoch(&self) -> Option { + fn max_issued_epoch(&self) -> u64 { match self { - InflightActorStatus::NotStarted => None, - InflightActorStatus::Running(epoch) => Some(*epoch), + InflightActorStatus::Running(epoch) => *epoch, InflightActorStatus::IssuedFirst(issued_barriers) => { - Some(issued_barriers.last().expect("non-empty").epoch.prev) + issued_barriers.last().expect("non-empty").epoch.prev } } } @@ -223,18 +272,35 @@ pub(crate) struct InflightActorState { status: InflightActorStatus, /// Whether the actor has been issued a stop barrier is_stopping: bool, + + join_handle: JoinHandle<()>, + monitor_task_handle: Option>, } impl InflightActorState { - pub(super) fn not_started(actor_id: ActorId) -> Self { + pub(super) fn start( + actor_id: ActorId, + initial_partial_graph_id: PartialGraphId, + initial_barrier: &Barrier, + join_handle: JoinHandle<()>, + monitor_task_handle: Option>, + ) -> Self { Self { actor_id, pending_subscribers: Default::default(), barrier_senders: vec![], - inflight_barriers: BTreeMap::default(), - barrier_mutations: Default::default(), - status: InflightActorStatus::NotStarted, + inflight_barriers: BTreeMap::from_iter([( + initial_barrier.epoch.prev, + initial_partial_graph_id, + )]), + barrier_mutations: BTreeMap::from_iter([( + initial_barrier.epoch.prev, + (initial_barrier.mutation.clone(), initial_barrier.epoch.curr), + )]), + status: InflightActorStatus::IssuedFirst(vec![initial_barrier.clone()]), is_stopping: false, + join_handle, + monitor_task_handle, } } @@ -263,9 +329,7 @@ impl InflightActorState { barrier: &Barrier, is_stop: bool, ) -> StreamResult<()> { - if let Some(max_issued_epoch) = self.status.max_issued_epoch() { - assert!(barrier.epoch.prev > max_issued_epoch); - } + assert!(barrier.epoch.prev > self.status.max_issued_epoch()); if let Some((first_epoch, _)) = self.pending_subscribers.first_key_value() { assert!( @@ -312,9 +376,6 @@ impl InflightActorState { } match &mut self.status { - InflightActorStatus::NotStarted => { - self.status = InflightActorStatus::IssuedFirst(vec![barrier.clone()]); - } InflightActorStatus::IssuedFirst(pending_barriers) => { pending_barriers.push(barrier.clone()); } @@ -338,9 +399,6 @@ impl InflightActorState { let (min_mutation_epoch, _) = self.barrier_mutations.pop_first().expect("should exist"); assert_eq!(min_mutation_epoch, epoch.prev); match &self.status { - InflightActorStatus::NotStarted => { - unreachable!("should have issued a barrier when collect") - } 
InflightActorStatus::IssuedFirst(pending_barriers) => { assert_eq!( prev_epoch, @@ -372,6 +430,9 @@ pub(super) struct PartialGraphManagedBarrierState { prev_barrier_table_ids: Option<(EpochPair, HashSet)>, /// Record the progress updates of creating mviews for each epoch of concurrent checkpoints. + /// + /// This is updated by [`super::CreateMviewProgressReporter::update`] and will be reported to meta + /// in [`BarrierCompleteResult`]. pub(super) create_mview_progress: HashMap>, pub(super) state_store: StateStoreImpl, @@ -416,32 +477,27 @@ impl PartialGraphManagedBarrierState { } } -pub(super) struct ManagedBarrierState { +pub(crate) struct ManagedBarrierState { pub(super) actor_states: HashMap, pub(super) graph_states: HashMap, - pub(super) state_store: StateStoreImpl, - - pub(super) streaming_metrics: Arc, + actor_manager: Arc, - /// Manages the await-trees of all barriers. - barrier_await_tree_reg: Option, + current_shared_context: Arc, } impl ManagedBarrierState { /// Create a barrier manager state. This will be called only once. pub(super) fn new( - state_store: StateStoreImpl, - streaming_metrics: Arc, - barrier_await_tree_reg: Option, + actor_manager: Arc, + current_shared_context: Arc, ) -> Self { Self { actor_states: Default::default(), graph_states: Default::default(), - state_store, - streaming_metrics, - barrier_await_tree_reg, + actor_manager, + current_shared_context, } } @@ -450,6 +506,21 @@ impl ManagedBarrierState { graph_states: &self.graph_states, } } + + pub(crate) async fn abort_actors(&mut self) { + for (actor_id, state) in &self.actor_states { + tracing::debug!("force stopping actor {}", actor_id); + state.join_handle.abort(); + if let Some(monitor_task_handle) = &state.monitor_task_handle { + monitor_task_handle.abort(); + } + } + for (actor_id, state) in self.actor_states.drain() { + tracing::debug!("join actor {}", actor_id); + let result = state.join_handle.await; + assert!(result.is_ok() || result.unwrap_err().is_cancelled()); + } + } } impl InflightActorState { @@ -485,17 +556,13 @@ impl InflightActorState { .push(tx); } } else { - // Barrier has not issued yet. Store the pending tx - if let Some(max_issued_epoch) = self.status.max_issued_epoch() { - assert!( - max_issued_epoch < start_prev_epoch, - "later barrier {} has been issued, but skip the start epoch {:?}", - max_issued_epoch, - start_prev_epoch - ); - } else { - assert!(!self.is_stopping, "actor has been stopped and has not inflight barrier. 
unlikely to get further barrier"); - } + let max_issued_epoch = self.status.max_issued_epoch(); + assert!( + max_issued_epoch < start_prev_epoch, + "later barrier {} has been issued, but skip the start epoch {:?}", + max_issued_epoch, + start_prev_epoch + ); self.pending_subscribers .entry(start_prev_epoch) .or_default() @@ -508,9 +575,6 @@ impl InflightActorState { tx: mpsc::UnboundedSender, ) -> StreamResult<()> { match &self.status { - InflightActorStatus::NotStarted => { - self.barrier_senders.push(tx); - } InflightActorStatus::IssuedFirst(pending_barriers) => { for barrier in pending_barriers { tx.send(barrier.clone()).map_err(|_| { @@ -539,8 +603,8 @@ impl ManagedBarrierState { tx: mpsc::UnboundedSender, ) { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .subscribe_actor_mutation(start_prev_epoch, tx); } @@ -550,53 +614,105 @@ impl ManagedBarrierState { tx: mpsc::UnboundedSender, ) -> StreamResult<()> { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .register_barrier_sender(tx) } pub(super) fn transform_to_issued( &mut self, barrier: &Barrier, + actors_to_build: Vec, actor_ids_to_collect: HashSet, table_ids: HashSet, partial_graph_id: PartialGraphId, actor_ids_to_pre_sync_barrier: HashSet, ) -> StreamResult<()> { let actor_to_stop = barrier.all_stop_actors(); + let is_stop_actor = |actor_id| { + actor_to_stop + .map(|actors| actors.contains(&actor_id)) + .unwrap_or(false) + }; let graph_state = self .graph_states .entry(partial_graph_id) .or_insert_with(|| { PartialGraphManagedBarrierState::new( - self.state_store.clone(), - self.streaming_metrics.clone(), - self.barrier_await_tree_reg.clone(), + self.actor_manager.env.state_store(), + self.actor_manager.streaming_metrics.clone(), + self.actor_manager.await_tree_reg.clone(), ) }); graph_state.transform_to_issued(barrier, actor_ids_to_collect.clone(), table_ids); + let mut new_actors = HashSet::new(); + for actor in actors_to_build { + let actor_id = actor.actor.as_ref().unwrap().actor_id; + assert!(!is_stop_actor(actor_id)); + assert!(new_actors.insert(actor_id)); + assert!(actor_ids_to_collect.contains(&actor_id)); + let (join_handle, monitor_join_handle) = self + .actor_manager + .spawn_actor(actor, self.current_shared_context.clone()); + assert!(self + .actor_states + .try_insert( + actor_id, + InflightActorState::start( + actor_id, + partial_graph_id, + barrier, + join_handle, + monitor_join_handle + ) + ) + .is_ok()); + } + + // Spawn a trivial join handle to be compatible with the unit test + if cfg!(test) { + for actor_id in &actor_ids_to_collect { + if !self.actor_states.contains_key(actor_id) { + let join_handle = self.actor_manager.runtime.spawn(async { pending().await }); + assert!(self + .actor_states + .try_insert( + *actor_id, + InflightActorState::start( + *actor_id, + partial_graph_id, + barrier, + join_handle, + None, + ) + ) + .is_ok()); + new_actors.insert(*actor_id); + } + } + } + // Note: it's important to issue barrier to actor after issuing to graph to ensure that // we call `start_epoch` on the graph before the actors receive the barrier - for actor_id in actor_ids_to_collect { + for actor_id in &actor_ids_to_collect { + if new_actors.contains(actor_id) { + continue; + } self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) - .issue_barrier( - partial_graph_id, - 
barrier, - actor_to_stop - .map(|actors| actors.contains(&actor_id)) - .unwrap_or(false), - )?; + .get_mut(actor_id) + .unwrap_or_else(|| { + panic!("should exist: {} {:?}", actor_id, actor_ids_to_collect); + }) + .issue_barrier(partial_graph_id, barrier, is_stop_actor(*actor_id))?; } if partial_graph_id.is_global_graph() { for actor_id in actor_ids_to_pre_sync_barrier { self.actor_states - .entry(actor_id) - .or_insert_with(|| InflightActorState::not_started(actor_id)) + .get_mut(&actor_id) + .expect("should exist") .sync_barrier(barrier); } } else { @@ -610,9 +726,12 @@ impl ManagedBarrierState { ) -> impl Future + '_ { poll_fn(|cx| { for (partial_graph_id, graph_state) in &mut self.graph_states { - if let Poll::Ready(epoch) = graph_state.poll_next_completed_epoch(cx) { + if let Poll::Ready(barrier) = graph_state.poll_next_completed_barrier(cx) { + if let Some(actors_to_stop) = barrier.all_stop_actors() { + self.current_shared_context.drop_actors(actors_to_stop); + } let partial_graph_id = *partial_graph_id; - return Poll::Ready((partial_graph_id, epoch)); + return Poll::Ready((partial_graph_id, barrier.epoch.prev)); } } Poll::Pending @@ -626,7 +745,10 @@ impl ManagedBarrierState { .expect("should exist") .collect(epoch); if is_finished { - self.actor_states.remove(&actor_id); + let state = self.actor_states.remove(&actor_id).expect("should exist"); + if let Some(monitor_task_handle) = state.monitor_task_handle { + monitor_task_handle.abort(); + } } let prev_graph_state = self .graph_states @@ -677,21 +799,10 @@ impl PartialGraphManagedBarrierState { let create_mview_progress = self .create_mview_progress - .remove(&barrier_state.curr_epoch) + .remove(&barrier_state.barrier.epoch.curr) .unwrap_or_default() .into_iter() - .map(|(actor, state)| CreateMviewProgress { - backfill_actor_id: actor, - done: matches!(state, BackfillState::Done(_)), - consumed_epoch: match state { - BackfillState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch, - BackfillState::Done(_) => barrier_state.curr_epoch, - }, - consumed_rows: match state { - BackfillState::ConsumingUpstream(_, consumed_rows) => consumed_rows, - BackfillState::Done(consumed_rows) => consumed_rows, - }, - }) + .map(|(actor, state)| state.to_pb(actor)) .collect(); let complete_barrier_future = match kind { @@ -724,34 +835,15 @@ impl PartialGraphManagedBarrierState { } }; + let barrier = barrier_state.barrier.clone(); + self.await_epoch_completed_futures.push_back({ - let future = async move { - if let Some(future) = complete_barrier_future { - let result = future.await; - result.map(Some) - } else { - Ok(None) - } - } - .map(move |result| { - ( - prev_epoch, - result.map(|sync_result| BarrierCompleteResult { - sync_result, - create_mview_progress, - }), - ) - }); - if let Some(reg) = &self.barrier_await_tree_reg { - reg.register( - await_tree_key::BarrierAwait { prev_epoch }, - format!("SyncEpoch({})", prev_epoch), - ) - .instrument(future) - .left_future() - } else { - future.right_future() - } + instrument_complete_barrier_future( + complete_barrier_future, + barrier, + self.barrier_await_tree_reg.as_ref(), + create_mview_progress, + ) }); } } @@ -775,7 +867,7 @@ impl PartialGraphManagedBarrierState { ) } Some(&mut BarrierState { - curr_epoch, + ref barrier, inner: ManagedBarrierStateInner::Issued(IssuedState { ref mut remaining_actors, @@ -789,7 +881,7 @@ impl PartialGraphManagedBarrierState { "the actor doesn't exist. 
actor_id: {:?}, curr_epoch: {:?}", actor_id, epoch.curr ); - assert_eq!(curr_epoch, epoch.curr); + assert_eq!(barrier.epoch.curr, epoch.curr); self.may_have_collected_all(epoch.prev); } Some(BarrierState { inner, .. }) => { @@ -871,7 +963,7 @@ impl PartialGraphManagedBarrierState { self.epoch_barrier_state_map.insert( barrier.epoch.prev, BarrierState { - curr_epoch: barrier.epoch.curr, + barrier: barrier.clone(), inner: ManagedBarrierStateInner::Issued(IssuedState { remaining_actors: BTreeSet::from_iter(actor_ids_to_collect), mutation: barrier.mutation.clone(), @@ -885,17 +977,17 @@ impl PartialGraphManagedBarrierState { } /// Return a future that yields the next completed epoch. The future is cancellation safe. - pub(crate) fn poll_next_completed_epoch(&mut self, cx: &mut Context<'_>) -> Poll { + pub(crate) fn poll_next_completed_barrier(&mut self, cx: &mut Context<'_>) -> Poll { ready!(self.await_epoch_completed_futures.next().poll_unpin(cx)) - .map(|(prev_epoch, result)| { + .map(|(barrier, result)| { let state = self .epoch_barrier_state_map - .get_mut(&prev_epoch) + .get_mut(&barrier.epoch.prev) .expect("should exist"); // sanity check on barrier state assert_matches!(&state.inner, ManagedBarrierStateInner::AllCollected); state.inner = ManagedBarrierStateInner::Completed(result); - prev_epoch + barrier }) .map(Poll::Ready) .unwrap_or(Poll::Pending) @@ -941,9 +1033,12 @@ impl PartialGraphManagedBarrierState { #[cfg(test)] async fn pop_next_completed_epoch(&mut self) -> u64 { - let epoch = poll_fn(|cx| self.poll_next_completed_epoch(cx)).await; - let _ = self.pop_completed_epoch(epoch).unwrap().unwrap(); - epoch + let barrier = poll_fn(|cx| self.poll_next_completed_barrier(cx)).await; + let _ = self + .pop_completed_epoch(barrier.epoch.prev) + .unwrap() + .unwrap(); + barrier.epoch.prev } } diff --git a/src/stream/src/task/barrier_manager/progress.rs b/src/stream/src/task/barrier_manager/progress.rs index 9a243c2e975d1..9b2820bb3bfed 100644 --- a/src/stream/src/task/barrier_manager/progress.rs +++ b/src/stream/src/task/barrier_manager/progress.rs @@ -15,6 +15,7 @@ use std::fmt::{Display, Formatter}; use risingwave_common::util::epoch::EpochPair; +use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; use super::LocalBarrierManager; use crate::task::barrier_manager::LocalBarrierEvent::ReportCreateProgress; @@ -30,6 +31,23 @@ pub(crate) enum BackfillState { Done(ConsumedRows), } +impl BackfillState { + pub fn to_pb(self, actor_id: ActorId) -> PbCreateMviewProgress { + PbCreateMviewProgress { + backfill_actor_id: actor_id, + done: matches!(self, BackfillState::Done(_)), + consumed_epoch: match self { + BackfillState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch, + BackfillState::Done(_) => 0, // unused field for done + }, + consumed_rows: match self { + BackfillState::ConsumingUpstream(_, consumed_rows) => consumed_rows, + BackfillState::Done(consumed_rows) => consumed_rows, + }, + } + } +} + impl Display for BackfillState { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -103,7 +121,7 @@ impl LocalBarrierManager { /// TODO(kwannoel): Perhaps it is possible to get total key count of the replicated state table /// for arrangement backfill. We can use that to estimate the progress as well, and avoid recording /// `row_count` state for it. -pub struct CreateMviewProgress { +pub struct CreateMviewProgressReporter { barrier_manager: LocalBarrierManager, /// The id of the actor containing the backfill executors. 
@@ -112,7 +130,7 @@ pub struct CreateMviewProgress { state: Option, } -impl CreateMviewProgress { +impl CreateMviewProgressReporter { pub fn new(barrier_manager: LocalBarrierManager, backfill_actor_id: ActorId) -> Self { Self { barrier_manager, @@ -186,8 +204,8 @@ impl LocalBarrierManager { pub fn register_create_mview_progress( &self, backfill_actor_id: ActorId, - ) -> CreateMviewProgress { + ) -> CreateMviewProgressReporter { trace!("register create mview progress: {}", backfill_actor_id); - CreateMviewProgress::new(self.clone(), backfill_actor_id) + CreateMviewProgressReporter::new(self.clone(), backfill_actor_id) } } diff --git a/src/stream/src/task/barrier_manager/tests.rs b/src/stream/src/task/barrier_manager/tests.rs index d6a8256aebb61..112ee533d8e6d 100644 --- a/src/stream/src/task/barrier_manager/tests.rs +++ b/src/stream/src/task/barrier_manager/tests.rs @@ -40,19 +40,22 @@ async fn test_managed_barrier_collection() -> StreamResult<()> { // Register actors let actor_ids = vec![233, 234, 235]; - let count = actor_ids.len(); - let mut rxs = actor_ids - .clone() - .into_iter() - .map(register_sender) - .collect_vec(); // Send a barrier to all actors let curr_epoch = test_epoch(2); let barrier = Barrier::new_test_barrier(curr_epoch); let epoch = barrier.epoch.prev; - test_env.inject_barrier(&barrier, actor_ids); + test_env.inject_barrier(&barrier, actor_ids.clone()); + + manager.flush_all_events().await; + + let count = actor_ids.len(); + let mut rxs = actor_ids + .clone() + .into_iter() + .map(register_sender) + .collect_vec(); // Collect barriers from actors let collected_barriers = join_all(rxs.iter_mut().map(|(actor_id, rx)| async move { @@ -105,6 +108,14 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { .chain(once(extra_actor_id)) .collect_vec(); + // Prepare the barrier + let curr_epoch = test_epoch(2); + let barrier = Barrier::new_test_barrier(curr_epoch).with_stop(); + + test_env.inject_barrier(&barrier, actor_ids_to_collect.clone()); + + manager.flush_all_events().await; + // Register actors let count = actor_ids_to_send.len(); let mut rxs = actor_ids_to_send @@ -113,10 +124,6 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { .map(register_sender) .collect_vec(); - // Prepare the barrier - let curr_epoch = test_epoch(2); - let barrier = Barrier::new_test_barrier(curr_epoch).with_stop(); - let mut mutation_subscriber = manager.subscribe_barrier_mutation(extra_actor_id, &barrier.clone().into_dispatcher()); @@ -124,8 +131,6 @@ async fn test_managed_barrier_collection_separately() -> StreamResult<()> { let mut mutation_reader = pin!(mutation_subscriber.recv()); assert!(poll_fn(|cx| Poll::Ready(mutation_reader.as_mut().poll(cx).is_pending())).await); - test_env.inject_barrier(&barrier, actor_ids_to_collect); - let (epoch, mutation) = mutation_reader.await.unwrap(); assert_eq!((epoch, &mutation), (barrier.epoch.prev, &barrier.mutation)); @@ -196,6 +201,8 @@ async fn test_late_register_barrier_sender() -> StreamResult<()> { test_env.inject_barrier(&barrier1, actor_ids_to_collect.clone()); test_env.inject_barrier(&barrier2, actor_ids_to_collect.clone()); + manager.flush_all_events().await; + // register sender after inject barrier let mut rxs = actor_ids_to_send .clone() diff --git a/src/stream/src/task/mod.rs b/src/stream/src/task/mod.rs index b5382b3418052..59851fdf09ad8 100644 --- a/src/stream/src/task/mod.rs +++ b/src/stream/src/task/mod.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use anyhow::anyhow; use parking_lot::{MappedMutexGuard, Mutex, MutexGuard, RwLock}; @@ -194,7 +194,7 @@ impl SharedContext { &self.config } - pub fn drop_actors(&self, actors: &[ActorId]) { + pub(super) fn drop_actors(&self, actors: &HashSet) { self.channel_map .lock() .retain(|(up_id, _), _| !actors.contains(up_id)); diff --git a/src/stream/src/task/stream_manager.rs b/src/stream/src/task/stream_manager.rs index 60b7341371497..ba76e6fab791d 100644 --- a/src/stream/src/task/stream_manager.rs +++ b/src/stream/src/task/stream_manager.rs @@ -19,7 +19,6 @@ use std::sync::atomic::AtomicU64; use std::sync::Arc; use std::time::Instant; -use anyhow::anyhow; use async_recursion::async_recursion; use await_tree::InstrumentAwait; use futures::stream::BoxStream; @@ -59,8 +58,8 @@ use crate::task::barrier_manager::{ ControlStreamHandle, EventSender, LocalActorOperation, LocalBarrierWorker, }; use crate::task::{ - ActorId, FragmentId, LocalBarrierManager, SharedContext, StreamActorManager, - StreamActorManagerState, StreamEnvironment, UpDownActorIds, + ActorId, FragmentId, LocalBarrierManager, SharedContext, StreamActorManager, StreamEnvironment, + UpDownActorIds, }; #[cfg(test)] @@ -214,16 +213,6 @@ impl LocalStreamManager { }) } - /// Drop the resources of the given actors. - pub async fn drop_actors(&self, actors: Vec) -> StreamResult<()> { - self.actor_op_tx - .send_and_await(|result_sender| LocalActorOperation::DropActors { - actors, - result_sender, - }) - .await - } - pub async fn take_receiver(&self, ids: UpDownActorIds) -> StreamResult { self.actor_op_tx .send_and_await(|result_sender| LocalActorOperation::TakeReceiver { @@ -256,28 +245,9 @@ impl LocalStreamManager { } impl LocalBarrierWorker { - /// Drop the resources of the given actors. - pub(super) fn drop_actors(&mut self, actors: &[ActorId]) { - self.current_shared_context.drop_actors(actors); - for &id in actors { - self.actor_manager_state.drop_actor(id); - } - tracing::debug!(actors = ?actors, "drop actors"); - } - /// Force stop all actors on this worker, and then drop their resources. pub(super) async fn reset(&mut self, version_id: HummockVersionId) { - let actor_handles = self.actor_manager_state.drain_actor_handles(); - for (actor_id, handle) in &actor_handles { - tracing::debug!("force stopping actor {}", actor_id); - handle.abort(); - } - for (actor_id, handle) in actor_handles { - tracing::debug!("join actor {}", actor_id); - let result = handle.await; - assert!(result.is_ok() || result.unwrap_err().is_cancelled()); - } - self.actor_manager_state.clear_state(); + self.state.abort_actors().await; if let Some(m) = self.actor_manager.await_tree_reg.as_ref() { m.clear(); } @@ -291,26 +261,6 @@ impl LocalBarrierWorker { self.reset_state(); self.actor_manager.env.dml_manager_ref().clear(); } - - pub(super) fn update_actors(&mut self, actors: Vec) -> StreamResult<()> { - self.actor_manager_state.update_actors(actors) - } - - /// This function could only be called once during the lifecycle of `LocalStreamManager` for - /// now. 
- pub(super) fn start_create_actors(&mut self, actors: &[ActorId]) -> StreamResult<()> { - let actors: Vec<_> = actors - .iter() - .map(|actor_id| { - self.actor_manager_state - .actors - .remove(actor_id) - .ok_or_else(|| anyhow!("No such actor with actor id:{}", actor_id)) - }) - .try_collect()?; - self.spawn_actors(actors); - Ok(()) - } } impl StreamActorManager { @@ -559,18 +509,22 @@ impl StreamActorManager { } } -impl LocalBarrierWorker { - pub(super) fn spawn_actors(&mut self, actors: Vec) { - for actor in actors { +impl StreamActorManager { + pub(super) fn spawn_actor( + self: &Arc, + actor: BuildActorInfo, + current_shared_context: Arc, + ) -> (JoinHandle<()>, Option>) { + { let monitor = tokio_metrics::TaskMonitor::new(); let stream_actor_ref = actor.actor.as_ref().unwrap(); let actor_id = stream_actor_ref.actor_id; let handle = { let trace_span = format!("Actor {actor_id}: `{}`", stream_actor_ref.mview_definition); - let barrier_manager = self.current_shared_context.local_barrier_manager.clone(); + let barrier_manager = current_shared_context.local_barrier_manager.clone(); // wrap the future of `create_actor` with `boxed` to avoid stack overflow - let actor = self.actor_manager.clone().create_actor(actor, self.current_shared_context.clone()).boxed().and_then(|actor| actor.run()).map(move |result| { + let actor = self.clone().create_actor(actor, current_shared_context).boxed().and_then(|actor| actor.run()).map(move |result| { if let Err(err) = result { // TODO: check error type and panic if it's unexpected. // Intentionally use `?` on the report to also include the backtrace. @@ -578,7 +532,7 @@ impl LocalBarrierWorker { barrier_manager.notify_failure(actor_id, err); } }); - let traced = match &self.actor_manager.await_tree_reg { + let traced = match &self.await_tree_reg { Some(m) => m .register(await_tree_key::Actor(actor_id), trace_span) .instrument(actor) @@ -586,24 +540,17 @@ impl LocalBarrierWorker { None => actor.right_future(), }; let instrumented = monitor.instrument(traced); - let with_config = - crate::CONFIG.scope(self.actor_manager.env.config().clone(), instrumented); + let with_config = crate::CONFIG.scope(self.env.config().clone(), instrumented); - self.actor_manager.runtime.spawn(with_config) + self.runtime.spawn(with_config) }; - self.actor_manager_state.handles.insert(actor_id, handle); - - if self.actor_manager.streaming_metrics.level >= MetricLevel::Debug - || self - .actor_manager - .env - .config() - .developer - .enable_actor_tokio_metrics + + let monitor_handle = if self.streaming_metrics.level >= MetricLevel::Debug + || self.env.config().developer.enable_actor_tokio_metrics { tracing::info!("Tokio metrics are enabled."); - let streaming_metrics = self.actor_manager.streaming_metrics.clone(); - let actor_monitor_task = self.actor_manager.runtime.spawn(async move { + let streaming_metrics = self.streaming_metrics.clone(); + let actor_monitor_task = self.runtime.spawn(async move { let metrics = streaming_metrics.new_actor_metrics(actor_id); loop { let task_metrics = monitor.cumulative(); @@ -643,10 +590,11 @@ impl LocalBarrierWorker { tokio::time::sleep(Duration::from_secs(1)).await; } }); - self.actor_manager_state - .actor_monitor_tasks - .insert(actor_id, actor_monitor_task); - } + Some(actor_monitor_task) + } else { + None + }; + (handle, monitor_handle) } } } @@ -671,44 +619,6 @@ impl LocalBarrierWorker { } } -impl StreamActorManagerState { - /// `drop_actor` is invoked by meta node via RPC once the stop barrier arrives at the - /// sink. 
All the actors in the actors should stop themselves before this method is invoked. - fn drop_actor(&mut self, actor_id: ActorId) { - self.actor_monitor_tasks - .remove(&actor_id) - .inspect(|handle| handle.abort()); - self.actors.remove(&actor_id); - - // Task should have already stopped when this method is invoked. There might be some - // clean-up work left (like dropping in-memory data structures), but we don't have to wait - // for them to finish, in order to make this request non-blocking. - self.handles.remove(&actor_id); - } - - fn drain_actor_handles(&mut self) -> Vec<(ActorId, ActorHandle)> { - self.handles.drain().collect() - } - - /// `stop_all_actors` is invoked by meta node via RPC for recovery purpose. Different from the - /// `drop_actor`, the execution of the actors will be aborted. - fn clear_state(&mut self) { - self.actors.clear(); - self.actor_monitor_tasks.clear(); - } - - fn update_actors(&mut self, actors: Vec) -> StreamResult<()> { - for actor in actors { - let actor_id = actor.actor.as_ref().unwrap().get_actor_id(); - self.actors - .try_insert(actor_id, actor) - .map_err(|_| anyhow!("duplicated actor {}", actor_id))?; - } - - Ok(()) - } -} - #[cfg(test)] pub mod test_utils { use risingwave_pb::common::HostAddress; diff --git a/src/tests/simulation/Cargo.toml b/src/tests/simulation/Cargo.toml index 8729207c0d025..c82f2b7d5911e 100644 --- a/src/tests/simulation/Cargo.toml +++ b/src/tests/simulation/Cargo.toml @@ -25,6 +25,7 @@ glob = "0.3" itertools = { workspace = true } lru = { workspace = true } madsim = "0.2.30" +maplit = "1" paste = "1" pin-project = "1.1" pretty_assertions = "1" diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs index 26fdc3a8757e1..a9ffba0063562 100644 --- a/src/tests/simulation/src/cluster.rs +++ b/src/tests/simulation/src/cluster.rs @@ -158,27 +158,16 @@ impl Configuration { /// Provides a configuration for scale test which ensures that the arrangement backfill is disabled, /// so table scan will use `no_shuffle`. pub fn for_scale_no_shuffle() -> Self { - // Embed the config file and create a temporary file at runtime. The file will be deleted - // automatically when it's dropped. - let config_path = { - let mut file = - tempfile::NamedTempFile::new().expect("failed to create temp config file"); - file.write_all(include_bytes!("risingwave-scale.toml")) - .expect("failed to write config file"); - file.into_temp_path() - }; + let mut conf = Self::for_scale(); + conf.per_session_queries = + vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()].into(); + conf + } - Configuration { - config_path: ConfigPath::Temp(config_path.into()), - frontend_nodes: 2, - compute_nodes: 3, - meta_nodes: 3, - compactor_nodes: 2, - compute_node_cores: 2, - per_session_queries: vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".into()] - .into(), - ..Default::default() - } + pub fn for_scale_shared_source() -> Self { + let mut conf = Self::for_scale(); + conf.per_session_queries = vec!["SET RW_ENABLE_SHARED_SOURCE = true;".into()].into(); + conf } pub fn for_auto_parallelism( diff --git a/src/tests/simulation/src/ctl_ext.rs b/src/tests/simulation/src/ctl_ext.rs index 9b57673e49c16..3986a826e21e7 100644 --- a/src/tests/simulation/src/ctl_ext.rs +++ b/src/tests/simulation/src/ctl_ext.rs @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#![cfg_attr(not(madsim), expect(unused_imports))] - -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ffi::OsString; use std::fmt::Write; use std::sync::Arc; @@ -23,17 +21,17 @@ use anyhow::{anyhow, Result}; use cfg_or_panic::cfg_or_panic; use clap::Parser; use itertools::Itertools; -use rand::seq::{IteratorRandom, SliceRandom}; +use rand::seq::IteratorRandom; use rand::{thread_rng, Rng}; use risingwave_common::catalog::TableId; use risingwave_common::hash::WorkerSlotId; +use risingwave_connector::source::{SplitImpl, SplitMetaData}; use risingwave_hummock_sdk::{CompactionGroupId, HummockSstableId}; use risingwave_pb::meta::table_fragments::fragment::FragmentDistributionType; use risingwave_pb::meta::table_fragments::PbFragment; use risingwave_pb::meta::update_worker_node_schedulability_request::Schedulability; use risingwave_pb::meta::GetClusterInfoResponse; use risingwave_pb::stream_plan::StreamNode; -use serde::de::IntoDeserializer; use self::predicate::BoxedPredicate; use crate::cluster::Cluster; @@ -76,7 +74,7 @@ pub mod predicate { Box::new(p) } - /// There exists operators whose identity contains `s` in the fragment. + /// There exists operators whose identity contains `s` in the fragment (case insensitive). pub fn identity_contains(s: impl Into) -> BoxedPredicate { let s: String = s.into(); let p = move |f: &PbFragment| { @@ -363,6 +361,30 @@ impl Cluster { Ok(response) } + /// `table_id -> actor_id -> splits` + pub async fn list_source_splits(&self) -> Result>> { + let info = self.get_cluster_info().await?; + let mut res = BTreeMap::new(); + + for table in info.table_fragments { + let mut table_actor_splits = BTreeMap::new(); + + for (actor_id, splits) in table.actor_splits { + let splits = splits + .splits + .iter() + .map(|split| SplitImpl::try_from(split).unwrap()) + .map(|split| split.id()) + .collect_vec() + .join(","); + table_actor_splits.insert(actor_id, splits); + } + res.insert(table.table_id, table_actor_splits); + } + + Ok(res) + } + // update node schedulability #[cfg_or_panic(madsim)] async fn update_worker_node_schedulability( diff --git a/src/tests/simulation/src/lib.rs b/src/tests/simulation/src/lib.rs index aa6303b8e2f65..af9cf158a3350 100644 --- a/src/tests/simulation/src/lib.rs +++ b/src/tests/simulation/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. #![feature(trait_alias)] -#![feature(lint_reasons)] #![feature(let_chains)] #![feature(try_blocks)] #![feature(register_tool)] diff --git a/src/tests/simulation/tests/integration_tests/scale/mod.rs b/src/tests/simulation/tests/integration_tests/scale/mod.rs index f6940f072409e..3c7a702dc6290 100644 --- a/src/tests/simulation/tests/integration_tests/scale/mod.rs +++ b/src/tests/simulation/tests/integration_tests/scale/mod.rs @@ -20,6 +20,7 @@ mod nexmark_q4; mod nexmark_source; mod no_shuffle; mod schedulability; +mod shared_source; mod singleton_migration; mod sink; mod streaming_parallelism; diff --git a/src/tests/simulation/tests/integration_tests/scale/shared_source.rs b/src/tests/simulation/tests/integration_tests/scale/shared_source.rs new file mode 100644 index 0000000000000..175b3a043100c --- /dev/null +++ b/src/tests/simulation/tests/integration_tests/scale/shared_source.rs @@ -0,0 +1,192 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; + +use anyhow::Result; +use itertools::Itertools; +use maplit::{convert_args, hashmap}; +use risingwave_common::hash::WorkerSlotId; +use risingwave_pb::meta::table_fragments::Fragment; +use risingwave_simulation::cluster::{Cluster, Configuration}; +use risingwave_simulation::ctl_ext::predicate::{identity_contains, no_identity_contains}; + +const CREATE_SOURCE: &str = r#" +CREATE SOURCE s(v1 int, v2 varchar) WITH ( + connector='kafka', + properties.bootstrap.server='192.168.11.1:29092', + topic='shared_source' +) FORMAT PLAIN ENCODE JSON;"#; + +fn actor_upstream(fragment: &Fragment) -> Vec<(u32, Vec)> { + fragment + .actors + .iter() + .map(|actor| (actor.actor_id, actor.upstream_actor_id.clone())) + .collect_vec() +} + +async fn validate_splits_aligned(cluster: &mut Cluster) -> Result<()> { + let source_backfill_fragment = cluster + .locate_one_fragment([identity_contains("StreamSourceScan")]) + .await?; + // The result of scaling is non-deterministic. + // So we just print the result here, instead of asserting with a fixed value. + let actor_upstream = actor_upstream(&source_backfill_fragment.inner); + tracing::info!( + "{}", + actor_upstream + .iter() + .format_with("\n", |(actor_id, upstream), f| f(&format_args!( + "{} <- {:?}", + actor_id, upstream + ))) + ); + let splits = cluster.list_source_splits().await?; + tracing::info!("{:#?}", splits); + let actor_splits: BTreeMap = splits + .values() + .flat_map(|m| m.clone().into_iter()) + .collect(); + for (actor, upstream) in actor_upstream { + assert!(upstream.len() == 1, "invalid upstream: {:?}", upstream); + let upstream_actor = upstream[0]; + assert_eq!( + actor_splits.get(&actor).unwrap(), + actor_splits.get(&upstream_actor).unwrap() + ); + } + Ok(()) +} + +#[tokio::test] +async fn test_shared_source() -> Result<()> { + tracing_subscriber::fmt::Subscriber::builder() + .with_max_level(tracing::Level::ERROR) + .with_env_filter("risingwave_stream::executor::source::source_backfill_executor=DEBUG,integration_tests=DEBUG") + .init(); + + let mut cluster = Cluster::start(Configuration::for_scale_shared_source()).await?; + cluster.create_kafka_topics(convert_args!(hashmap!( + "shared_source" => 4, + ))); + let mut session = cluster.start_session(); + + session.run("set rw_implicit_flush = true;").await?; + + session.run(CREATE_SOURCE).await?; + session + .run("create materialized view mv as select count(*) from s group by v1;") + .await?; + let source_fragment = cluster + .locate_one_fragment([ + identity_contains("Source"), + no_identity_contains("StreamSourceScan"), + ]) + .await?; + let source_workers = source_fragment.all_worker_count().into_keys().collect_vec(); + let source_backfill_fragment = cluster + .locate_one_fragment([identity_contains("StreamSourceScan")]) + .await?; + let source_backfill_workers = source_backfill_fragment + .all_worker_count() + .into_keys() + .collect_vec(); + let hash_agg_fragment = cluster + .locate_one_fragment([identity_contains("hashagg")]) + .await?; + let hash_agg_workers = hash_agg_fragment + .all_worker_count() + .into_keys() + .collect_vec(); + 
validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 6 + 2 3 HASH {4,3} {3} {MVIEW} 6 + 3 3 HASH {5} {1} {SOURCE_SCAN} 6"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" + 1 CREATED ADAPTIVE + 3 CREATED ADAPTIVE"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + + // SourceBackfill cannot be scaled because of NoShuffle. + assert!( + &cluster + .reschedule( + source_backfill_fragment + .reschedule([WorkerSlotId::new(source_backfill_workers[0], 0)], []), + ) + .await.unwrap_err().to_string().contains("rescheduling NoShuffle downstream fragment (maybe Chain fragment) is forbidden, please use NoShuffle upstream fragment (like Materialized fragment) to scale"), + ); + + // hash agg can be scaled independently + cluster + .reschedule(hash_agg_fragment.reschedule([WorkerSlotId::new(hash_agg_workers[0], 0)], [])) + .await + .unwrap(); + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 6 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 6"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + + // source is the NoShuffle upstream. It can be scaled, and the downstream SourceBackfill will be scaled together. + cluster + .reschedule(source_fragment.reschedule( + [ + WorkerSlotId::new(source_workers[0], 0), + WorkerSlotId::new(source_workers[0], 1), + WorkerSlotId::new(source_workers[2], 0), + ], + [], + )) + .await + .unwrap(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 3 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 3"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" + 1 CREATED CUSTOM + 3 CREATED CUSTOM"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + + // resolve_no_shuffle for backfill fragment is OK, which will scale the upstream together. + cluster + .reschedule_resolve_no_shuffle(source_backfill_fragment.reschedule( + [], + [ + WorkerSlotId::new(source_workers[0], 0), + WorkerSlotId::new(source_workers[0], 1), + WorkerSlotId::new(source_workers[2], 0), + WorkerSlotId::new(source_workers[2], 1), + ], + )) + .await + .unwrap(); + validate_splits_aligned(&mut cluster).await?; + expect_test::expect![[r#" + 1 1 HASH {2} {} {SOURCE} 7 + 2 3 HASH {4,3} {3} {MVIEW} 5 + 3 3 HASH {5} {1} {SOURCE_SCAN} 7"#]] + .assert_eq(&cluster.run("select * from rw_fragments;").await?); + expect_test::expect![[r#" +1 CREATED CUSTOM +3 CREATED CUSTOM"#]] + .assert_eq(&cluster.run("select * from rw_table_fragments;").await?); + Ok(()) +} diff --git a/src/utils/futures_util/src/lib.rs b/src/utils/futures_util/src/lib.rs index 4d086951dbb5f..115da2e7676f9 100644 --- a/src/utils/futures_util/src/lib.rs +++ b/src/utils/futures_util/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] - use std::future::Future; use futures::stream::TryStream; diff --git a/src/utils/iter_util/src/lib.rs b/src/utils/iter_util/src/lib.rs index 58758c64a1ce5..92f19a0ee46fc 100644 --- a/src/utils/iter_util/src/lib.rs +++ b/src/utils/iter_util/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#![feature(lint_reasons)] - pub trait ZipEqFast: ExactSizeIterator + Sized where B::IntoIter: ExactSizeIterator, diff --git a/src/utils/local_stats_alloc/src/lib.rs b/src/utils/local_stats_alloc/src/lib.rs index 3950d0cb4931e..94265768815c2 100644 --- a/src/utils/local_stats_alloc/src/lib.rs +++ b/src/utils/local_stats_alloc/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. #![feature(allocator_api)] -#![feature(lint_reasons)] use std::alloc::Allocator; use std::ops::Deref; diff --git a/src/utils/pgwire/src/lib.rs b/src/utils/pgwire/src/lib.rs index 8d1c00541bb95..fae5489e81097 100644 --- a/src/utils/pgwire/src/lib.rs +++ b/src/utils/pgwire/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(lint_reasons)] #![feature(trait_alias)] #![feature(iterator_try_collect)] #![feature(trusted_len)]
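
The central behavioral change in the barrier-manager portion of this patch is that actors are now spawned when the `InjectBarrier` request that first references them arrives (`send_barrier` forwards `actors_to_build` into `ManagedBarrierState::transform_to_issued`, which calls `StreamActorManager::spawn_actor`), and the resulting `JoinHandle` lives inside `InflightActorState`, so `reset` can force-stop everything via `abort_actors`. Below is a minimal, self-contained sketch of that spawn-on-first-barrier / abort-on-reset lifecycle; it is not the RisingWave implementation, and the names `BarrierWorker`, `ActorState`, and `start_actor` are hypothetical.

// Sketch only: spawn an actor task when its first barrier is injected, keep the
// JoinHandle in the per-actor state, and abort + await all handles on reset.
use std::collections::HashMap;

use tokio::task::JoinHandle;

type ActorId = u32;

struct ActorState {
    // Handle of the spawned actor task; kept so reset() can force-stop it.
    join_handle: JoinHandle<()>,
}

#[derive(Default)]
struct BarrierWorker {
    actor_states: HashMap<ActorId, ActorState>,
}

impl BarrierWorker {
    /// Called when an inject-barrier request carries an actor to build:
    /// the actor is spawned here, not via a separate "update/build actors" RPC.
    fn start_actor(&mut self, actor_id: ActorId) {
        let join_handle = tokio::spawn(async move {
            // Placeholder for the real actor loop.
            std::future::pending::<()>().await;
        });
        let prev = self.actor_states.insert(actor_id, ActorState { join_handle });
        assert!(prev.is_none(), "actor {actor_id} started twice");
    }

    /// Force-stop all actors, mirroring the abort-then-join shape of the patch.
    async fn reset(&mut self) {
        for state in self.actor_states.values() {
            state.join_handle.abort();
        }
        for (_, state) in self.actor_states.drain() {
            let result = state.join_handle.await;
            assert!(result.is_ok() || result.unwrap_err().is_cancelled());
        }
    }
}

#[tokio::main]
async fn main() {
    let mut worker = BarrierWorker::default();
    worker.start_actor(233);
    worker.start_actor(234);
    worker.reset().await;
    assert!(worker.actor_states.is_empty());
}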
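
The new `shared_source` simulation test checks, after every reschedule, that each `SourceBackfill` actor is assigned exactly the same splits as its single NoShuffle upstream `Source` actor. The following is a simplified, standalone sketch of that invariant check only; plain `u32` actor ids and comma-joined split-id strings stand in for the real cluster metadata returned by `list_source_splits`, and `validate_splits_aligned` here is a stripped-down stand-in, not the test's implementation.

// Sketch only: assert split alignment between backfill actors and their upstream source actors.
use std::collections::BTreeMap;

type ActorId = u32;

fn validate_splits_aligned(
    // backfill actor -> its upstream actors (must be exactly one under NoShuffle)
    actor_upstream: &[(ActorId, Vec<ActorId>)],
    // actor -> comma-joined split ids, as listed from the cluster info
    actor_splits: &BTreeMap<ActorId, String>,
) {
    for (actor, upstream) in actor_upstream {
        assert_eq!(upstream.len(), 1, "invalid upstream: {upstream:?}");
        assert_eq!(
            actor_splits.get(actor),
            actor_splits.get(&upstream[0]),
            "splits of backfill actor {actor} diverge from its upstream source actor",
        );
    }
}

fn main() {
    let actor_upstream = vec![(5, vec![2]), (6, vec![3])];
    let actor_splits = BTreeMap::from([
        (2, "split-0,split-1".to_string()),
        (3, "split-2,split-3".to_string()),
        (5, "split-0,split-1".to_string()),
        (6, "split-2,split-3".to_string()),
    ]);
    validate_splits_aligned(&actor_upstream, &actor_splits);
}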