diff --git a/Cargo.lock b/Cargo.lock index 98d01dff60589..640a843a83a1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -991,7 +991,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "136d4d23bcc79e27423727b36823d86233aad06dfea531837b038394d11e9928" dependencies = [ "concurrent-queue", - "event-listener 5.2.0", + "event-listener 5.3.1", "event-listener-strategy", "futures-core", "pin-project-lite", @@ -1907,6 +1907,29 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum-extra" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0be6ea09c9b96cb5076af0de2e383bd2bc0c18f827cf1967bdd353e0b910d733" +dependencies = [ + "axum 0.7.4", + "axum-core 0.4.3", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "serde", + "serde_html_form", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "backon" version = "0.4.4" @@ -2345,9 +2368,9 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" @@ -2418,9 +2441,9 @@ checksum = "981520c98f422fcc584dc1a95c334e6953900b9106bc47a9839b81790009eb21" [[package]] name = "camino" -version = "1.1.6" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3" dependencies = [ "serde", ] @@ -2498,9 +2521,9 @@ checksum = "1582e1c9e755dd6ad6b224dcffb135d199399a4568d454bd89fe515ca8425695" [[package]] name = "cargo-platform" -version = "0.1.3" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cfa25e60aea747ec7e1124f238816749faa93759c6ff5b31f1ccdda137f4479" +checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc" dependencies = [ "serde", ] @@ -2852,9 +2875,9 @@ dependencies = [ [[package]] name = "concurrent-queue" -version = "2.2.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" dependencies = [ "crossbeam-utils", ] @@ -4568,9 +4591,9 @@ checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" [[package]] name = "event-listener" -version = "5.2.0" +version = "5.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b5fb89194fa3cad959b833185b3063ba881dbfc7030680b314250779fb4cc91" +checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" dependencies = [ "concurrent-queue", "parking", @@ -4583,7 +4606,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "332f51cb23d20b0de8458b86580878211da09bcd4503cb579c225b3d124cabb3" dependencies = [ - "event-listener 5.2.0", + "event-listener 5.3.1", "pin-project-lite", ] @@ -6004,8 +6027,7 @@ dependencies = [ [[package]] name = "iceberg" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "651dfca7c429918e164607a549287cfdd1e7814d2e4cb577d0d6dc57fe19b785" +source = 
"git+https://github.com/risingwavelabs/iceberg-rust.git?rev=84bf51c9d0d5886e4ee306ca4f383f029e1767a4#84bf51c9d0d5886e4ee306ca4f383f029e1767a4" dependencies = [ "anyhow", "apache-avro 0.17.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -6025,11 +6047,13 @@ dependencies = [ "fnv", "futures", "itertools 0.13.0", + "moka", "murmur3", "once_cell", "opendal 0.49.0", "ordered-float 4.1.1", "parquet 52.0.0", + "paste", "reqwest 0.12.4", "rust_decimal", "serde", @@ -6039,7 +6063,7 @@ dependencies = [ "serde_repr", "serde_with 3.8.0", "tokio", - "typed-builder 0.19.1", + "typed-builder 0.20.0", "url", "uuid", ] @@ -6047,8 +6071,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-glue" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ef7c992442a80c46975e08f3862140ca3e1c1c772aa68baaf65bb08f97ff07" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=84bf51c9d0d5886e4ee306ca4f383f029e1767a4#84bf51c9d0d5886e4ee306ca4f383f029e1767a4" dependencies = [ "anyhow", "async-trait", @@ -6058,15 +6081,14 @@ dependencies = [ "log", "serde_json", "tokio", - "typed-builder 0.19.1", + "typed-builder 0.20.0", "uuid", ] [[package]] name = "iceberg-catalog-rest" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f351c7b964fa6f3b4f976f8de3f16f1bf84eea8478606aaebdfd6a871d6b082c" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=84bf51c9d0d5886e4ee306ca4f383f029e1767a4#84bf51c9d0d5886e4ee306ca4f383f029e1767a4" dependencies = [ "async-trait", "chrono", @@ -6079,7 +6101,7 @@ dependencies = [ "serde_derive", "serde_json", "tokio", - "typed-builder 0.19.1", + "typed-builder 0.20.0", "uuid", ] @@ -9127,7 +9149,7 @@ checksum = "8bdf592881d821b83d471f8af290226c8d51402259e9bb5be7f9f8bdebbb11ac" dependencies = [ "bytes", "heck 0.4.1", - "itertools 0.10.5", + "itertools 0.11.0", "log", "multimap 0.8.3", "once_cell", @@ -9182,7 +9204,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "265baba7fabd416cf5078179f7d2cbeca4ce7a9041111900675ea7c4cb8a4c32" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.11.0", "proc-macro2", "quote", "syn 2.0.66", @@ -9311,11 +9333,11 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.9.3" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a1a2f1f0a7ecff9c31abbe177637be0e97a0aef46cf8738ece09327985d998" +checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "memchr", "unicase", ] @@ -9368,7 +9390,7 @@ dependencies = [ "indoc", "libc", "memoffset", - "parking_lot 0.11.2", + "parking_lot 0.12.1", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -10455,6 +10477,7 @@ version = "2.1.0-alpha" dependencies = [ "async-trait", "axum 0.7.4", + "axum-extra", "futures", "http 1.1.0", "madsim-tokio", @@ -10463,6 +10486,7 @@ dependencies = [ "risingwave_common", "risingwave_pb", "risingwave_rpc_client", + "serde", "thiserror", "thiserror-ext", "tower", @@ -10614,6 +10638,7 @@ dependencies = [ "easy-ext", "enum-as-inner 0.6.0", "expect-test", + "fs-err", "futures", "futures-async-stream", "gcp-bigquery-client", @@ -10652,11 +10677,8 @@ dependencies = [ "pretty_assertions", "prometheus", "prost 0.13.1", - "prost-build 0.12.1", "prost-reflect", "prost-types 0.13.1", - "protobuf-native", - "protobuf-src", "pulsar", "quote", "rand", @@ -10718,11 +10740,19 @@ dependencies = [ 
"chrono", "easy-ext", "expect-test", + "fs-err", "hex", "itertools 0.12.1", "jsonbb", "jsonschema-transpiler", + "madsim-tokio", "num-bigint", + "prost 0.13.1", + "prost-build 0.12.1", + "prost-reflect", + "prost-types 0.13.1", + "protobuf-native", + "protobuf-src", "risingwave_common", "risingwave_pb", "rust_decimal", @@ -12758,6 +12788,19 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "serde_html_form" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de514ef58196f1fc96dcaef80fe6170a1ce6215df9687a93fe8300e773fefc5" +dependencies = [ + "form_urlencoded", + "indexmap 2.2.6", + "itoa", + "ryu", + "serde", +] + [[package]] name = "serde_json" version = "1.0.125" @@ -14833,6 +14876,15 @@ dependencies = [ "typed-builder-macro 0.19.1", ] +[[package]] +name = "typed-builder" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e14ed59dc8b7b26cacb2a92bad2e8b1f098806063898ab42a3bd121d7d45e75" +dependencies = [ + "typed-builder-macro 0.20.0", +] + [[package]] name = "typed-builder-macro" version = "0.16.2" @@ -14866,6 +14918,17 @@ dependencies = [ "syn 2.0.66", ] +[[package]] +name = "typed-builder-macro" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "typenum" version = "1.16.0" diff --git a/Cargo.toml b/Cargo.toml index 46ab2695a4ebb..c34b414ad0a7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -122,6 +122,7 @@ aws-smithy-types = { version = "1", default-features = false, features = [ aws-endpoint = "0.60" aws-types = "1" axum = "=0.7.4" # TODO: 0.7.5+ does not work with current toolchain +axum-extra = "0.9" etcd-client = { package = "madsim-etcd-client", version = "0.6" } futures-async-stream = "0.2.9" hytra = "0.1" @@ -143,9 +144,10 @@ arrow-array-iceberg = { package = "arrow-array", version = "52" } arrow-schema-iceberg = { package = "arrow-schema", version = "52" } arrow-buffer-iceberg = { package = "arrow-buffer", version = "52" } arrow-cast-iceberg = { package = "arrow-cast", version = "52" } -iceberg = "0.3.0" -iceberg-catalog-rest = "0.3.0" -iceberg-catalog-glue = "0.3.0" +# branch dev +iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "84bf51c9d0d5886e4ee306ca4f383f029e1767a4" } +iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "84bf51c9d0d5886e4ee306ca4f383f029e1767a4" } +iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "84bf51c9d0d5886e4ee306ca4f383f029e1767a4" } opendal = "0.47" arrow-array = "50" arrow-arith = "50" diff --git a/ci/scripts/backwards-compat-test.sh b/ci/scripts/backwards-compat-test.sh index 90a01f57a51cc..c077d3d360eb2 100755 --- a/ci/scripts/backwards-compat-test.sh +++ b/ci/scripts/backwards-compat-test.sh @@ -79,6 +79,8 @@ ENABLE_BUILD_RUST=$ENABLE_BUILD # Use target/debug for simplicity. 
ENABLE_RELEASE_PROFILE=false +ENABLE_PYTHON_UDF=true +ENABLE_JS_UDF=true EOF # See https://github.com/risingwavelabs/risingwave/pull/15448 @@ -100,6 +102,10 @@ setup_old_cluster() { set -e echo "Failed to download ${OLD_VERSION} from github releases, build from source later during \`risedev d\`" configure_rw "$OLD_VERSION" true + elif [[ $OLD_VERSION = '1.10.1' || $OLD_VERSION = '1.10.0' ]]; then + set -e + echo "1.10.x has older openssl version, build from source later during \`risedev d\`" + configure_rw "$OLD_VERSION" true else set -e tar -xvf risingwave-v"${OLD_VERSION}"-x86_64-unknown-linux.tar.gz diff --git a/ci/scripts/build.sh b/ci/scripts/build.sh index f595bede10d24..b6a6a0fe3b542 100755 --- a/ci/scripts/build.sh +++ b/ci/scripts/build.sh @@ -60,7 +60,7 @@ cargo build \ --timings -artifacts=(risingwave sqlsmith compaction-test risingwave_regress_test risingwave_e2e_extended_mode_test risedev-dev delete-range-test) +artifacts=(risingwave sqlsmith compaction-test risingwave_regress_test risingwave_e2e_extended_mode_test risedev-dev) echo "--- Show link info" ldd target/"$profile"/risingwave diff --git a/ci/scripts/e2e-cassandra-sink-test.sh b/ci/scripts/e2e-cassandra-sink-test.sh index 0e1c9a98d49e8..678b97aac4b54 100755 --- a/ci/scripts/e2e-cassandra-sink-test.sh +++ b/ci/scripts/e2e-cassandra-sink-test.sh @@ -41,17 +41,24 @@ wget $(get_latest_cassandra_download_url) -O cassandra_latest.tar.gz tar xfvz cassandra_latest.tar.gz export LATEST_CASSANDRA_VERSION=$(get_latest_cassandra_version) export CASSANDRA_DIR="./apache-cassandra-${LATEST_CASSANDRA_VERSION}" -# remove bundled packages, and use installed packages, because Python 3.12 has removed asyncore, but I failed to install libev support for bundled Python driver. -rm ${CASSANDRA_DIR}/lib/six-1.12.0-py2.py3-none-any.zip -rm ${CASSANDRA_DIR}/lib/cassandra-driver-internal-only-3.25.0.zip -apt-get install -y libev4 libev-dev -pip3 install --break-system-packages cassandra-driver + +# Cassandra only support python 3.11 +apt-get install -y software-properties-common +add-apt-repository ppa:deadsnakes/ppa +apt-get update +apt-get install -y python3.11 +apt-get install -y python3.11-venv +python3.11 -m venv cqlsh_env +source cqlsh_env/bin/activate + export CQLSH_HOST=cassandra-server export CQLSH_PORT=9042 echo "--- testing sinks" sqllogictest -p 4566 -d dev './e2e_test/sink/cassandra_sink.slt' +deactivate + echo "--- Kill cluster" cd ../../ risedev ci-kill \ No newline at end of file diff --git a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index 1a46f30682bdd..c039c625aa213 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -46,6 +46,7 @@ poetry run python main.py -t ./test_case/range_partition_append_only.toml poetry run python main.py -t ./test_case/range_partition_upsert.toml poetry run python main.py -t ./test_case/append_only_with_checkpoint_interval.toml poetry run python main.py -t ./test_case/iceberg_select_empty_table.toml +poetry run python main.py -t ./test_case/iceberg_source_eq_delete.toml echo "--- Kill cluster" diff --git a/ci/scripts/e2e-source-test.sh b/ci/scripts/e2e-source-test.sh index 29f2a0ac7b5ce..6bf2f8a491576 100755 --- a/ci/scripts/e2e-source-test.sh +++ b/ci/scripts/e2e-source-test.sh @@ -45,7 +45,6 @@ risedev ci-kill echo "--- Prepare data" cp src/connector/src/test_data/simple-schema.avsc ./avro-simple-schema.avsc cp src/connector/src/test_data/complex-schema.avsc ./avro-complex-schema.avsc -cp 
src/connector/src/test_data/complex-schema ./proto-complex-schema cp src/connector/src/test_data/complex-schema.json ./json-complex-schema diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index 1c2083e1a0d4d..d35803e02d758 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -712,6 +712,8 @@ steps: run: source-test-env config: ci/docker-compose.yml mount-buildkite-agent: true + environment: + - BUILDKITE_BRANCH - ./ci/plugins/upload-failure-logs matrix: setup: diff --git a/ci/workflows/pull-request.yml b/ci/workflows/pull-request.yml index c76589e95afd0..151d5ce6ec057 100644 --- a/ci/workflows/pull-request.yml +++ b/ci/workflows/pull-request.yml @@ -711,6 +711,8 @@ steps: run: source-test-env config: ci/docker-compose.yml mount-buildkite-agent: true + environment: + - BUILDKITE_BRANCH - ./ci/plugins/upload-failure-logs matrix: setup: diff --git a/dashboard/package-lock.json b/dashboard/package-lock.json index c06e209600477..496093d8c2fe4 100644 --- a/dashboard/package-lock.json +++ b/dashboard/package-lock.json @@ -54,7 +54,7 @@ "eslint-plugin-n": "^15.2.5", "eslint-plugin-promise": "^6.0.1", "eslint-plugin-react": "^7.31.6", - "express": "^4.19.2", + "express": "^4.20.0", "prettier": "^2.7.1", "prettier-plugin-organize-imports": "^3.1.1", "typescript": "5.4.2" @@ -3792,9 +3792,9 @@ } }, "node_modules/body-parser": { - "version": "1.20.2", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", - "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", + "version": "1.20.3", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", + "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", "dev": true, "dependencies": { "bytes": "3.1.2", @@ -3805,7 +3805,7 @@ "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.11.0", + "qs": "6.13.0", "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" @@ -3842,6 +3842,21 @@ "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true }, + "node_modules/body-parser/node_modules/qs": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", + "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "dev": true, + "dependencies": { + "side-channel": "^1.0.6" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/bootstrap-icons": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/bootstrap-icons/-/bootstrap-icons-1.9.1.tgz", @@ -3975,14 +3990,19 @@ } }, "node_modules/call-bind": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.5.tgz", - "integrity": "sha512-C3nQxfFZxFRVoJoGKKI8y3MOEo129NQ+FgQ08iye+Mk4zNZZGdjfs06bVTr+DBSlA66Q2VEcMki/cUCP4SercQ==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", "dev": true, "dependencies": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.1", - "set-function-length": "^1.1.1" + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { 
"url": "https://github.com/sponsors/ljharb" @@ -4874,17 +4894,20 @@ } }, "node_modules/define-data-property": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.1.tgz", - "integrity": "sha512-E7uGkTzkk1d0ByLeSc6ZsFS79Axg+m1P/VsgYsxHgiuc3tFSj+MjMIwe90FC4lOAZzNBdY7kkO2P2wKdsQ1vgQ==", + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", "dev": true, "dependencies": { - "get-intrinsic": "^1.2.1", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.0" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" }, "engines": { "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, "node_modules/define-lazy-prop": { @@ -5166,6 +5189,27 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "dev": true, + "dependencies": { + "get-intrinsic": "^1.2.4" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es-iterator-helpers": { "version": "1.0.15", "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.0.15.tgz", @@ -6120,37 +6164,37 @@ } }, "node_modules/express": { - "version": "4.19.2", - "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", - "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", + "version": "4.20.0", + "resolved": "https://registry.npmjs.org/express/-/express-4.20.0.tgz", + "integrity": "sha512-pLdae7I6QqShF5PnNTCVn4hI91Dx0Grkn2+IAsMTgMIKuQVte2dN9PeGSSAME2FR8anOhVA62QDIUaWVfEXVLw==", "dev": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.2", + "body-parser": "1.20.3", "content-disposition": "0.5.4", "content-type": "~1.0.4", "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", "finalhandler": "1.2.0", "fresh": "0.5.2", "http-errors": "2.0.0", - "merge-descriptors": "1.0.1", + "merge-descriptors": "1.0.3", "methods": "~1.1.2", "on-finished": "2.4.1", "parseurl": "~1.3.3", - "path-to-regexp": "0.1.7", + "path-to-regexp": "0.1.10", "proxy-addr": "~2.0.7", "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", - "send": "0.18.0", - "serve-static": "1.15.0", + "send": "0.19.0", + "serve-static": "1.16.0", "setprototypeof": "1.2.0", "statuses": "2.0.1", "type-is": "~1.6.18", @@ -6170,6 +6214,15 @@ "ms": "2.0.0" } }, + "node_modules/express/node_modules/encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true, + "engines": { + "node": ">= 0.8" + } + }, 
"node_modules/express/node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", @@ -6643,16 +6696,20 @@ } }, "node_modules/get-intrinsic": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.2.tgz", - "integrity": "sha512-0gSo4ml/0j98Y3lngkFEot/zhiCeWsbYIlZ+uZOVgzLyLaUw7wxUL+nCTP0XJvJg1AXulJRI3UJi8GsbDuxdGA==", + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", "dev": true, "dependencies": { + "es-errors": "^1.3.0", "function-bind": "^1.1.2", "has-proto": "^1.0.1", "has-symbols": "^1.0.3", "hasown": "^2.0.0" }, + "engines": { + "node": ">= 0.4" + }, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -6833,12 +6890,12 @@ } }, "node_modules/has-property-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.1.tgz", - "integrity": "sha512-VsX8eaIewvas0xnvinAe9bw4WfIeODpGYikiWYLH+dma0Jw6KHYqWiWfhQlgOVK8D6PvjubK5Uc4P0iIhIcNVg==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "dev": true, "dependencies": { - "get-intrinsic": "^1.2.2" + "es-define-property": "^1.0.0" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -8320,10 +8377,13 @@ } }, "node_modules/merge-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", - "integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==", - "dev": true + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", + "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } }, "node_modules/merge-stream": { "version": "2.0.0", @@ -8720,10 +8780,13 @@ } }, "node_modules/object-inspect": { - "version": "1.12.3", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.3.tgz", - "integrity": "sha512-geUvdk7c+eizMNUDkRpW1wJwgfOiOeHbxBR/hLXK1aT6zmVSO0jsQcs7fj6MGw89jC/cjGfLcNOrtMYtGqm81g==", + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", + "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==", "dev": true, + "engines": { + "node": ">= 0.4" + }, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -9056,9 +9119,9 @@ } }, "node_modules/path-to-regexp": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", - "integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==", + "version": "0.1.10", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", + "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==", "dev": true }, "node_modules/path-type": { @@ -10076,9 +10139,9 @@ } }, "node_modules/send": { - "version": "0.18.0", - "resolved": 
"https://registry.npmjs.org/send/-/send-0.18.0.tgz", - "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "version": "0.19.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", + "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", "dev": true, "dependencies": { "debug": "2.6.9", @@ -10121,9 +10184,9 @@ "dev": true }, "node_modules/serve-static": { - "version": "1.15.0", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz", - "integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==", + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.0.tgz", + "integrity": "sha512-pDLK8zwl2eKaYrs8mrPZBJua4hMplRWJ1tIFksVC3FtBEBnl8dxgeHtsaMS8DhS9i4fLObaon6ABoc4/hQGdPA==", "dev": true, "dependencies": { "encodeurl": "~1.0.2", @@ -10135,6 +10198,51 @@ "node": ">= 0.8.0" } }, + "node_modules/serve-static/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/serve-static/node_modules/debug/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + }, + "node_modules/serve-static/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true + }, + "node_modules/serve-static/node_modules/send": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", + "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "dev": true, + "dependencies": { + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "2.0.0", + "mime": "1.6.0", + "ms": "2.1.3", + "on-finished": "2.4.1", + "range-parser": "~1.2.1", + "statuses": "2.0.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, "node_modules/set-blocking": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", @@ -10142,16 +10250,17 @@ "optional": true }, "node_modules/set-function-length": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.0.tgz", - "integrity": "sha512-4DBHDoyHlM1IRPGYcoxexgh67y4ueR53FKV1yyxwFMY7aCqcN/38M1+SwZ/qJQ8iLv7+ck385ot4CcisOAPT9w==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", "dev": true, "dependencies": { - "define-data-property": "^1.1.1", + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.2", + "get-intrinsic": "^1.2.4", "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.1" + "has-property-descriptors": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ 
-10209,14 +10318,18 @@ } }, "node_modules/side-channel": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", - "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", "dev": true, "dependencies": { - "call-bind": "^1.0.0", - "get-intrinsic": "^1.0.2", - "object-inspect": "^1.9.0" + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -14446,9 +14559,9 @@ "dev": true }, "body-parser": { - "version": "1.20.2", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", - "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", + "version": "1.20.3", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", + "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", "dev": true, "requires": { "bytes": "3.1.2", @@ -14459,7 +14572,7 @@ "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.11.0", + "qs": "6.13.0", "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" @@ -14488,6 +14601,15 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true + }, + "qs": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", + "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "dev": true, + "requires": { + "side-channel": "^1.0.6" + } } } }, @@ -14586,14 +14708,16 @@ "dev": true }, "call-bind": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.5.tgz", - "integrity": "sha512-C3nQxfFZxFRVoJoGKKI8y3MOEo129NQ+FgQ08iye+Mk4zNZZGdjfs06bVTr+DBSlA66Q2VEcMki/cUCP4SercQ==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", "dev": true, "requires": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.1", - "set-function-length": "^1.1.1" + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" } }, "callsites": { @@ -15255,14 +15379,14 @@ } }, "define-data-property": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.1.tgz", - "integrity": "sha512-E7uGkTzkk1d0ByLeSc6ZsFS79Axg+m1P/VsgYsxHgiuc3tFSj+MjMIwe90FC4lOAZzNBdY7kkO2P2wKdsQ1vgQ==", + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", "dev": true, "requires": { - "get-intrinsic": "^1.2.1", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.0" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" } }, "define-lazy-prop": { @@ -15488,6 +15612,21 @@ 
"which-typed-array": "^1.1.10" } }, + "es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "dev": true, + "requires": { + "get-intrinsic": "^1.2.4" + } + }, + "es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true + }, "es-iterator-helpers": { "version": "1.0.15", "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.0.15.tgz", @@ -16174,37 +16313,37 @@ } }, "express": { - "version": "4.19.2", - "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", - "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", + "version": "4.20.0", + "resolved": "https://registry.npmjs.org/express/-/express-4.20.0.tgz", + "integrity": "sha512-pLdae7I6QqShF5PnNTCVn4hI91Dx0Grkn2+IAsMTgMIKuQVte2dN9PeGSSAME2FR8anOhVA62QDIUaWVfEXVLw==", "dev": true, "requires": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.2", + "body-parser": "1.20.3", "content-disposition": "0.5.4", "content-type": "~1.0.4", "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", "finalhandler": "1.2.0", "fresh": "0.5.2", "http-errors": "2.0.0", - "merge-descriptors": "1.0.1", + "merge-descriptors": "1.0.3", "methods": "~1.1.2", "on-finished": "2.4.1", "parseurl": "~1.3.3", - "path-to-regexp": "0.1.7", + "path-to-regexp": "0.1.10", "proxy-addr": "~2.0.7", "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", - "send": "0.18.0", - "serve-static": "1.15.0", + "send": "0.19.0", + "serve-static": "1.16.0", "setprototypeof": "1.2.0", "statuses": "2.0.1", "type-is": "~1.6.18", @@ -16221,6 +16360,12 @@ "ms": "2.0.0" } }, + "encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true + }, "ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", @@ -16602,11 +16747,12 @@ "peer": true }, "get-intrinsic": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.2.tgz", - "integrity": "sha512-0gSo4ml/0j98Y3lngkFEot/zhiCeWsbYIlZ+uZOVgzLyLaUw7wxUL+nCTP0XJvJg1AXulJRI3UJi8GsbDuxdGA==", + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", "dev": true, "requires": { + "es-errors": "^1.3.0", "function-bind": "^1.1.2", "has-proto": "^1.0.1", "has-symbols": "^1.0.3", @@ -16735,12 +16881,12 @@ "integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==" }, "has-property-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.1.tgz", - "integrity": "sha512-VsX8eaIewvas0xnvinAe9bw4WfIeODpGYikiWYLH+dma0Jw6KHYqWiWfhQlgOVK8D6PvjubK5Uc4P0iIhIcNVg==", + "version": "1.0.2", + "resolved": 
"https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "dev": true, "requires": { - "get-intrinsic": "^1.2.2" + "es-define-property": "^1.0.0" } }, "has-proto": { @@ -17803,9 +17949,9 @@ "dev": true }, "merge-descriptors": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", - "integrity": "sha512-cCi6g3/Zr1iqQi6ySbseM1Xvooa98N0w31jzUYrXPX2xqObmFGHJ0tQ5u74H3mVh7wLouTseZyYIq39g8cNp1w==", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", + "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", "dev": true }, "merge-stream": { @@ -18081,9 +18227,9 @@ "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==" }, "object-inspect": { - "version": "1.12.3", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.3.tgz", - "integrity": "sha512-geUvdk7c+eizMNUDkRpW1wJwgfOiOeHbxBR/hLXK1aT6zmVSO0jsQcs7fj6MGw89jC/cjGfLcNOrtMYtGqm81g==", + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", + "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==", "dev": true }, "object-keys": { @@ -18322,9 +18468,9 @@ } }, "path-to-regexp": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz", - "integrity": "sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==", + "version": "0.1.10", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", + "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==", "dev": true }, "path-type": { @@ -19040,9 +19186,9 @@ "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" }, "send": { - "version": "0.18.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", - "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "version": "0.19.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", + "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", "dev": true, "requires": { "debug": "2.6.9", @@ -19086,15 +19232,61 @@ } }, "serve-static": { - "version": "1.15.0", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.15.0.tgz", - "integrity": "sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==", + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.0.tgz", + "integrity": "sha512-pDLK8zwl2eKaYrs8mrPZBJua4hMplRWJ1tIFksVC3FtBEBnl8dxgeHtsaMS8DhS9i4fLObaon6ABoc4/hQGdPA==", "dev": true, "requires": { "encodeurl": "~1.0.2", "escape-html": "~1.0.3", "parseurl": "~1.3.3", "send": "0.18.0" + }, + "dependencies": { + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + }, + "dependencies": { + "ms": 
{ + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + } + } + }, + "ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true + }, + "send": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/send/-/send-0.18.0.tgz", + "integrity": "sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==", + "dev": true, + "requires": { + "debug": "2.6.9", + "depd": "2.0.0", + "destroy": "1.2.0", + "encodeurl": "~1.0.2", + "escape-html": "~1.0.3", + "etag": "~1.8.1", + "fresh": "0.5.2", + "http-errors": "2.0.0", + "mime": "1.6.0", + "ms": "2.1.3", + "on-finished": "2.4.1", + "range-parser": "~1.2.1", + "statuses": "2.0.1" + } + } } }, "set-blocking": { @@ -19104,16 +19296,17 @@ "optional": true }, "set-function-length": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.0.tgz", - "integrity": "sha512-4DBHDoyHlM1IRPGYcoxexgh67y4ueR53FKV1yyxwFMY7aCqcN/38M1+SwZ/qJQ8iLv7+ck385ot4CcisOAPT9w==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", "dev": true, "requires": { - "define-data-property": "^1.1.1", + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.2", + "get-intrinsic": "^1.2.4", "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.1" + "has-property-descriptors": "^1.0.2" } }, "set-function-name": { @@ -19159,14 +19352,15 @@ "dev": true }, "side-channel": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.4.tgz", - "integrity": "sha512-q5XPytqFEIKHkGdiMIrY10mvLRvnQh42/+GoBlFW3b2LXLE2xxJpZFdm94we0BaoV3RwJyGqg5wS7epxTv0Zvw==", + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", "dev": true, "requires": { - "call-bind": "^1.0.0", - "get-intrinsic": "^1.0.2", - "object-inspect": "^1.9.0" + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" } }, "signal-exit": { diff --git a/dashboard/package.json b/dashboard/package.json index a3716f7802ccf..71621e4159f5d 100644 --- a/dashboard/package.json +++ b/dashboard/package.json @@ -61,7 +61,7 @@ "eslint-plugin-n": "^15.2.5", "eslint-plugin-promise": "^6.0.1", "eslint-plugin-react": "^7.31.6", - "express": "^4.19.2", + "express": "^4.20.0", "prettier": "^2.7.1", "prettier-plugin-organize-imports": "^3.1.1", "typescript": "5.4.2" diff --git a/e2e_test/backwards-compat-tests/scripts/utils.sh b/e2e_test/backwards-compat-tests/scripts/utils.sh index 8f41dad0860f1..3f99565b9870e 100644 --- a/e2e_test/backwards-compat-tests/scripts/utils.sh +++ b/e2e_test/backwards-compat-tests/scripts/utils.sh @@ -69,7 +69,7 @@ check_version() { local VERSION=$1 local raw_version=$(run_sql "SELECT version();") echo "--- Version" - echo "$raw_version" + echo "raw_version: $raw_version" local version=$(echo $raw_version | grep -i risingwave | sed 
's/^.*risingwave-\([0-9]*\.[0-9]*\.[0-9]\).*$/\1/i') if [[ "$version" != "$VERSION" ]]; then echo "Version mismatch, expected $VERSION, got $version" @@ -133,12 +133,24 @@ get_old_version() { # Then we sort them in descending order. echo "--- VERSIONS" - local sorted_versions=$(echo -e "$tags" | sort -t '.' -n) + local sorted_versions=$(echo -e "$tags" | sort -V) echo "$sorted_versions" + # We handle the edge case where the current branch is the one being released. + # If so, we need to prune it from the list. + # We cannot simply use 'git branch --show-current', because buildkite checks out with the commit, + # rather than branch. So the current state is detached. + # Instead we rely on BUILDKITE_BRANCH, provided by buildkite. + local current_branch=$(echo "$BUILDKITE_BRANCH" | tr -d 'v') + echo "--- CURRENT BRANCH: $current_branch" + + echo "--- PRUNED VERSIONS" + local pruned_versions=$(echo -e "$sorted_versions" | grep -v "$current_branch") + echo "$pruned_versions" + # Then we take the Nth latest version. # We set $OLD_VERSION to this. - OLD_VERSION=$(echo -e "$sorted_versions" | tail -n $VERSION_OFFSET | head -1) + OLD_VERSION=$(echo -e "$pruned_versions" | tail -n $VERSION_OFFSET | head -1) } get_new_version() { @@ -182,7 +194,7 @@ seed_old_cluster() { cp -r e2e_test/tpch/* $TEST_DIR/tpch ./risedev clean-data - ./risedev d full-without-monitoring && rm .risingwave/log/* + ENABLE_PYTHON_UDF=1 ENABLE_JS_UDF=1 ./risedev d full-without-monitoring && rm .risingwave/log/* check_version "$OLD_VERSION" @@ -240,7 +252,7 @@ seed_old_cluster() { validate_new_cluster() { echo "--- Start cluster on latest" - ./risedev d full-without-monitoring + ENABLE_PYTHON_UDF=1 ENABLE_JS_UDF=1 ./risedev d full-without-monitoring echo "--- Wait ${RECOVERY_DURATION}s for Recovery on Old Cluster Data" sleep $RECOVERY_DURATION diff --git a/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt b/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt index 0dc937303a852..b0e433c819f83 100644 --- a/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt +++ b/e2e_test/iceberg/test_case/append_only_with_checkpoint_interval.slt @@ -1,6 +1,3 @@ -statement ok -set sink_decouple = false; - statement ok set streaming_parallelism=4; @@ -37,7 +34,6 @@ CREATE SINK sink1 AS select * from mv1 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok @@ -54,7 +50,6 @@ CREATE SINK sink2 AS select * from mv1 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); sleep 20s diff --git a/e2e_test/iceberg/test_case/cdc/load.slt b/e2e_test/iceberg/test_case/cdc/load.slt index df0c319990374..6e6850725f98a 100644 --- a/e2e_test/iceberg/test_case/cdc/load.slt +++ b/e2e_test/iceberg/test_case/cdc/load.slt @@ -1,4 +1,6 @@ # CDC source basic test +statement ok +set sink_decouple = false; statement ok create source mysql_mydb with ( diff --git a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt index 66eb11da1f438..49c4cf3fb1145 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_append_only_table.slt @@ -39,7 +39,7 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', 
- commit_checkpoint_interval = 1 + create_table_if_not_exists = 'true' ); statement ok @@ -63,6 +63,30 @@ FLUSH; sleep 5s +statement ok +CREATE Source iceberg_s WITH ( + connector = 'iceberg', + database.name = 'demo_db', + table.name = 'no_partition_append_only_table', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin' +); + +query ?????????????? rowsort +select * from iceberg_s +---- +1 1 1000 1.1 1.11 1-1 t 2022-03-11 2022-03-11 01:00:00+00:00 2022-03-11 01:00:00 1.11000 {1:100,2:200} {1,2,3} (1,2) +2 2 2000 2.2 2.22 2-2 f 2022-03-12 2022-03-12 02:00:00+00:00 2022-03-12 02:00:00 2.22000 {3:300} {1,NULL,3} (3,) +3 3 3000 3.3 3.33 3-3 t 2022-03-13 2022-03-13 03:00:00+00:00 2022-03-13 03:00:00 99999.99999 NULL NULL NULL +4 4 4000 4.4 4.44 4-4 f 2022-03-14 2022-03-14 04:00:00+00:00 2022-03-14 04:00:00 -99999.99999 NULL NULL NULL +5 5 5000 5.5 5.55 5-5 t 2022-03-15 2022-03-15 05:00:00+00:00 2022-03-15 05:00:00 NULL NULL NULL NULL + + statement ok DROP SINK s6; @@ -71,3 +95,6 @@ DROP MATERIALIZED VIEW mv6; statement ok DROP TABLE t6; + +statement ok +DROP SOURCE iceberg_s; diff --git a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt index de96205a2debf..73d953bc2937a 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_no_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt index 72f0bce46d183..3a27df42903ee 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_partition_append_only_table.slt @@ -36,7 +36,6 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt index 2b213a77175bd..39f170a834382 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt index 46670ac362599..f0cf9f5fa3133 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_append_only_table.slt @@ -36,7 +36,6 @@ CREATE SINK s6 AS select * from mv6 WITH ( s3.region = 'us-east-1', s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', - commit_checkpoint_interval = 1 ); statement ok 
diff --git a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt index 5637ce34c940f..f43e2788a020a 100644 --- a/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt +++ b/e2e_test/iceberg/test_case/iceberg_sink_range_partition_upsert_table.slt @@ -25,7 +25,6 @@ CREATE SINK s6 AS select mv6.id as id, mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, s3.access.key = 'hummockadmin', s3.secret.key = 'hummockadmin', primary_key = 'v1', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/iceberg/test_case/iceberg_source_eq_delete.slt b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.slt new file mode 100644 index 0000000000000..820776fb7e773 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.slt @@ -0,0 +1,113 @@ +statement ok +set sink_decouple = false; + +statement ok +set streaming_parallelism=4; + +statement ok +CREATE TABLE s1 (i1 int, i2 varchar, i3 varchar); + +statement ok +CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM s1; + +statement ok +CREATE SINK sink1 AS select * from mv1 WITH ( + connector = 'iceberg', + type = 'upsert', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + create_table_if_not_exists = 'true', + primary_key = 'i1,i2', +); + +statement ok +insert into s1 values(1,'2','3'); + +statement ok +insert into s1 values(7,'8','9'); + +statement ok +insert into s1 values(4,'5','6'); + +statement ok +flush; + +statement ok +delete from s1 where i1 = 7; + +statement ok +flush; + +sleep 5s + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name = 'demo_db', + table.name = 't1', +); + +query I +select * from iceberg_t1_source order by i1; +---- +1 2 3 +4 5 6 + +query I +select i1,i2,i3 from iceberg_t1_source order by i1; +---- +1 2 3 +4 5 6 + +query I +select i3,i2 from iceberg_t1_source order by i2; +---- +3 2 +6 5 + +query I +select i2,i1 from iceberg_t1_source order by i1; +---- +2 1 +5 4 + +query I +select i1 from iceberg_t1_source order by i1; +---- +1 +4 + +query I +select i2 from iceberg_t1_source order by i2; +---- +2 +5 + +query I +select i3 from iceberg_t1_source order by i3; +---- +3 +6 + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE s1 cascade; diff --git a/e2e_test/iceberg/test_case/iceberg_source_eq_delete.toml b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.toml new file mode 100644 index 0000000000000..6e49ca949f501 --- /dev/null +++ b/e2e_test/iceberg/test_case/iceberg_source_eq_delete.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'test_case/iceberg_source_eq_delete.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] \ No newline at end of file diff --git a/e2e_test/s3/fs_source_v2.py b/e2e_test/s3/fs_source_v2.py index eaef004dd600a..eb9ec69ae3dd5 100644 --- a/e2e_test/s3/fs_source_v2.py +++ b/e2e_test/s3/fs_source_v2.py @@ -69,7 +69,9 @@ def 
_encode(): name TEXT, sex int, mark int, - ) WITH ( + ) + INCLUDE payload as rw_payload + WITH ( connector = 's3', match_pattern = '{prefix}*.{fmt}', s3.region_name = '{config['S3_REGION']}', @@ -105,6 +107,18 @@ def _assert_eq(field, got, expect): _assert_eq('sum(sex)', result[2], total_rows / 2) _assert_eq('sum(mark)', result[3], 0) + # check rw_payload + print('Check rw_payload') + stmt = f"select id, name, sex, mark, rw_payload from {_table()} limit 1;" + cur.execute(stmt) + result = cur.fetchone() + print("Got one line with rw_payload: ", result) + payload = json.loads(result[4]) + _assert_eq('id', payload['id'], result[0]) + _assert_eq('name', payload['name'], result[1]) + _assert_eq('sex', payload['sex'], result[2]) + _assert_eq('mark', payload['mark'], result[3]) + print('Test pass') if need_drop_table: diff --git a/e2e_test/sink/clickhouse_sink.slt b/e2e_test/sink/clickhouse_sink.slt index e037618bb460e..e5bac0d8d521d 100644 --- a/e2e_test/sink/clickhouse_sink.slt +++ b/e2e_test/sink/clickhouse_sink.slt @@ -17,7 +17,6 @@ CREATE SINK s6 AS select mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3, mv6.v4 as v4, clickhouse.password = '', clickhouse.database = 'default', clickhouse.table='demo_test', - commit_checkpoint_interval = 1, ); statement ok diff --git a/e2e_test/sink/create_sink_as.slt b/e2e_test/sink/create_sink_as.slt index 5c66c5623553e..dc6d0f61419c6 100644 --- a/e2e_test/sink/create_sink_as.slt +++ b/e2e_test/sink/create_sink_as.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t4 (v1 int primary key, v2 int); diff --git a/e2e_test/sink/deltalake_rust_sink.slt b/e2e_test/sink/deltalake_rust_sink.slt index 74dca623a9d0a..cb9f9e7817212 100644 --- a/e2e_test/sink/deltalake_rust_sink.slt +++ b/e2e_test/sink/deltalake_rust_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamptz, v9 boolean, v10 decimal, v11 decimal[]); diff --git a/e2e_test/sink/doris_sink.slt b/e2e_test/sink/doris_sink.slt index 3242206badaea..3e6a4aca9d9f6 100644 --- a/e2e_test/sink/doris_sink.slt +++ b/e2e_test/sink/doris_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamp, v9 boolean, v10 jsonb); diff --git a/e2e_test/sink/iceberg_sink.slt b/e2e_test/sink/iceberg_sink.slt index e3917908f651b..b08abd8a4918c 100644 --- a/e2e_test/sink/iceberg_sink.slt +++ b/e2e_test/sink/iceberg_sink.slt @@ -31,7 +31,6 @@ CREATE SINK s6 AS select mv6.v1 as v1, mv6.v2 as v2, mv6.v3 as v3 from mv6 WITH catalog.type = 'storage', database.name='demo_db', table.name='e2e_demo_table', - commit_checkpoint_interval = 1 ); statement ok diff --git a/e2e_test/sink/kafka/protobuf.slt b/e2e_test/sink/kafka/protobuf.slt index 70de91e25c8d7..25b95a49cf1f3 100644 --- a/e2e_test/sink/kafka/protobuf.slt +++ b/e2e_test/sink/kafka/protobuf.slt @@ -4,17 +4,14 @@ set sink_decouple = false; system ok rpk topic create test-rw-sink-append-only-protobuf -system ok -cp src/connector/src/test_data/proto_recursive/recursive.pb ./proto-recursive - statement ok create table from_kafka with ( connector = 'kafka', topic = 'test-rw-sink-append-only-protobuf', properties.bootstrap.server = 'message_queue:29092') format plain encode protobuf ( - schema.location = 'file:///risingwave/proto-recursive', - message = 'recursive.AllTypes'); + 
schema.location = 'file:///risingwave/src/connector/codec/tests/test_data/all-types.pb', + message = 'all_types.AllTypes'); system ok rpk topic create test-rw-sink-append-only-protobuf-csr-a @@ -91,8 +88,8 @@ create sink sink0 from into_kafka with ( properties.bootstrap.server = 'message_queue:29092') format plain encode protobuf ( force_append_only = true, - schema.location = 'file:///risingwave/proto-recursive', - message = 'recursive.AllTypes'); + schema.location = 'file:///risingwave/src/connector/codec/tests/test_data/all-types.pb', + message = 'all_types.AllTypes'); statement ok create sink sink_csr_trivial as select string_field as field_a from into_kafka with ( @@ -121,8 +118,8 @@ create sink sink_upsert from into_kafka with ( properties.bootstrap.server = 'message_queue:29092', primary_key = 'string_field') format upsert encode protobuf ( - schema.location = 'file:///risingwave/proto-recursive', - message = 'recursive.AllTypes'); + schema.location = 'file:///risingwave/src/connector/codec/tests/test_data/all-types.pb', + message = 'all_types.AllTypes'); ---- db error: ERROR: Failed to run the query @@ -140,8 +137,8 @@ create sink sink_upsert from into_kafka with ( properties.bootstrap.server = 'message_queue:29092', primary_key = 'string_field') format upsert encode protobuf ( - schema.location = 'file:///risingwave/proto-recursive', - message = 'recursive.AllTypes') + schema.location = 'file:///risingwave/src/connector/codec/tests/test_data/all-types.pb', + message = 'all_types.AllTypes') key encode text; # Shall be ignored by force_append_only sinks but processed by upsert sinks. @@ -196,7 +193,7 @@ create sink sink_err from into_kafka with ( format plain encode protobuf ( force_append_only = true, schema.location = 'file:///risingwave/proto-recursiv', - message = 'recursive.AllTypes'); + message = 'all_types.AllTypes'); statement error field not in proto create sink sink_err as select 1 as extra_column with ( @@ -205,8 +202,8 @@ create sink sink_err as select 1 as extra_column with ( properties.bootstrap.server = 'message_queue:29092') format plain encode protobuf ( force_append_only = true, - schema.location = 'file:///risingwave/proto-recursive', - message = 'recursive.AllTypes'); + schema.location = 'file:///risingwave/src/connector/codec/tests/test_data/all-types.pb', + message = 'all_types.AllTypes'); statement error s3 URL not supported yet create sink sink_err from into_kafka with ( @@ -215,8 +212,8 @@ create sink sink_err from into_kafka with ( properties.bootstrap.server = 'message_queue:29092') format plain encode protobuf ( force_append_only = true, - schema.location = 's3:///risingwave/proto-recursive', - message = 'recursive.AllTypes'); + schema.location = 's3:///risingwave/src/connector/codec/tests/test_data/all-types.pb', + message = 'all_types.AllTypes'); statement ok drop table from_kafka cascade; diff --git a/e2e_test/sink/license.slt b/e2e_test/sink/license.slt index e38470d1c70d7..6e65b3653a536 100644 --- a/e2e_test/sink/license.slt +++ b/e2e_test/sink/license.slt @@ -7,32 +7,6 @@ ALTER SYSTEM SET license_key TO ''; statement ok CREATE TABLE t (k INT); -statement error -CREATE SINK file_sink -FROM - t -WITH -( - connector = 's3', - s3.region_name = 'us-east-1', - s3.bucket_name = 'test', - s3.path = '', - s3.file_type = 'parquet', - type = 'append-only', - force_append_only='true' -) FORMAT PLAIN ENCODE PARQUET(force_append_only='true'); ----- -db error: ERROR: Failed to run the query - -Caused by these errors (recent errors listed first): - 1: gRPC request 
to meta service failed: Internal error - 2: failed to validate sink - 3: Internal error - 4: feature FileSink is only available for tier Paid and above, while the current tier is Free - -Hint: You may want to set a license key with `ALTER SYSTEM SET license_key = '...';` command. - - statement error CREATE SINK dynamodb_sink FROM diff --git a/e2e_test/sink/mongodb_sink.slt b/e2e_test/sink/mongodb_sink.slt index 2122993e3003a..ddc5a91a20c3f 100644 --- a/e2e_test/sink/mongodb_sink.slt +++ b/e2e_test/sink/mongodb_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t1( a smallint, diff --git a/e2e_test/sink/redis_cluster_sink.slt b/e2e_test/sink/redis_cluster_sink.slt index 03d197485777a..3effd7795d039 100644 --- a/e2e_test/sink/redis_cluster_sink.slt +++ b/e2e_test/sink/redis_cluster_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 int); diff --git a/e2e_test/sink/redis_sink.slt b/e2e_test/sink/redis_sink.slt index 7475a80ae696e..8828c22b80d27 100644 --- a/e2e_test/sink/redis_sink.slt +++ b/e2e_test/sink/redis_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamptz, v9 boolean); diff --git a/e2e_test/sink/remote/types.slt b/e2e_test/sink/remote/types.slt index f2421eabec906..e511d5e6a6ee7 100644 --- a/e2e_test/sink/remote/types.slt +++ b/e2e_test/sink/remote/types.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t5 (v1 smallint primary key, v2 int, v3 bigint, v4 float, v5 double, v6 decimal, v7 varchar, v8 timestamp, v9 boolean); diff --git a/e2e_test/sink/sqlserver_sink.slt b/e2e_test/sink/sqlserver_sink.slt index 156b8b865ffc8..08bbd3364ed9a 100644 --- a/e2e_test/sink/sqlserver_sink.slt +++ b/e2e_test/sink/sqlserver_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok create table t_many_data_type_rw ( k1 int, k2 int, diff --git a/e2e_test/sink/starrocks_sink.slt b/e2e_test/sink/starrocks_sink.slt index dedb01755cbbe..0aceac592618a 100644 --- a/e2e_test/sink/starrocks_sink.slt +++ b/e2e_test/sink/starrocks_sink.slt @@ -1,3 +1,6 @@ +statement ok +set sink_decouple = false; + statement ok CREATE TABLE t6 (v1 int primary key, v2 smallint, v3 bigint, v4 real, v5 float, v6 varchar, v7 date, v8 timestamp, v9 boolean, v10 jsonb, v11 decimal); diff --git a/e2e_test/source/basic/kafka.slt b/e2e_test/source/basic/kafka.slt index 40e9b46036112..227c0aa46bac1 100644 --- a/e2e_test/source/basic/kafka.slt +++ b/e2e_test/source/basic/kafka.slt @@ -187,17 +187,6 @@ create table s10 with ( scan.startup.mode = 'earliest' ) FORMAT PLAIN ENCODE AVRO (schema.location = 'file:///risingwave/avro-complex-schema.avsc', with_deprecated_file_header = true); -statement ok -create table s11 with ( - connector = 'kafka', - topic = 'proto_c_bin', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest') -FORMAT PLAIN ENCODE PROTOBUF ( - message = 'test.User', - schema.location = 'file:///risingwave/proto-complex-schema' -); - statement ok CREATE TABLE s12( id int, @@ -273,17 +262,6 @@ create table s16 (v1 int, v2 varchar) with ( scan.startup.mode = 'latest' ) FORMAT PLAIN ENCODE JSON -statement ok -create source s17 with ( - connector = 'kafka', - topic = 'proto_c_bin', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest') -FORMAT 
PLAIN ENCODE PROTOBUF ( - message = 'test.User', - schema.location = 'file:///risingwave/proto-complex-schema' -); - statement ok create source s18 with ( connector = 'kafka', @@ -498,6 +476,26 @@ FORMAT DEBEZIUM ENCODE JSON ( ignore_key = 'true' ) +statement error INCLUDE payload is only allowed when using ENCODE JSON, but got ENCODE Bytes +CREATE TABLE test_include_payload (a bytea) +INCLUDE payload +WITH ( + connector = 'kafka', + topic = 'kafka_1_partition_topic', + properties.bootstrap.server = 'message_queue:29092', + scan.startup.mode = 'earliest' +) FORMAT PLAIN ENCODE BYTES + +statement ok +CREATE TABLE test_include_payload (v1 int, v2 varchar) +INCLUDE payload +WITH ( + connector = 'kafka', + topic = 'kafka_1_partition_topic', + properties.bootstrap.server = 'message_queue:29092', + scan.startup.mode = 'earliest' +) FORMAT PLAIN ENCODE JSON + statement ok flush; @@ -512,6 +510,13 @@ select v1, v2 from t0; 3 333 4 4444 +query ITT rowsort +select v1, v2, _rw_kafka_payload from test_include_payload; +---- +1 1 {"v1": 1, "v2": "1"} +2 22 {"v1": 2, "v2": "22"} +3 333 {"v1": 3, "v2": "333"} +4 4444 {"v1": 4, "v2": "4444"} query IT rowsort select v1, v2 from s0; @@ -669,11 +674,6 @@ select id, code, timestamp, xfas, contacts, sex from s10; ---- 100 abc 1473305798 {"(0,200,10.0.0.1)","(1,400,10.0.0.2)"} ("{1xxx,2xxx}","{1xxx,2xxx}") MALE -query ITITT -select id, code, timestamp, xfas, contacts, sex from s11; ----- -0 abc 1473305798 {"(0,200,127.0.0.1)","(1,400,127.0.0.2)"} ("{1xxx,2xxx}","{1xxx,2xxx}") MALE - query ITITT select id, code, timestamp, xfas, contacts, jsonb from s12; ---- @@ -703,9 +703,6 @@ select count(*) from s16 statement error Not supported: alter source with schema registry alter source s18 add column v10 int; -statement error Not supported: alter source with schema registry -alter source s17 add column v10 int; - query III rowsort select * from s21; ---- @@ -848,9 +845,6 @@ drop table s9 statement ok drop table s10 -statement ok -drop table s11 - statement ok drop table s12 @@ -866,9 +860,6 @@ drop table s15 statement ok drop table s16 -statement ok -drop source s17 - statement ok drop source s18 @@ -916,3 +907,6 @@ drop table source_with_rdkafka_props; statement ok drop table debezium_ignore_key; + +statement ok +drop table test_include_payload; diff --git a/e2e_test/source/basic/old_row_format_syntax/kafka.slt b/e2e_test/source/basic/old_row_format_syntax/kafka.slt index 1f4c118f30dc5..d67665a049daa 100644 --- a/e2e_test/source/basic/old_row_format_syntax/kafka.slt +++ b/e2e_test/source/basic/old_row_format_syntax/kafka.slt @@ -171,14 +171,6 @@ create table s10 with ( scan.startup.mode = 'earliest' ) row format avro row schema location 'file:///risingwave/avro-complex-schema.avsc' -statement ok -create table s11 with ( - connector = 'kafka', - topic = 'proto_c_bin', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' -) row format protobuf message 'test.User' row schema location 'file:///risingwave/proto-complex-schema' - statement ok CREATE TABLE s12( id int, @@ -254,14 +246,6 @@ create table s16 (v1 int, v2 varchar) with ( scan.startup.mode = 'latest' ) ROW FORMAT JSON -statement ok -create source s17 with ( - connector = 'kafka', - topic = 'proto_c_bin', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' -) row format protobuf message 'test.User' row schema location 'file:///risingwave/proto-complex-schema' - statement error without schema registry create source s18 with ( connector = 
'kafka', @@ -570,11 +554,6 @@ select id, first_name, last_name, email from s8_no_schema_field; # ---- # 100 abc 1473305798 {"(0,200,10.0.0.1)","(1,400,10.0.0.2)"} ("{1xxx,2xxx}","{1xxx,2xxx}") MALE -query ITITT -select id, code, timestamp, xfas, contacts, sex from s11; ----- -0 abc 1473305798 {"(0,200,127.0.0.1)","(1,400,127.0.0.2)"} ("{1xxx,2xxx}","{1xxx,2xxx}") MALE - query ITITT select id, code, timestamp, xfas, contacts, jsonb from s12; ---- @@ -712,9 +691,6 @@ drop table s8_no_schema_field # statement ok # drop table s10 -statement ok -drop table s11 - statement ok drop table s12 @@ -730,9 +706,6 @@ drop table s15 statement ok drop table s16 -statement ok -drop source s17 - # statement ok # drop source s18 diff --git a/e2e_test/source/opendal/posix_fs.slt b/e2e_test/source/opendal/posix_fs.slt index 1bf026aed2744..8eb1ce665590d 100644 --- a/e2e_test/source/opendal/posix_fs.slt +++ b/e2e_test/source/opendal/posix_fs.slt @@ -2,7 +2,7 @@ statement ok SET RW_IMPLICIT_FLUSH TO true; statement ok -CREATE TABLE diamonds_recursive_read ( +CREATE TABLE diamonds ( carat FLOAT, cut TEXT, color TEXT, @@ -11,13 +11,12 @@ CREATE TABLE diamonds_recursive_read ( connector = 'posix_fs', match_pattern = 'data*.csv', posix_fs.root = 'e2e_test/source/opendal/data', - recursive_scan = 'true', ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); sleep 10s query TTTT rowsort -select * from diamonds_recursive_read; +select * from diamonds; ---- 0.22 Premium I 62 0.23 Very Good H 57.5 @@ -30,26 +29,5 @@ select * from diamonds_recursive_read; 1.28 Good J 63.1 1.3 Fair E 64.7 -statement ok -CREATE TABLE diamonds ( - carat FLOAT, - cut TEXT, - color TEXT, - depth FLOAT, -) WITH ( - connector = 'posix_fs', - match_pattern = 'data*.csv', - posix_fs.root = 'e2e_test/source/opendal', -) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); - -sleep 10s - -query TTTT rowsort -select * from diamonds; ----- - statement ok DROP TABLE diamonds; - -statement ok -DROP TABLE diamonds_recursive_read; diff --git a/e2e_test/source_inline/kafka/protobuf/recover.slt b/e2e_test/source_inline/kafka/protobuf/recover.slt new file mode 100644 index 0000000000000..3babf26793f2a --- /dev/null +++ b/e2e_test/source_inline/kafka/protobuf/recover.slt @@ -0,0 +1,97 @@ +control substitution on + +system ok +rpk topic create 'test-pb-struct' + + +system ok +jq -sR '{"schema":.,"schemaType":"PROTOBUF"}' << EOF | curl -X POST -H 'content-type: application/json' -d @- "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value/versions" +syntax = "proto3"; +package test; +message User { + int32 id = 1; + Name name = 2; +} +message Name { + string first_name = 1; + string last_name = 2; +} +EOF + + +# create a source with v1 schema +statement ok +create source s with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test-pb-struct') +format plain encode protobuf ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', + message = 'test.User'); + + +# register a v2 schema +system ok +jq -sR '{"schema":.,"schemaType":"PROTOBUF"}' << EOF | curl -X POST -H 'content-type: application/json' -d @- "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value/versions" +syntax = "proto3"; +package test; +message User { + int32 id = 1; + Name name = 2; +} +message Name { + string first_name = 1; + string last_name = 2; + string middle_name = 3; +} +EOF + + +# trigger recovery +statement ok +recover; + + +sleep 2s + + +# produce a v2 message +statement ok +create sink sk as select + 1 as id, + row('Alan', 'Turing', 
'Mathison')::struct as name +with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test-pb-struct') +format plain encode protobuf ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', + message = 'test.User'); + + +sleep 1s + + +# reading as v1 shall not panic +query IT +select * from s; +---- +1 (Alan,Turing) + + +statement ok +drop sink sk; + + +statement ok +drop source s; + + +system ok +curl -X DELETE "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value" + + +system ok +curl -X DELETE "${RISEDEV_SCHEMA_REGISTRY_URL}/subjects/test-pb-struct-value?permanent=true" + + +system ok +rpk topic delete 'test-pb-struct' diff --git a/e2e_test/time_travel/syntax.slt b/e2e_test/time_travel/syntax.slt index 6c3408a276763..5895f6d9b9e8b 100644 --- a/e2e_test/time_travel/syntax.slt +++ b/e2e_test/time_travel/syntax.slt @@ -7,6 +7,10 @@ SET QUERY_MODE TO local; statement ok CREATE TABLE t (k INT); +query I +SELECT * FROM t; +---- + query error SELECT * FROM t FOR SYSTEM_TIME AS OF 963716300; ---- diff --git a/integration_tests/big-query-sink/create_sink.sql b/integration_tests/big-query-sink/create_sink.sql index a41fe0243120d..01fb5e340d545 100644 --- a/integration_tests/big-query-sink/create_sink.sql +++ b/integration_tests/big-query-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + -- create sink with local file CREATE SINK bhv_big_query_sink FROM diff --git a/integration_tests/cassandra-and-scylladb-sink/create_sink.sql b/integration_tests/cassandra-and-scylladb-sink/create_sink.sql index a0a305aebd0e0..fdda994d01427 100644 --- a/integration_tests/cassandra-and-scylladb-sink/create_sink.sql +++ b/integration_tests/cassandra-and-scylladb-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_cassandra_sink FROM bhv_mv WITH ( diff --git a/integration_tests/clickhouse-sink/create_sink.sql b/integration_tests/clickhouse-sink/create_sink.sql index 5f730ed6ff910..b913a246b286e 100644 --- a/integration_tests/clickhouse-sink/create_sink.sql +++ b/integration_tests/clickhouse-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_clickhouse_sink FROM bhv_mv WITH ( diff --git a/integration_tests/deltalake-sink/create_sink.sql b/integration_tests/deltalake-sink/create_sink.sql index f42b09d726e56..17c1c44aea255 100644 --- a/integration_tests/deltalake-sink/create_sink.sql +++ b/integration_tests/deltalake-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create sink delta_lake_sink from source with ( connector = 'deltalake', diff --git a/integration_tests/doris-sink/create_sink.sql b/integration_tests/doris-sink/create_sink.sql index d4702219fed09..d6b28148c083d 100644 --- a/integration_tests/doris-sink/create_sink.sql +++ b/integration_tests/doris-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create secret doris_secret with (backend = 'meta') as '123456'; CREATE SINK bhv_doris_sink diff --git a/integration_tests/dynamodb/create_sink.sql b/integration_tests/dynamodb/create_sink.sql index 6de71404a9da1..43cb2be6d1447 100644 --- a/integration_tests/dynamodb/create_sink.sql +++ b/integration_tests/dynamodb/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK dyn_sink FROM movies diff --git a/integration_tests/elasticsearch-sink/create_sink.sql b/integration_tests/elasticsearch-sink/create_sink.sql index 07046507d117d..f72f8f0e6ec3b 100644 --- a/integration_tests/elasticsearch-sink/create_sink.sql +++ b/integration_tests/elasticsearch-sink/create_sink.sql @@ -1,3 +1,5 
@@ +set sink_decouple = false; + CREATE SINK bhv_es7_sink FROM bhv_mv WITH ( diff --git a/integration_tests/kafka-cdc-sink/create_sink.sql b/integration_tests/kafka-cdc-sink/create_sink.sql index 349aac0ca9b0a..0c25553adebba 100644 --- a/integration_tests/kafka-cdc-sink/create_sink.sql +++ b/integration_tests/kafka-cdc-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK IF NOT EXISTS counts_sink FROM counts WITH ( diff --git a/integration_tests/mqtt/create_sink.sql b/integration_tests/mqtt/create_sink.sql index 69b6886943944..27b84aa354250 100644 --- a/integration_tests/mqtt/create_sink.sql +++ b/integration_tests/mqtt/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK mqtt_sink FROM personnel diff --git a/integration_tests/mqtt/create_source.sql b/integration_tests/mqtt/create_source.sql index 068d7e0a6cb46..7ebceaa706bcc 100644 --- a/integration_tests/mqtt/create_source.sql +++ b/integration_tests/mqtt/create_source.sql @@ -11,4 +11,5 @@ WITH ( url='tcp://mqtt-server', topic= 'test', qos = 'at_least_once', + max_packet_size = 200000 ) FORMAT PLAIN ENCODE JSON; diff --git a/integration_tests/mysql-sink/create_sink.sql b/integration_tests/mysql-sink/create_sink.sql index 9776360df2914..f73b92e8ce259 100644 --- a/integration_tests/mysql-sink/create_sink.sql +++ b/integration_tests/mysql-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK target_count_mysql_sink FROM target_count WITH ( diff --git a/integration_tests/nats/create_sink.sql b/integration_tests/nats/create_sink.sql index beee01afcecfb..fda1ab1c77621 100644 --- a/integration_tests/nats/create_sink.sql +++ b/integration_tests/nats/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE TABLE personnel (id integer, name varchar); diff --git a/integration_tests/postgres-sink/create_sink.sql b/integration_tests/postgres-sink/create_sink.sql index 5041f1a36b741..ec76f16ac3037 100644 --- a/integration_tests/postgres-sink/create_sink.sql +++ b/integration_tests/postgres-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK target_count_postgres_sink FROM target_count WITH ( diff --git a/integration_tests/redis-sink/create_sink.sql b/integration_tests/redis-sink/create_sink.sql index 61ffb67326227..f88a68aca2110 100644 --- a/integration_tests/redis-sink/create_sink.sql +++ b/integration_tests/redis-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + CREATE SINK bhv_redis_sink_1 FROM bhv_mv WITH ( diff --git a/integration_tests/starrocks-sink/create_sink.sql b/integration_tests/starrocks-sink/create_sink.sql index 8d7ebf98dfb20..7cfe69ef21973 100644 --- a/integration_tests/starrocks-sink/create_sink.sql +++ b/integration_tests/starrocks-sink/create_sink.sql @@ -1,3 +1,5 @@ +set sink_decouple = false; + create secret starrocks_secret with (backend = 'meta') as '123456'; CREATE SINK bhv_starrocks_sink_primary diff --git a/integration_tests/twitter-pulsar/pb/create_source.sql b/integration_tests/twitter-pulsar/pb/create_source.sql index bf41939b40d91..22c4927ab3bb9 100644 --- a/integration_tests/twitter-pulsar/pb/create_source.sql +++ b/integration_tests/twitter-pulsar/pb/create_source.sql @@ -1,5 +1,6 @@ CREATE SOURCE twitter WITH ( connector = 'pulsar', pulsar.topic = 'twitter', - pulsar.service.url = 'pulsar://message_queue:6650' + pulsar.service.url = 'pulsar://message_queue:6650', + subscription.name.prefix = 'custom_prefix' ) ROW FORMAT PROTOBUF MESSAGE 'twitter.schema.Event' ROW SCHEMA LOCATION 
'http://file_server:8080/schema'; \ No newline at end of file diff --git a/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java b/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java index fb8aa62916f60..8ba569c7aea72 100644 --- a/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java +++ b/java/connector-node/risingwave-connector-service/src/main/java/com/risingwave/connector/source/common/DbzConnectorConfig.java @@ -54,6 +54,7 @@ public class DbzConnectorConfig { public static final String PG_PUB_NAME = "publication.name"; public static final String PG_PUB_CREATE = "publication.create.enable"; public static final String PG_SCHEMA_NAME = "schema.name"; + public static final String PG_SSL_ROOT_CERT = "ssl.root.cert"; /* Sql Server configs */ public static final String SQL_SERVER_SCHEMA_NAME = "schema.name"; @@ -211,6 +212,10 @@ public DbzConnectorConfig( LOG.info("Disable table filtering for the shared Postgres source"); dbzProps.remove("table.include.list"); } + + if (userProps.containsKey(PG_SSL_ROOT_CERT)) { + dbzProps.setProperty("database.sslrootcert", userProps.get(PG_SSL_ROOT_CERT)); + } } else if (source == SourceTypeE.CITUS) { var postgresProps = initiateDbConfig(POSTGRES_CONFIG_FILE, substitutor); diff --git a/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties b/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties index 06c4210fcf468..c36b62a7aa531 100644 --- a/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties +++ b/java/connector-node/risingwave-connector-service/src/main/resources/postgres.properties @@ -7,6 +7,7 @@ database.port=${port} database.user=${username} database.password=${password} database.dbname=${database.name} +database.sslmode=${ssl.mode:-prefer} table.include.list=${schema.name}.${table.name} # The name of the PostgreSQL replication slot slot.name=${slot.name} diff --git a/proto/plan_common.proto b/proto/plan_common.proto index bc2e60503f103..0f4e988e6c035 100644 --- a/proto/plan_common.proto +++ b/proto/plan_common.proto @@ -141,6 +141,29 @@ enum JoinType { JOIN_TYPE_RIGHT_ANTI = 8; } +enum AsOfJoinType { + AS_OF_JOIN_TYPE_UNSPECIFIED = 0; + AS_OF_JOIN_TYPE_INNER = 1; + AS_OF_JOIN_TYPE_LEFT_OUTER = 2; +} + +enum AsOfJoinInequalityType { + AS_OF_INEQUALITY_TYPE_UNSPECIFIED = 0; + AS_OF_INEQUALITY_TYPE_GT = 1; + AS_OF_INEQUALITY_TYPE_GE = 2; + AS_OF_INEQUALITY_TYPE_LT = 3; + AS_OF_INEQUALITY_TYPE_LE = 4; +} + +message AsOfJoinDesc { + // The index of the right side's as of column. + uint32 right_idx = 1; + // The index of the left side's as of column. + uint32 left_idx = 2; + // The type of the inequality. 
+ AsOfJoinInequalityType inequality_type = 3; +} + // https://github.com/tokio-rs/prost/issues/80 enum FormatType { FORMAT_TYPE_UNSPECIFIED = 0; @@ -230,6 +253,8 @@ message AdditionalTableName {} message AdditionalCollectionName {} +message AdditionalColumnPayload {} + // this type means we read all headers as a whole message AdditionalColumnHeaders {} @@ -246,6 +271,7 @@ message AdditionalColumn { AdditionalSchemaName schema_name = 9; AdditionalTableName table_name = 10; AdditionalCollectionName collection_name = 11; + AdditionalColumnPayload payload = 12; } } @@ -258,4 +284,5 @@ enum AdditionalColumnType { ADDITIONAL_COLUMN_TYPE_HEADER = 5; ADDITIONAL_COLUMN_TYPE_FILENAME = 6; ADDITIONAL_COLUMN_TYPE_NORMAL = 7; + ADDITIONAL_COLUMN_TYPE_PAYLOAD = 8; } diff --git a/proto/stream_plan.proto b/proto/stream_plan.proto index a96f54818146e..ca67737aeafe0 100644 --- a/proto/stream_plan.proto +++ b/proto/stream_plan.proto @@ -455,6 +455,32 @@ message HashJoinNode { bool is_append_only = 14; } +message AsOfJoinNode { + plan_common.AsOfJoinType join_type = 1; + repeated int32 left_key = 2; + repeated int32 right_key = 3; + // Used for internal table states. + catalog.Table left_table = 4; + // Used for internal table states. + catalog.Table right_table = 5; + // Used for internal table states. + catalog.Table left_degree_table = 6; + // Used for internal table states. + catalog.Table right_degree_table = 7; + // The output indices of current node + repeated uint32 output_indices = 8; + // Left deduped input pk indices. The pk of the left_table and + // The pk of the left_table is [left_join_key | left_inequality_key | left_deduped_input_pk_indices] + // left_inequality_key is not used but for forward compatibility. + repeated uint32 left_deduped_input_pk_indices = 9; + // Right deduped input pk indices. + // The pk of the right_table is [right_join_key | right_inequality_key | right_deduped_input_pk_indices] + // right_inequality_key is not used but for forward compatibility. 
+ repeated uint32 right_deduped_input_pk_indices = 10; + repeated bool null_safe = 11; + optional plan_common.AsOfJoinDesc asof_desc = 12; +} + message TemporalJoinNode { plan_common.JoinType join_type = 1; repeated int32 left_key = 2; diff --git a/proto/stream_service.proto b/proto/stream_service.proto index ad52b0148856a..c13ee8875b43f 100644 --- a/proto/stream_service.proto +++ b/proto/stream_service.proto @@ -9,14 +9,6 @@ import "stream_plan.proto"; option java_package = "com.risingwave.proto"; option optimize_for = SPEED; -message BuildActorInfo { - stream_plan.StreamActor actor = 1; - message SubscriptionIds { - repeated uint32 subscription_ids = 1; - } - map related_subscriptions = 2; -} - message InjectBarrierRequest { string request_id = 1; stream_plan.Barrier barrier = 2; @@ -25,7 +17,9 @@ message InjectBarrierRequest { uint32 partial_graph_id = 6; repeated common.ActorInfo broadcast_info = 8; - repeated BuildActorInfo actors_to_build = 9; + repeated stream_plan.StreamActor actors_to_build = 9; + repeated stream_plan.SubscriptionUpstreamInfo subscriptions_to_add = 10; + repeated stream_plan.SubscriptionUpstreamInfo subscriptions_to_remove = 11; } message BarrierCompleteResponse { @@ -64,6 +58,7 @@ message WaitEpochCommitResponse { message StreamingControlStreamRequest { message InitRequest { uint64 version_id = 1; + repeated stream_plan.SubscriptionUpstreamInfo subscriptions = 2; } message RemovePartialGraphRequest { diff --git a/risedev.yml b/risedev.yml index 22c4569adb610..ce04cea773cac 100644 --- a/risedev.yml +++ b/risedev.yml @@ -30,16 +30,25 @@ profile: # - use: aws-s3 # bucket: test-bucket - # if you want to enable etcd backend, uncomment the following lines. + # By default, the meta-backend is sqlite. + # To enable etcd backend, uncomment the following lines and set the meta-backend to etcd in 'meta-node' # - use: etcd # unsafe-no-fsync: true + # To enable postgres backend, uncomment the following lines and set the meta-backend to postgres in 'meta-node' + # - use: postgres + # port: 8432 + # user: postgres + # database: metadata + # If you want to enable metrics or tracing, uncomment the following lines. # - use: prometheus # metrics # - use: tempo # tracing # - use: grafana # visualization - use: meta-node + # - meta-backend: postgres + # - meta-backend: etcd - use: compute-node - use: frontend @@ -107,9 +116,12 @@ profile: full: steps: - use: minio - - use: etcd + - use: postgres + port: 8432 + user: postgres + database: metadata - use: meta-node - meta-backend: etcd + meta-backend: postgres - use: compute-node - use: frontend - use: compactor @@ -121,10 +133,13 @@ profile: standalone-full-peripherals: steps: - use: minio - - use: etcd + - use: postgres + port: 8432 + user: postgres + database: metadata - use: meta-node user-managed: true - meta-backend: etcd + meta-backend: postgres - use: compute-node user-managed: true - use: frontend diff --git a/src/batch/src/executor/iceberg_scan.rs b/src/batch/src/executor/iceberg_scan.rs index fca7745284fe3..2f67d8ce005aa 100644 --- a/src/batch/src/executor/iceberg_scan.rs +++ b/src/batch/src/executor/iceberg_scan.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
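Aside: the `AsOfJoinDesc`/`AsOfJoinNode` messages introduced above only carry column indices and an inequality kind; how the inequality selects a match is left to the executor. As a rough illustration (not the actual RisingWave executor — it assumes plain `i64` "as of" values and the common ASOF rule of picking the closest satisfying right row), the intent can be sketched as:

```rust
/// Mirrors plan_common.AsOfJoinInequalityType from the proto change above.
#[derive(Clone, Copy, Debug)]
enum AsOfInequality {
    Gt, // left AS OF column >  right AS OF column
    Ge, // left AS OF column >= right AS OF column
    Lt, // left AS OF column <  right AS OF column
    Le, // left AS OF column <= right AS OF column
}

/// For each left row, an AS OF join emits at most one right row among those
/// satisfying the inequality -- typically the closest one. This sketch uses
/// i64 "as of" values and returns the chosen right value, if any.
fn as_of_match(left: i64, right_candidates: &[i64], ineq: AsOfInequality) -> Option<i64> {
    let satisfied = right_candidates.iter().copied().filter(|&r| match ineq {
        AsOfInequality::Gt => left > r,
        AsOfInequality::Ge => left >= r,
        AsOfInequality::Lt => left < r,
        AsOfInequality::Le => left <= r,
    });
    match ineq {
        // "greater than" variants: take the largest right value below the bound.
        AsOfInequality::Gt | AsOfInequality::Ge => satisfied.max(),
        // "less than" variants: take the smallest right value above the bound.
        AsOfInequality::Lt | AsOfInequality::Le => satisfied.min(),
    }
}

fn main() {
    // Left row with as-of value 10 joined against right as-of values 3, 7, 12.
    assert_eq!(as_of_match(10, &[3, 7, 12], AsOfInequality::Gt), Some(7));
    assert_eq!(as_of_match(10, &[3, 7, 12], AsOfInequality::Le), Some(12));
}
```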
+use std::collections::HashMap; use std::mem; use futures_async_stream::try_stream; @@ -20,8 +21,11 @@ use iceberg::scan::FileScanTask; use iceberg::spec::TableMetadata; use itertools::Itertools; use risingwave_common::array::arrow::IcebergArrowConvert; +use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::{Field, Schema}; +use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::DataType; +use risingwave_common::util::iter_util::ZipEqFast; use risingwave_connector::sink::iceberg::IcebergConfig; use risingwave_connector::source::iceberg::{IcebergProperties, IcebergSplit}; use risingwave_connector::source::{ConnectorProperties, SplitImpl, SplitMetaData}; @@ -38,7 +42,8 @@ pub struct IcebergScanExecutor { #[allow(dead_code)] snapshot_id: Option, table_meta: TableMetadata, - file_scan_tasks: Vec, + data_file_scan_tasks: Vec, + eq_delete_file_scan_tasks: Vec, batch_size: usize, schema: Schema, identity: String, @@ -63,7 +68,8 @@ impl IcebergScanExecutor { iceberg_config: IcebergConfig, snapshot_id: Option, table_meta: TableMetadata, - file_scan_tasks: Vec, + data_file_scan_tasks: Vec, + eq_delete_file_scan_tasks: Vec, batch_size: usize, schema: Schema, identity: String, @@ -72,7 +78,8 @@ impl IcebergScanExecutor { iceberg_config, snapshot_id, table_meta, - file_scan_tasks, + data_file_scan_tasks, + eq_delete_file_scan_tasks, batch_size, schema, identity, @@ -86,33 +93,136 @@ impl IcebergScanExecutor { .load_table_v2_with_metadata(self.table_meta) .await?; let data_types = self.schema.data_types(); + let executor_schema_names = self.schema.names(); - let file_scan_tasks = mem::take(&mut self.file_scan_tasks); + let mut eq_delete_file_scan_tasks_map: HashMap = HashMap::default(); + let eq_delete_file_scan_tasks = mem::take(&mut self.eq_delete_file_scan_tasks); - let file_scan_stream = { - #[try_stream] - async move { - for file_scan_task in file_scan_tasks { - yield file_scan_task; + // Build hash map for equality delete files + // Currently, all equality delete files have the same schema which is guaranteed by `IcebergSplitEnumerator`. 
+ let mut eq_delete_ids: Option> = None; + for eq_delete_file_scan_task in eq_delete_file_scan_tasks { + let mut sequence_number = eq_delete_file_scan_task.sequence_number; + + if eq_delete_ids.is_none() { + eq_delete_ids = Some(eq_delete_file_scan_task.project_field_ids.clone()); + } else { + debug_assert_eq!( + eq_delete_ids.as_ref().unwrap(), + &eq_delete_file_scan_task.project_field_ids + ); + } + + let reader = table + .reader_builder() + .with_batch_size(self.batch_size) + .build(); + let delete_file_scan_stream = tokio_stream::once(Ok(eq_delete_file_scan_task)); + + let mut delete_record_batch_stream = reader + .read(Box::pin(delete_file_scan_stream)) + .map_err(BatchError::Iceberg)?; + + while let Some(record_batch) = delete_record_batch_stream.next().await { + let record_batch = record_batch.map_err(BatchError::Iceberg)?; + + let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; + for row in chunk.rows() { + let entry = eq_delete_file_scan_tasks_map + .entry(row.to_owned_row()) + .or_default(); + *entry = *entry.max(&mut sequence_number); } } - }; - - let reader = table - .reader_builder() - .with_batch_size(self.batch_size) - .build(); - - let record_batch_stream = reader - .read(Box::pin(file_scan_stream)) - .map_err(BatchError::Iceberg)?; - - #[for_await] - for record_batch in record_batch_stream { - let record_batch = record_batch.map_err(BatchError::Iceberg)?; - let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; - debug_assert_eq!(chunk.data_types(), data_types); - yield chunk; + } + + let data_file_scan_tasks = mem::take(&mut self.data_file_scan_tasks); + + // Delete rows in the data file that need to be deleted by map + for data_file_scan_task in data_file_scan_tasks { + let data_sequence_number = data_file_scan_task.sequence_number; + + let data_chunk_column_names: Vec<_> = data_file_scan_task + .project_field_ids + .iter() + .filter_map(|id| { + data_file_scan_task + .schema + .name_by_field_id(*id) + .map(|name| name.to_string()) + }) + .collect(); + + // eq_delete_column_idxes are used to fetch equality delete columns from data files. 
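Aside: the comments above describe the core of the new equality-delete handling in `IcebergScanExecutor`: delete rows are collected into a map keyed by the equality-delete key, keeping the largest sequence number seen, and a data row is dropped only when a matching delete entry was written by a later file than the data file. A simplified stand-alone sketch of that rule (plain `Vec<String>` keys instead of `OwnedRow`, made-up data):

```rust
use std::collections::HashMap;

/// Equality-delete key -> largest sequence number of a delete file containing it.
type EqDeleteMap = HashMap<Vec<String>, i64>;

/// Record a key from an equality-delete file, keeping the max sequence number,
/// mirroring `*entry = *entry.max(&mut sequence_number)` in the executor above.
fn record_delete(map: &mut EqDeleteMap, key: Vec<String>, sequence_number: i64) {
    let entry = map.entry(key).or_insert(i64::MIN);
    *entry = (*entry).max(sequence_number);
}

/// A data row stays visible unless a delete entry for its key was written by a
/// *later* file, i.e. delete_sequence_number > data_sequence_number.
fn is_visible(map: &EqDeleteMap, key: &[String], data_sequence_number: i64) -> bool {
    match map.get(key) {
        Some(&delete_seq) => delete_seq <= data_sequence_number,
        None => true,
    }
}

fn main() {
    let mut deletes = EqDeleteMap::new();
    record_delete(&mut deletes, vec!["user_1".into()], 5);

    // Data file at sequence 3: the delete (seq 5) came later, row is filtered out.
    assert!(!is_visible(&deletes, &["user_1".into()], 3));
    // Data file at sequence 7: the delete is older, row stays.
    assert!(is_visible(&deletes, &["user_1".into()], 7));
    // Keys that were never deleted are always visible.
    assert!(is_visible(&deletes, &["user_2".into()], 3));
}
```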
+ let eq_delete_column_idxes = eq_delete_ids.as_ref().map(|eq_delete_ids| { + eq_delete_ids + .iter() + .map(|eq_delete_id| { + data_file_scan_task + .project_field_ids + .iter() + .position(|project_field_id| eq_delete_id == project_field_id) + .expect("eq_delete_id not found in delete_equality_ids") + }) + .collect_vec() + }); + + let reader = table + .reader_builder() + .with_batch_size(self.batch_size) + .build(); + let file_scan_stream = tokio_stream::once(Ok(data_file_scan_task)); + + let mut record_batch_stream = reader + .read(Box::pin(file_scan_stream)) + .map_err(BatchError::Iceberg)?; + + while let Some(record_batch) = record_batch_stream.next().await { + let record_batch = record_batch.map_err(BatchError::Iceberg)?; + + let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; + let chunk = match eq_delete_column_idxes.as_ref() { + Some(delete_column_ids) => { + let visibility = Bitmap::from_iter( + // Project with the schema of the delete file + chunk.project(delete_column_ids).rows().map(|row_ref| { + let row = row_ref.to_owned_row(); + if let Some(delete_sequence_number) = + eq_delete_file_scan_tasks_map.get(&row) + && delete_sequence_number > &data_sequence_number + { + // delete_sequence_number > data_sequence_number means the delete file is written later than data file, + // so it needs to be deleted + false + } else { + true + } + }), + ) + .clone(); + // Keep the schema consistent(chunk and executor) + // Filter out (equality delete) columns that are not in the executor schema + let data = chunk + .columns() + .iter() + .zip_eq_fast(&data_chunk_column_names) + .filter_map(|(array, columns)| { + if executor_schema_names.contains(columns) { + Some(array.clone()) + } else { + None + } + }) + .collect_vec(); + let chunk = DataChunk::new(data, visibility); + debug_assert_eq!(chunk.data_types(), data_types); + chunk + } + // If there is no delete file, the data file is directly output + None => chunk, + }; + yield chunk; + } } } } @@ -171,6 +281,11 @@ impl BoxedExecutorBuilder for IcebergScanExecutorBuilder { Some(split.snapshot_id), split.table_meta.deserialize(), split.files.into_iter().map(|x| x.deserialize()).collect(), + split + .eq_delete_files + .into_iter() + .map(|x| x.deserialize()) + .collect(), source.context.get_config().developer.chunk_size, schema, source.plan_node().get_identity().clone(), diff --git a/src/batch/src/worker_manager/worker_node_manager.rs b/src/batch/src/worker_manager/worker_node_manager.rs index fd4d0e37bbbc4..772bc8a4b6da7 100644 --- a/src/batch/src/worker_manager/worker_node_manager.rs +++ b/src/batch/src/worker_manager/worker_node_manager.rs @@ -19,7 +19,7 @@ use std::time::Duration; use rand::seq::SliceRandom; use risingwave_common::bail; use risingwave_common::catalog::OBJECT_ID_PLACEHOLDER; -use risingwave_common::hash::{VirtualNode, WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::hash::{WorkerSlotId, WorkerSlotMapping}; use risingwave_common::vnode_mapping::vnode_placement::place_vnode; use risingwave_pb::common::{WorkerNode, WorkerType}; @@ -346,38 +346,26 @@ impl WorkerNodeSelector { if self.enable_barrier_read { self.manager.get_streaming_fragment_mapping(&fragment_id) } else { - let (hint, parallelism) = match self.manager.serving_fragment_mapping(fragment_id) { - Ok(o) => { - if self.manager.worker_node_mask().is_empty() { - // 1. Stable mapping for most cases. - return Ok(o); - } - // If it's a singleton, set max_parallelism=1 for place_vnode. 
- let max_parallelism = o.to_single().map(|_| 1); - (Some(o), max_parallelism) - } - Err(e) => { - if !matches!(e, BatchError::ServingVnodeMappingNotFound(_)) { - return Err(e); - } - // We cannot tell whether it's a singleton, set max_parallelism=1 for place_vnode as if it's a singleton. - let max_parallelism = 1; - tracing::warn!( - fragment_id, - max_parallelism, - "Serving fragment mapping not found, fall back to temporary one." - ); - // Workaround the case that new mapping is not available yet due to asynchronous - // notification. - (None, Some(max_parallelism)) - } - }; - // 2. Temporary mapping that filters out unavailable workers. - let new_workers = self.apply_worker_node_mask(self.manager.list_serving_worker_nodes()); - // TODO(var-vnode): use vnode count from config - let masked_mapping = - place_vnode(hint.as_ref(), &new_workers, parallelism, VirtualNode::COUNT); - masked_mapping.ok_or_else(|| BatchError::EmptyWorkerNodes) + let mapping = (self.manager.serving_fragment_mapping(fragment_id)).or_else(|_| { + tracing::warn!( + fragment_id, + "Serving fragment mapping not found, fall back to streaming one." + ); + self.manager.get_streaming_fragment_mapping(&fragment_id) + })?; + + // Filter out unavailable workers. + if self.manager.worker_node_mask().is_empty() { + Ok(mapping) + } else { + let workers = self.apply_worker_node_mask(self.manager.list_serving_worker_nodes()); + // If it's a singleton, set max_parallelism=1 for place_vnode. + let max_parallelism = mapping.to_single().map(|_| 1); + let masked_mapping = + place_vnode(Some(&mapping), &workers, max_parallelism, mapping.len()) + .ok_or_else(|| BatchError::EmptyWorkerNodes)?; + Ok(masked_mapping) + } } } diff --git a/src/common/common_service/Cargo.toml b/src/common/common_service/Cargo.toml index 87206ab7cbc1d..23d1db39af077 100644 --- a/src/common/common_service/Cargo.toml +++ b/src/common/common_service/Cargo.toml @@ -17,12 +17,14 @@ normal = ["workspace-hack"] [dependencies] async-trait = "0.1" axum = { workspace = true } +axum-extra = { workspace = true, features = ["query"] } futures = { version = "0.3", default-features = false, features = ["alloc"] } http = "1" prometheus = { version = "0.13" } risingwave_common = { workspace = true } risingwave_pb = { workspace = true } risingwave_rpc_client = { workspace = true } +serde = { version = "1", features = ["derive"] } thiserror = "1" thiserror-ext = { workspace = true } tokio = { version = "0.2", package = "madsim-tokio", features = ["rt", "rt-multi-thread", "sync", "macros", "time", "signal"] } diff --git a/src/common/common_service/src/metrics_manager.rs b/src/common/common_service/src/metrics_manager.rs index a8151d2c17715..591f74831dbd2 100644 --- a/src/common/common_service/src/metrics_manager.rs +++ b/src/common/common_service/src/metrics_manager.rs @@ -12,21 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
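Aside: the rewritten lookup above prefers the serving fragment mapping and, if it is not registered yet, falls back to the streaming mapping instead of synthesizing a temporary one. A minimal sketch of that fallback, with toy types rather than the real `WorkerNodeManager` API (the masking and `place_vnode` re-placement step is elided):

```rust
/// Stand-in for a worker-slot mapping; not the real RisingWave type.
#[derive(Debug, PartialEq)]
struct Mapping(&'static str);

fn serving_mapping(fragment_id: u32) -> Result<Mapping, String> {
    // Pretend the serving mapping is not registered yet for this fragment.
    Err(format!("serving vnode mapping not found for fragment {fragment_id}"))
}

fn streaming_mapping(_fragment_id: u32) -> Result<Mapping, String> {
    Ok(Mapping("streaming"))
}

fn fragment_mapping(fragment_id: u32) -> Result<Mapping, String> {
    serving_mapping(fragment_id).or_else(|_| {
        // Mirrors the tracing::warn! above: fall back instead of failing.
        eprintln!("serving fragment mapping not found, fall back to streaming one");
        streaming_mapping(fragment_id)
    })
}

fn main() {
    assert_eq!(fragment_mapping(42).unwrap(), Mapping("streaming"));
}
```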
+use std::collections::HashSet; use std::ops::Deref; use std::sync::OnceLock; use axum::body::Body; +use axum::handler::{Handler, HandlerWithoutStateExt}; use axum::response::{IntoResponse, Response}; -use axum::{Extension, Router}; +use axum::Extension; +use axum_extra::extract::Query as ExtraQuery; use prometheus::{Encoder, Registry, TextEncoder}; use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY; +use serde::Deserialize; use thiserror_ext::AsReport; use tokio::net::TcpListener; use tower_http::add_extension::AddExtensionLayer; use tower_http::compression::CompressionLayer; use tracing::{error, info, warn}; -pub struct MetricsManager {} +/// The filter for metrics scrape handler. See [`MetricsManager::metrics`] for more details. +#[derive(Debug, Deserialize)] +struct Filter { + #[serde(default)] + include: HashSet, + #[serde(default)] + exclude: HashSet, +} + +pub struct MetricsManager; impl MetricsManager { pub fn boot_metrics_service(listen_addr: String) { @@ -41,12 +54,12 @@ impl MetricsManager { listen_addr ); - let service = Router::new() - .fallback(Self::metrics_service) + let service = Self::metrics .layer(AddExtensionLayer::new( GLOBAL_METRICS_REGISTRY.deref().clone(), )) - .layer(CompressionLayer::new()); + .layer(CompressionLayer::new()) + .into_make_service(); let serve_future = axum::serve(TcpListener::bind(&listen_addr).await.unwrap(), service); @@ -64,11 +77,44 @@ impl MetricsManager { } } + /// Gather metrics from the global registry and encode them in the Prometheus text format. + /// + /// The handler accepts the following query parameters to filter metrics. Note that `include` + /// and `exclude` should not be used together. + /// + /// - `/metrics` (without filter) + /// - `/metrics?include=foo` (include one metric) + /// - `/metrics?include=foo&include=bar` (include multiple metrics) + /// - `/metrics?exclude=foo&exclude=bar` (include all but foo and bar) + /// + /// One can specify parameters by configuring Prometheus scrape config like below: + /// ```yaml + /// - job_name: compute-node + /// params: + /// include: ["foo", "bar"] + /// ``` #[expect(clippy::unused_async, reason = "required by service_fn")] - async fn metrics_service(Extension(registry): Extension) -> impl IntoResponse { + async fn metrics( + ExtraQuery(Filter { include, exclude }): ExtraQuery, + Extension(registry): Extension, + ) -> impl IntoResponse { + let mut mf = registry.gather(); + + // Filter metrics by name. + // TODO: can we avoid gathering them all? + if !include.is_empty() && !exclude.is_empty() { + return Response::builder() + .status(400) + .body("should not specify both include and exclude".into()) + .unwrap(); + } else if !include.is_empty() { + mf.retain(|fam| include.contains(fam.get_name())); + } else if !exclude.is_empty() { + mf.retain(|fam| !exclude.contains(fam.get_name())); + } + let encoder = TextEncoder::new(); let mut buffer = vec![]; - let mf = registry.gather(); encoder.encode(&mf, &mut buffer).unwrap(); Response::builder() diff --git a/src/common/src/array/arrow/arrow_iceberg.rs b/src/common/src/array/arrow/arrow_iceberg.rs index ff23bc102ee6b..80c0a3dab1667 100644 --- a/src/common/src/array/arrow/arrow_iceberg.rs +++ b/src/common/src/array/arrow/arrow_iceberg.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
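Aside: the reworked `/metrics` handler above accepts repeated `include`/`exclude` query parameters and rejects requests that specify both. The same filtering rule, applied to plain metric names instead of Prometheus `MetricFamily` values, looks roughly like this (a stand-alone sketch, not the handler itself):

```rust
use std::collections::HashSet;

/// Filter metric names the same way the new `/metrics` handler filters
/// gathered metric families: include wins if given, otherwise exclude,
/// and giving both is an error (HTTP 400 in the handler).
fn filter_metrics(
    mut names: Vec<String>,
    include: &HashSet<String>,
    exclude: &HashSet<String>,
) -> Result<Vec<String>, &'static str> {
    if !include.is_empty() && !exclude.is_empty() {
        return Err("should not specify both include and exclude");
    }
    if !include.is_empty() {
        names.retain(|n| include.contains(n));
    } else if !exclude.is_empty() {
        names.retain(|n| !exclude.contains(n));
    }
    Ok(names)
}

fn main() {
    let all = vec!["foo".to_string(), "bar".to_string(), "baz".to_string()];
    let foo_only: HashSet<_> = ["foo".to_string()].into();
    let none = HashSet::new();

    // `/metrics?include=foo` keeps only `foo`.
    assert_eq!(filter_metrics(all.clone(), &foo_only, &none).unwrap(), vec!["foo"]);
    // `/metrics?exclude=foo` keeps everything else.
    assert_eq!(
        filter_metrics(all, &none, &foo_only).unwrap(),
        vec!["bar".to_string(), "baz".to_string()]
    );
}
```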
+use std::cell::RefCell; +use std::collections::HashMap; use std::ops::{Div, Mul}; use std::sync::Arc; @@ -138,12 +140,8 @@ impl ToArrow for IcebergArrowConvert { let scale = e.scale() as i8; let diff_scale = abs(max_scale - scale); let value = match scale { - _ if scale < max_scale => { - value.mul(10_i32.pow(diff_scale as u32) as i128) - } - _ if scale > max_scale => { - value.div(10_i32.pow(diff_scale as u32) as i128) - } + _ if scale < max_scale => value.mul(10_i128.pow(diff_scale as u32)), + _ if scale > max_scale => value.div(10_i128.pow(diff_scale as u32)), _ => value, }; Some(value) @@ -171,6 +169,94 @@ impl ToArrow for IcebergArrowConvert { impl FromArrow for IcebergArrowConvert {} +/// Iceberg sink with `create_table_if_not_exists` option will use this struct to convert the +/// iceberg data type to arrow data type. Specifically, it will add the field id to the +/// arrow field metadata, because iceberg-rust and icelake need the field id to be set. +/// +/// Note: this is different from [`IcebergArrowConvert`], which is used to read from/write to +/// an _existing_ iceberg table. In that case, we just need to make sure the data is compatible to the existing schema. +/// But to _create a new table_, we need to meet more requirements of iceberg. +#[derive(Default)] +pub struct IcebergCreateTableArrowConvert { + next_field_id: RefCell, +} + +impl IcebergCreateTableArrowConvert { + pub fn to_arrow_field( + &self, + name: &str, + data_type: &DataType, + ) -> Result { + ToArrow::to_arrow_field(self, name, data_type) + } + + fn add_field_id(&self, arrow_field: &mut arrow_schema::Field) { + *self.next_field_id.borrow_mut() += 1; + let field_id = *self.next_field_id.borrow(); + + let mut metadata = HashMap::new(); + // for iceberg-rust + metadata.insert("PARQUET:field_id".to_string(), field_id.to_string()); + // for icelake + metadata.insert("column_id".to_string(), field_id.to_string()); + arrow_field.set_metadata(metadata); + } +} + +impl ToArrow for IcebergCreateTableArrowConvert { + #[inline] + fn decimal_type_to_arrow(&self, name: &str) -> arrow_schema::Field { + // To create a iceberg table, we need a decimal type with precision and scale to be set + // We choose 28 here + // The decimal type finally will be converted to an iceberg decimal type. + // Iceberg decimal(P,S) + // Fixed-point decimal; precision P, scale S Scale is fixed, precision must be 38 or less. + let data_type = arrow_schema::DataType::Decimal128(28, 10); + + let mut arrow_field = arrow_schema::Field::new(name, data_type, true); + self.add_field_id(&mut arrow_field); + arrow_field + } + + /// Convert RisingWave data type to Arrow data type. + /// + /// This function returns a `Field` instead of `DataType` because some may be converted to + /// extension types which require additional metadata in the field. 
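Aside: the decimal fix above replaces `10_i32.pow(...) as i128` with `10_i128.pow(...)`. With an `i32` base the power already overflows once the scale difference reaches 10 (i32::MAX is about 2.1e9), even though the target Arrow `Decimal128(28, 10)` type routinely needs such rescaling. A small sketch of the corrected rescaling, assuming plain `i128` mantissas:

```rust
/// Rescale a decimal mantissa from `scale` to `target_scale`, computing the
/// power of ten in i128 as the fix above does. `10_i32.pow(10)` would already
/// overflow here.
fn rescale(mantissa: i128, scale: u32, target_scale: u32) -> i128 {
    let diff = scale.abs_diff(target_scale);
    let factor = 10_i128.pow(diff);
    if scale < target_scale {
        mantissa * factor
    } else if scale > target_scale {
        mantissa / factor
    } else {
        mantissa
    }
}

fn main() {
    // 123.4 stored with scale 1, rescaled to the Arrow target scale of 10
    // (matching the `decimal_with_large_scale` test above): 123.4 -> 1_234_000_000_000.
    assert_eq!(rescale(1234, 1, 10), 1_234_000_000_000);
    // Going the other way truncates toward zero.
    assert_eq!(rescale(1_234_560_000_000, 10, 3), 123_456);
}
```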
+ fn to_arrow_field( + &self, + name: &str, + value: &DataType, + ) -> Result { + let data_type = match value { + // using the inline function + DataType::Boolean => self.bool_type_to_arrow(), + DataType::Int16 => self.int16_type_to_arrow(), + DataType::Int32 => self.int32_type_to_arrow(), + DataType::Int64 => self.int64_type_to_arrow(), + DataType::Int256 => self.int256_type_to_arrow(), + DataType::Float32 => self.float32_type_to_arrow(), + DataType::Float64 => self.float64_type_to_arrow(), + DataType::Date => self.date_type_to_arrow(), + DataType::Time => self.time_type_to_arrow(), + DataType::Timestamp => self.timestamp_type_to_arrow(), + DataType::Timestamptz => self.timestamptz_type_to_arrow(), + DataType::Interval => self.interval_type_to_arrow(), + DataType::Varchar => self.varchar_type_to_arrow(), + DataType::Bytea => self.bytea_type_to_arrow(), + DataType::Serial => self.serial_type_to_arrow(), + DataType::Decimal => return Ok(self.decimal_type_to_arrow(name)), + DataType::Jsonb => return Ok(self.jsonb_type_to_arrow(name)), + DataType::Struct(fields) => self.struct_type_to_arrow(fields)?, + DataType::List(datatype) => self.list_type_to_arrow(datatype)?, + DataType::Map(datatype) => self.map_type_to_arrow(datatype)?, + }; + + let mut arrow_field = arrow_schema::Field::new(name, data_type, true); + self.add_field_id(&mut arrow_field); + Ok(arrow_field) + } +} + #[cfg(test)] mod test { use std::sync::Arc; @@ -207,4 +293,30 @@ mod test { ) as ArrayRef; assert_eq!(&arrow_array, &expect_array); } + + #[test] + fn decimal_with_large_scale() { + let array = DecimalArray::from_iter([ + None, + Some(Decimal::NaN), + Some(Decimal::PositiveInf), + Some(Decimal::NegativeInf), + Some(Decimal::Normalized("123.4".parse().unwrap())), + Some(Decimal::Normalized("123.456".parse().unwrap())), + ]); + let ty = DataType::Decimal128(28, 10); + let arrow_array = IcebergArrowConvert.decimal_to_arrow(&ty, &array).unwrap(); + let expect_array = Arc::new( + Decimal128Array::from(vec![ + None, + None, + Some(9999999999999999999999999999), + Some(-9999999999999999999999999999), + Some(1234000000000), + Some(1234560000000), + ]) + .with_data_type(ty), + ) as ArrayRef; + assert_eq!(&arrow_array, &expect_array); + } } diff --git a/src/common/src/array/arrow/mod.rs b/src/common/src/array/arrow/mod.rs index fd9f55ee09f7e..d519d62f9935a 100644 --- a/src/common/src/array/arrow/mod.rs +++ b/src/common/src/array/arrow/mod.rs @@ -17,7 +17,7 @@ mod arrow_iceberg; mod arrow_udf; pub use arrow_deltalake::DeltaLakeConvert; -pub use arrow_iceberg::IcebergArrowConvert; +pub use arrow_iceberg::{IcebergArrowConvert, IcebergCreateTableArrowConvert}; pub use arrow_udf::{FromArrow, ToArrow, UdfArrowConvert}; use crate::types::Interval; diff --git a/src/common/src/array/map_array.rs b/src/common/src/array/map_array.rs index 2f0da9bbf816f..f519c25981a56 100644 --- a/src/common/src/array/map_array.rs +++ b/src/common/src/array/map_array.rs @@ -337,6 +337,14 @@ mod scalar { pub fn to_owned(self) -> MapValue { MapValue(self.0.to_owned()) } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } impl Scalar for MapValue { diff --git a/src/common/src/config.rs b/src/common/src/config.rs index d78fdbe51fa9b..e2b4dd7b0f97c 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -699,6 +699,9 @@ pub struct StorageConfig { #[serde(default)] pub prefetch_buffer_capacity_mb: Option, + #[serde(default)] + pub max_cached_recent_versions_number: Option, + /// max prefetch 
block number #[serde(default = "default::storage::max_prefetch_block_number")] pub max_prefetch_block_number: usize, diff --git a/src/compute/tests/cdc_tests.rs b/src/compute/tests/cdc_tests.rs index 974cb15259e2a..8fea0f48fa82d 100644 --- a/src/compute/tests/cdc_tests.rs +++ b/src/compute/tests/cdc_tests.rs @@ -43,6 +43,7 @@ use risingwave_hummock_sdk::to_committed_batch_query_epoch; use risingwave_storage::memory::MemoryStateStore; use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_stream::common::table::state_table::StateTable; +use risingwave_stream::common::table::test_utils::gen_pbtable; use risingwave_stream::error::StreamResult; use risingwave_stream::executor::monitor::StreamingMetrics; use risingwave_stream::executor::test_utils::MockSource; @@ -211,12 +212,16 @@ async fn test_cdc_backfill() -> StreamResult<()> { ColumnDesc::unnamed(ColumnId::from(4), state_schema[4].data_type.clone()), ]; - let state_table = StateTable::new_without_distribution( + let state_table = StateTable::from_table_catalog( + &gen_pbtable( + TableId::from(0x42), + column_descs, + vec![OrderType::ascending()], + vec![0], + 0, + ), memory_state_store.clone(), - TableId::from(0x42), - column_descs.clone(), - vec![OrderType::ascending()], - vec![0_usize], + None, ) .await; diff --git a/src/compute/tests/integration_tests.rs b/src/compute/tests/integration_tests.rs index 11cbe06386a29..13a76c6989b48 100644 --- a/src/compute/tests/integration_tests.rs +++ b/src/compute/tests/integration_tests.rs @@ -48,6 +48,7 @@ use risingwave_storage::memory::MemoryStateStore; use risingwave_storage::panic_store::PanicStateStore; use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_stream::common::table::state_table::StateTable; +use risingwave_stream::common::table::test_utils::gen_pbtable; use risingwave_stream::error::StreamResult; use risingwave_stream::executor::dml::DmlExecutor; use risingwave_stream::executor::monitor::StreamingMetrics; @@ -448,12 +449,16 @@ async fn test_row_seq_scan() -> StreamResult<()> { ColumnDesc::unnamed(ColumnId::from(2), schema[2].data_type.clone()), ]; - let mut state = StateTable::new_without_distribution( + let mut state = StateTable::from_table_catalog( + &gen_pbtable( + TableId::from(0x42), + column_descs.clone(), + vec![OrderType::ascending()], + vec![0], + 0, + ), memory_state_store.clone(), - TableId::from(0x42), - column_descs.clone(), - vec![OrderType::ascending()], - vec![0_usize], + None, ) .await; let table = StorageTable::for_test( diff --git a/src/config/docs.md b/src/config/docs.md index 47905d71e5e0c..bcce61d8bb456 100644 --- a/src/config/docs.md +++ b/src/config/docs.md @@ -119,6 +119,7 @@ This page is automatically generated by `./risedev generate-example-config` | enable_fast_compaction | | true | | high_priority_ratio_in_percent | DEPRECATED: This config will be deprecated in the future version, use `storage.cache.block_cache_eviction.high_priority_ratio_in_percent` with `storage.cache.block_cache_eviction.algorithm = "Lru"` instead. | | | imm_merge_threshold | The threshold for the number of immutable memtables to merge to a new imm. 
| 0 | +| max_cached_recent_versions_number | | | | max_concurrent_compaction_task_number | | 16 | | max_prefetch_block_number | max prefetch block number | 16 | | max_preload_io_retry_times | | 3 | diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml index a77e9cb929d17..2535847c98fe4 100644 --- a/src/connector/Cargo.toml +++ b/src/connector/Cargo.toml @@ -76,7 +76,7 @@ jni = { version = "0.21.1", features = ["invocation"] } jsonbb = { workspace = true } jsonwebtoken = "9.2.0" maplit = "1.0.2" -moka = { version = "0.12", features = ["future"] } +moka = { version = "0.12.0", features = ["future"] } mongodb = { version = "2.8.2", features = ["tokio-runtime"] } mysql_async = { version = "0.34", default-features = false, features = [ "default", @@ -105,7 +105,6 @@ prometheus = { version = "0.13", features = ["process"] } prost = { workspace = true, features = ["no-recursion-limit"] } prost-reflect = { version = "0.14", features = ["serde"] } prost-types = "0.13" -protobuf-native = "0.2.2" pulsar = { version = "6.3", default-features = false, features = [ "tokio-runtime", "telemetry", @@ -194,6 +193,7 @@ assert_matches = "1" criterion = { workspace = true, features = ["async_tokio", "async"] } deltalake = { workspace = true, features = ["datafusion"] } expect-test = "1" +fs-err = "2" paste = "1" pretty_assertions = "1" quote = "1" @@ -206,10 +206,6 @@ tracing-subscriber = "0.3" tracing-test = "0.2" walkdir = "2" -[build-dependencies] -prost-build = "0.12" -protobuf-src = "1" - [[bench]] name = "debezium_json_parser" harness = false diff --git a/src/connector/codec/Cargo.toml b/src/connector/codec/Cargo.toml index 5086549f4bf4c..5848c236dbd4d 100644 --- a/src/connector/codec/Cargo.toml +++ b/src/connector/codec/Cargo.toml @@ -26,6 +26,10 @@ itertools = { workspace = true } jsonbb = { workspace = true } jst = { package = 'jsonschema-transpiler', git = "https://github.com/mozilla/jsonschema-transpiler", rev = "c1a89d720d118843d8bcca51084deb0ed223e4b4" } num-bigint = "0.4" +prost = { workspace = true, features = ["no-recursion-limit"] } +prost-reflect = { version = "0.14", features = ["serde"] } +prost-types = "0.13" +protobuf-native = "0.2.2" risingwave_common = { workspace = true } risingwave_pb = { workspace = true } rust_decimal = "1" @@ -37,7 +41,13 @@ tracing = "0.1" [dev-dependencies] expect-test = "1" +fs-err = "2" hex = "0.4" +tokio = { version = "0.2", package = "madsim-tokio" } + +[build-dependencies] +prost-build = "0.12" +protobuf-src = "1" [target.'cfg(not(madsim))'.dependencies] workspace-hack = { path = "../../workspace-hack" } diff --git a/src/connector/build.rs b/src/connector/codec/build.rs similarity index 87% rename from src/connector/build.rs rename to src/connector/codec/build.rs index 6ef6e1629438c..8a9438d59b9e8 100644 --- a/src/connector/build.rs +++ b/src/connector/codec/build.rs @@ -13,17 +13,17 @@ // limitations under the License. 
fn main() { - let proto_dir = "./src/test_data/proto_recursive"; + let proto_dir = "./tests/test_data/"; println!("cargo:rerun-if-changed={}", proto_dir); - let proto_files = ["recursive"]; + let proto_files = ["recursive", "all-types"]; let protos: Vec = proto_files .iter() .map(|f| format!("{}/{}.proto", proto_dir, f)) .collect(); prost_build::Config::new() - .out_dir("./src/parser/protobuf") + .out_dir("./tests/integration_tests/protobuf") .compile_protos(&protos, &Vec::::new()) .unwrap(); diff --git a/src/tests/compaction_test/src/bin/delete_range.rs b/src/connector/codec/src/common/mod.rs similarity index 51% rename from src/tests/compaction_test/src/bin/delete_range.rs rename to src/connector/codec/src/common/mod.rs index 1861ca1b9b03f..c8a7ca35c4209 100644 --- a/src/tests/compaction_test/src/bin/delete_range.rs +++ b/src/connector/codec/src/common/mod.rs @@ -12,16 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![cfg_attr(coverage, feature(coverage_attribute))] - -#[cfg_attr(coverage, coverage(off))] -fn main() { - // Since we decide to record watermark in every state-table to replace delete-range, this test is not need again. We keep it because we may need delete-range in some day for other features. - use clap::Parser; - - let opts = risingwave_compaction_test::CompactionTestOpts::parse(); - - risingwave_rt::init_risingwave_logger(risingwave_rt::LoggerSettings::default()); - - risingwave_rt::main_okk(|_| risingwave_compaction_test::start_delete_range(opts)) -} +pub mod protobuf; diff --git a/src/connector/codec/src/common/protobuf/compiler.rs b/src/connector/codec/src/common/protobuf/compiler.rs new file mode 100644 index 0000000000000..80e86d002d4aa --- /dev/null +++ b/src/connector/codec/src/common/protobuf/compiler.rs @@ -0,0 +1,86 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::{Path, PathBuf}; + +use itertools::Itertools; + +macro_rules! embed_wkts { + [$( $path:literal ),+ $(,)?] 
=> { + &[$( + ( + concat!("google/protobuf/", $path), + include_bytes!(concat!(env!("PROTO_INCLUDE"), "/google/protobuf/", $path)).as_slice(), + ) + ),+] + }; +} +const WELL_KNOWN_TYPES: &[(&str, &[u8])] = embed_wkts![ + "any.proto", + "api.proto", + "compiler/plugin.proto", + "descriptor.proto", + "duration.proto", + "empty.proto", + "field_mask.proto", + "source_context.proto", + "struct.proto", + "timestamp.proto", + "type.proto", + "wrappers.proto", +]; + +#[derive(Debug, thiserror::Error)] +pub enum PbCompileError { + #[error("build_file_descriptor_set failed\n{}", errs.iter().map(|e| format!("\t{e}")).join("\n"))] + Build { + errs: Vec, + }, + #[error("serialize descriptor set failed")] + Serialize, +} + +pub fn compile_pb( + main_file: (PathBuf, Vec), + dependencies: impl IntoIterator)>, +) -> Result, PbCompileError> { + use protobuf_native::compiler::{ + SimpleErrorCollector, SourceTreeDescriptorDatabase, VirtualSourceTree, + }; + use protobuf_native::MessageLite; + + let root = main_file.0.clone(); + + let mut source_tree = VirtualSourceTree::new(); + for (path, bytes) in std::iter::once(main_file).chain(dependencies.into_iter()) { + source_tree.as_mut().add_file(&path, bytes); + } + for (path, bytes) in WELL_KNOWN_TYPES { + source_tree + .as_mut() + .add_file(Path::new(path), bytes.to_vec()); + } + + let mut error_collector = SimpleErrorCollector::new(); + // `db` needs to be dropped before we can iterate on `error_collector`. + let fds = { + let mut db = SourceTreeDescriptorDatabase::new(source_tree.as_mut()); + db.as_mut().record_errors_to(error_collector.as_mut()); + db.as_mut().build_file_descriptor_set(&[root]) + } + .map_err(|_| PbCompileError::Build { + errs: error_collector.as_mut().collect(), + })?; + fds.serialize().map_err(|_| PbCompileError::Serialize) +} diff --git a/src/connector/codec/src/common/protobuf/mod.rs b/src/connector/codec/src/common/protobuf/mod.rs new file mode 100644 index 0000000000000..f630dedf0d240 --- /dev/null +++ b/src/connector/codec/src/common/protobuf/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
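Aside: the `compile_pb` helper defined above compiles an in-memory `.proto` source tree (plus the embedded well-known types) into a serialized `FileDescriptorSet`. A hedged usage sketch follows — the schemas are made up, the import path of `compile_pb` is assumed, and the `(PathBuf, Vec<u8>)` signature is as reconstructed from the code above:

```rust
use std::path::PathBuf;

// Assumes `compile_pb` from the compiler module above is in scope, e.g.
// `use risingwave_connector_codec::common::protobuf::compile_pb;` (path assumed).
fn main() {
    // A made-up main schema that imports a made-up dependency and a well-known type.
    let main_file = (
        PathBuf::from("user.proto"),
        br#"
            syntax = "proto3";
            import "name.proto";
            import "google/protobuf/timestamp.proto";
            message User {
                int32 id = 1;
                Name name = 2;
                google.protobuf.Timestamp created_at = 3;
            }
        "#
        .to_vec(),
    );
    let dep = (
        PathBuf::from("name.proto"),
        br#"
            syntax = "proto3";
            message Name { string first_name = 1; string last_name = 2; }
        "#
        .to_vec(),
    );

    // The well-known types are embedded by `embed_wkts!`, so only user-provided
    // files need to be passed in. The result is an encoded FileDescriptorSet.
    let fds_bytes = compile_pb(main_file, [dep]).expect("schema should compile");
    assert!(!fds_bytes.is_empty());
}
```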
+ +mod compiler; +pub use compiler::compile_pb; diff --git a/src/connector/codec/src/decoder/avro/mod.rs b/src/connector/codec/src/decoder/avro/mod.rs index dc4dae49ca7c4..ebae0e1292fec 100644 --- a/src/connector/codec/src/decoder/avro/mod.rs +++ b/src/connector/codec/src/decoder/avro/mod.rs @@ -136,10 +136,7 @@ impl<'a> AvroParseOptions<'a> { let expected_field_name = avro_schema_to_struct_field_name(variant_schema)?; let mut fields = Vec::with_capacity(struct_type_info.len()); - for (field_name, field_type) in struct_type_info - .names() - .zip_eq_fast(struct_type_info.types()) - { + for (field_name, field_type) in struct_type_info.iter() { if field_name == expected_field_name { let datum = Self { schema: Some(variant_schema), diff --git a/src/connector/codec/src/decoder/mod.rs b/src/connector/codec/src/decoder/mod.rs index bbfdbf0a90d79..e3e579ed36ec1 100644 --- a/src/connector/codec/src/decoder/mod.rs +++ b/src/connector/codec/src/decoder/mod.rs @@ -14,6 +14,7 @@ pub mod avro; pub mod json; +pub mod protobuf; pub mod utils; use risingwave_common::error::NotImplemented; diff --git a/src/connector/src/parser/unified/protobuf.rs b/src/connector/codec/src/decoder/protobuf/mod.rs similarity index 84% rename from src/connector/src/parser/unified/protobuf.rs rename to src/connector/codec/src/decoder/protobuf/mod.rs index b1d34746b5029..7ad357fef50fb 100644 --- a/src/connector/src/parser/unified/protobuf.rs +++ b/src/connector/codec/src/decoder/protobuf/mod.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod parser; use std::borrow::Cow; use std::sync::LazyLock; +use parser::from_protobuf_value; use prost_reflect::{DynamicMessage, ReflectMessage}; use risingwave_common::log::LogSuppresser; use risingwave_common::types::{DataType, DatumCow, ToOwnedDatum}; use thiserror_ext::AsReport; -use super::{Access, AccessResult}; -use crate::parser::from_protobuf_value; -use crate::parser::unified::uncategorized; +use super::{uncategorized, Access, AccessResult}; pub struct ProtobufAccess { message: DynamicMessage, @@ -32,14 +32,15 @@ impl ProtobufAccess { pub fn new(message: DynamicMessage) -> Self { Self { message } } + + #[cfg(test)] + pub fn descriptor(&self) -> prost_reflect::MessageDescriptor { + self.message.descriptor() + } } impl Access for ProtobufAccess { - fn access<'a>( - &'a self, - path: &[&str], - _type_expected: &DataType, - ) -> AccessResult> { + fn access<'a>(&'a self, path: &[&str], type_expected: &DataType) -> AccessResult> { debug_assert_eq!(1, path.len()); let field_desc = self .message @@ -55,10 +56,10 @@ impl Access for ProtobufAccess { })?; match self.message.get_field(&field_desc) { - Cow::Borrowed(value) => from_protobuf_value(&field_desc, value), + Cow::Borrowed(value) => from_protobuf_value(&field_desc, value, type_expected), // `Owned` variant occurs only if there's no such field and the default value is returned. 
- Cow::Owned(value) => from_protobuf_value(&field_desc, &value) + Cow::Owned(value) => from_protobuf_value(&field_desc, &value, type_expected) // enforce `Owned` variant to avoid returning a reference to a temporary value .map(|d| d.to_owned_datum().into()), } diff --git a/src/connector/codec/src/decoder/protobuf/parser.rs b/src/connector/codec/src/decoder/protobuf/parser.rs new file mode 100644 index 0000000000000..852fa9cca48d6 --- /dev/null +++ b/src/connector/codec/src/decoder/protobuf/parser.rs @@ -0,0 +1,275 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Context; +use itertools::Itertools; +use prost_reflect::{Cardinality, FieldDescriptor, Kind, MessageDescriptor, ReflectMessage, Value}; +use risingwave_common::array::{ListValue, StructValue}; +use risingwave_common::types::{ + DataType, DatumCow, Decimal, JsonbVal, MapType, MapValue, ScalarImpl, ToOwnedDatum, F32, F64, +}; +use risingwave_pb::plan_common::{AdditionalColumn, ColumnDesc, ColumnDescVersion}; +use thiserror::Error; +use thiserror_ext::Macro; + +use crate::decoder::{uncategorized, AccessError, AccessResult}; + +pub fn pb_schema_to_column_descs( + message_descriptor: &MessageDescriptor, +) -> anyhow::Result> { + let mut columns = Vec::with_capacity(message_descriptor.fields().len()); + let mut index = 0; + let mut parse_trace: Vec = vec![]; + for field in message_descriptor.fields() { + columns.push(pb_field_to_col_desc(&field, &mut index, &mut parse_trace)?); + } + + Ok(columns) +} + +/// Maps a protobuf field to a RW column. +fn pb_field_to_col_desc( + field_descriptor: &FieldDescriptor, + index: &mut i32, + parse_trace: &mut Vec, +) -> anyhow::Result { + let field_type = protobuf_type_mapping(field_descriptor, parse_trace) + .context("failed to map protobuf type")?; + if let Kind::Message(m) = field_descriptor.kind() { + let field_descs = if let DataType::List { .. } = field_type { + vec![] + } else { + m.fields() + .map(|f| pb_field_to_col_desc(&f, index, parse_trace)) + .try_collect()? 
+ }; + *index += 1; + Ok(ColumnDesc { + column_id: *index, + name: field_descriptor.name().to_string(), + column_type: Some(field_type.to_protobuf()), + field_descs, + type_name: m.full_name().to_string(), + generated_or_default_column: None, + description: None, + additional_column_type: 0, // deprecated + additional_column: Some(AdditionalColumn { column_type: None }), + version: ColumnDescVersion::Pr13707 as i32, + }) + } else { + *index += 1; + Ok(ColumnDesc { + column_id: *index, + name: field_descriptor.name().to_string(), + column_type: Some(field_type.to_protobuf()), + additional_column: Some(AdditionalColumn { column_type: None }), + version: ColumnDescVersion::Pr13707 as i32, + ..Default::default() + }) + } +} + +#[derive(Error, Debug, Macro)] +#[error("{0}")] +struct ProtobufTypeError(#[message] String); + +fn detect_loop_and_push( + trace: &mut Vec, + fd: &FieldDescriptor, +) -> std::result::Result<(), ProtobufTypeError> { + let identifier = format!("{}({})", fd.name(), fd.full_name()); + if trace.iter().any(|s| s == identifier.as_str()) { + bail_protobuf_type_error!( + "circular reference detected: {}, conflict with {}, kind {:?}", + trace.iter().format("->"), + identifier, + fd.kind(), + ); + } + trace.push(identifier); + Ok(()) +} + +pub fn from_protobuf_value<'a>( + field_desc: &FieldDescriptor, + value: &'a Value, + type_expected: &DataType, +) -> AccessResult> { + let kind = field_desc.kind(); + + macro_rules! borrowed { + ($v:expr) => { + return Ok(DatumCow::Borrowed(Some($v.into()))) + }; + } + + let v: ScalarImpl = match value { + Value::Bool(v) => ScalarImpl::Bool(*v), + Value::I32(i) => ScalarImpl::Int32(*i), + Value::U32(i) => ScalarImpl::Int64(*i as i64), + Value::I64(i) => ScalarImpl::Int64(*i), + Value::U64(i) => ScalarImpl::Decimal(Decimal::from(*i)), + Value::F32(f) => ScalarImpl::Float32(F32::from(*f)), + Value::F64(f) => ScalarImpl::Float64(F64::from(*f)), + Value::String(s) => borrowed!(s.as_str()), + Value::EnumNumber(idx) => { + let enum_desc = kind.as_enum().ok_or_else(|| AccessError::TypeError { + expected: "enum".to_owned(), + got: format!("{kind:?}"), + value: value.to_string(), + })?; + let enum_symbol = enum_desc.get_value(*idx).ok_or_else(|| { + uncategorized!("unknown enum index {} of enum {:?}", idx, enum_desc) + })?; + ScalarImpl::Utf8(enum_symbol.name().into()) + } + Value::Message(dyn_msg) => { + if dyn_msg.descriptor().full_name() == "google.protobuf.Any" { + ScalarImpl::Jsonb(JsonbVal::from( + serde_json::to_value(dyn_msg).map_err(AccessError::ProtobufAnyToJson)?, + )) + } else { + let desc = dyn_msg.descriptor(); + let DataType::Struct(st) = type_expected else { + return Err(AccessError::TypeError { + expected: type_expected.to_string(), + got: desc.full_name().to_string(), + value: value.to_string(), // Protobuf TEXT + }); + }; + + let mut rw_values = Vec::with_capacity(st.len()); + for (name, expected_field_type) in st.iter() { + let Some(field_desc) = desc.get_field_by_name(name) else { + // Field deleted in protobuf. Fallback to SQL NULL (of proper RW type). + rw_values.push(None); + continue; + }; + let value = dyn_msg.get_field(&field_desc); + rw_values.push( + from_protobuf_value(&field_desc, &value, expected_field_type)? 
+ .to_owned_datum(), + ); + } + ScalarImpl::Struct(StructValue::new(rw_values)) + } + } + Value::List(values) => { + let DataType::List(element_type) = type_expected else { + return Err(AccessError::TypeError { + expected: type_expected.to_string(), + got: format!("repeated {:?}", kind), + value: value.to_string(), // Protobuf TEXT + }); + }; + let mut builder = element_type.create_array_builder(values.len()); + for value in values { + builder.append(from_protobuf_value(field_desc, value, element_type)?); + } + ScalarImpl::List(ListValue::new(builder.finish())) + } + Value::Bytes(value) => borrowed!(&**value), + Value::Map(map) => { + let err = || { + AccessError::TypeError { + expected: type_expected.to_string(), + got: format!("{:?}", kind), + value: value.to_string(), // Protobuf TEXT + } + }; + + let DataType::Map(map_type) = type_expected else { + return Err(err()); + }; + if !field_desc.is_map() { + return Err(err()); + } + let map_desc = kind.as_message().ok_or_else(err)?; + + let mut key_builder = map_type.key().create_array_builder(map.len()); + let mut value_builder = map_type.value().create_array_builder(map.len()); + // NOTE: HashMap's iter order is non-deterministic, but MapValue's + // order matters. We sort by key here to have deterministic order + // in tests. We might consider removing this, or make all MapValue sorted + // in the future. + for (key, value) in map.iter().sorted_by_key(|(k, _v)| *k) { + key_builder.append(from_protobuf_value( + &map_desc.map_entry_key_field(), + &key.clone().into(), + map_type.key(), + )?); + value_builder.append(from_protobuf_value( + &map_desc.map_entry_value_field(), + value, + map_type.value(), + )?); + } + let keys = key_builder.finish(); + let values = value_builder.finish(); + ScalarImpl::Map( + MapValue::try_from_kv(ListValue::new(keys), ListValue::new(values)) + .map_err(|e| uncategorized!("failed to convert protobuf map: {e}"))?, + ) + } + }; + Ok(Some(v).into()) +} + +/// Maps protobuf type to RW type. +fn protobuf_type_mapping( + field_descriptor: &FieldDescriptor, + parse_trace: &mut Vec, +) -> std::result::Result { + detect_loop_and_push(parse_trace, field_descriptor)?; + let mut t = match field_descriptor.kind() { + Kind::Bool => DataType::Boolean, + Kind::Double => DataType::Float64, + Kind::Float => DataType::Float32, + Kind::Int32 | Kind::Sint32 | Kind::Sfixed32 => DataType::Int32, + // Fixed32 represents [0, 2^32 - 1]. It's equal to u32. 
+ Kind::Int64 | Kind::Sint64 | Kind::Sfixed64 | Kind::Uint32 | Kind::Fixed32 => { + DataType::Int64 + } + Kind::Uint64 | Kind::Fixed64 => DataType::Decimal, + Kind::String => DataType::Varchar, + Kind::Message(m) => { + if m.full_name() == "google.protobuf.Any" { + // Well-Known Types are identified by their full name + DataType::Jsonb + } else if m.is_map_entry() { + // Map is equivalent to `repeated MapFieldEntry map_field = N;` + debug_assert!(field_descriptor.is_map()); + let key = protobuf_type_mapping(&m.map_entry_key_field(), parse_trace)?; + let value = protobuf_type_mapping(&m.map_entry_value_field(), parse_trace)?; + _ = parse_trace.pop(); + return Ok(DataType::Map(MapType::from_kv(key, value))); + } else { + let fields = m + .fields() + .map(|f| protobuf_type_mapping(&f, parse_trace)) + .try_collect()?; + let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); + DataType::new_struct(fields, field_names) + } + } + Kind::Enum(_) => DataType::Varchar, + Kind::Bytes => DataType::Bytea, + }; + if field_descriptor.cardinality() == Cardinality::Repeated { + debug_assert!(!field_descriptor.is_map()); + t = DataType::List(Box::new(t)) + } + _ = parse_trace.pop(); + Ok(t) +} diff --git a/src/connector/codec/src/lib.rs b/src/connector/codec/src/lib.rs index 2119c1ece4e57..d3f0a8c6ec2cf 100644 --- a/src/connector/codec/src/lib.rs +++ b/src/connector/codec/src/lib.rs @@ -37,6 +37,7 @@ #![register_tool(rw)] #![recursion_limit = "256"] +pub mod common; /// Converts JSON/AVRO/Protobuf data to RisingWave datum. /// The core API is [`decoder::Access`]. pub mod decoder; diff --git a/src/connector/codec/tests/integration_tests/avro.rs b/src/connector/codec/tests/integration_tests/avro.rs index 11275f45e9783..ab1df6e7e82b8 100644 --- a/src/connector/codec/tests/integration_tests/avro.rs +++ b/src/connector/codec/tests/integration_tests/avro.rs @@ -64,33 +64,11 @@ fn avro_schema_str_to_risingwave_schema( Ok((resolved_schema, rw_schema)) } -/// Data driven testing for converting Avro Schema to RisingWave Schema, and then converting Avro data into RisingWave data. -/// -/// The expected results can be automatically updated. To run and update the tests: -/// ```bash -/// UPDATE_EXPECT=1 cargo test -p risingwave_connector_codec -/// ``` -/// Or use Rust Analyzer. Refer to . +/// Refer to [crate level documentation](crate) for the ideas. /// /// ## Arguments /// - `avro_schema`: Avro schema in JSON format. /// - `avro_data`: list of Avro data. Refer to [`TestDataEncoding`] for the format. -/// -/// ## Why not directly test the uppermost layer `AvroParserConfig` and `AvroAccessBuilder`? -/// -/// Because their interface are not clean enough, and have complex logic like schema registry. -/// We might need to separate logic to make them clenaer and then we can use it directly for testing. -/// -/// ## If we reimplement a similar logic here, what are we testing? -/// -/// Basically unit tests of `avro_schema_to_column_descs`, `convert_to_datum`, i.e., the type mapping. -/// -/// It makes some sense, as the data parsing logic is generally quite simple (one-liner), and the most -/// complex and error-prone part is the type mapping. -/// -/// ## Why test schema mapping and data mapping together? -/// -/// Because the expected data type for data mapping comes from the schema mapping. 
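A small sketch (not part of the patch) of what the mapping rules in `protobuf_type_mapping` above imply, using the `DataType` and `MapType` constructors already used in this file; the concrete protobuf fields mentioned in the comments are illustrative.

```rust
use risingwave_common::types::{DataType, MapType};

// What a few representative protobuf fields map to under the rules above.
fn mapping_examples() -> Vec<DataType> {
    vec![
        // `uint64` / `fixed64` widen to Decimal because they exceed i64's range.
        DataType::Decimal,
        // `repeated int32 xs = 1;` becomes a list of Int32.
        DataType::List(Box::new(DataType::Int32)),
        // `map<string, int32> m = 2;` goes through the synthetic map-entry message
        // and becomes Map(Varchar, Int32).
        DataType::Map(MapType::from_kv(DataType::Varchar, DataType::Int32)),
    ]
}
```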
#[track_caller] fn check( avro_schema: &str, @@ -992,10 +970,10 @@ fn test_map() { map_map_int(#2): Jsonb, ]"#]], expect![[r#" - Owned(Jsonb(JsonbRef({"a": "x", "b": "y"}))) - Owned(Jsonb(JsonbRef({"m1": {"a": Number(1), "b": Number(2)}, "m2": {"c": Number(3), "d": Number(4)}}))) + Owned(Jsonb({"a": "x", "b": "y"})) + Owned(Jsonb({"m1": {"a": 1, "b": 2}, "m2": {"c": 3, "d": 4}})) ---- - Owned(Jsonb(JsonbRef({}))) - Owned(Jsonb(JsonbRef({})))"#]], + Owned(Jsonb({})) + Owned(Jsonb({}))"#]], ); } diff --git a/src/connector/codec/tests/integration_tests/main.rs b/src/connector/codec/tests/integration_tests/main.rs index 8c718f918d0a6..010fe05936517 100644 --- a/src/connector/codec/tests/integration_tests/main.rs +++ b/src/connector/codec/tests/integration_tests/main.rs @@ -12,6 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. +//! Data-driven testing for converting Avro/Protobuf Schema to RisingWave Schema, and then converting Avro/Protobuf data into RisingWave data. +//! +//! The expected results can be automatically updated. To run and update the tests: +//! ```bash +//! UPDATE_EXPECT=1 cargo test -p risingwave_connector_codec +//! ``` +//! Or use Rust Analyzer. Refer to . +//! +//! ## Why not directly test the uppermost layer `AvroParserConfig` and `AvroAccessBuilder`? +//! +//! Because their interfaces are not clean enough and involve complex logic like schema registry. +//! We might need to separate logic to make them cleaner and then we can use it directly for testing. +//! +//! ## If we reimplement similar logic here, what are we testing? +//! +//! Basically unit tests of `avro_schema_to_column_descs`, `convert_to_datum`, i.e., the type mapping. +//! +//! It makes some sense, as the data parsing logic is generally quite simple (one-liner), and the most +//! complex and error-prone part is the type mapping. +//! +//! ## Why test schema mapping and data mapping together? +//! +//! Because the expected data type for data mapping comes from the schema mapping. + mod avro; +mod protobuf; pub mod utils; diff --git a/src/connector/codec/tests/integration_tests/protobuf.rs b/src/connector/codec/tests/integration_tests/protobuf.rs new file mode 100644 index 0000000000000..9a70ef5e5c7a9 --- /dev/null +++ b/src/connector/codec/tests/integration_tests/protobuf.rs @@ -0,0 +1,719 @@ +// Copyright 2024 RisingWave Labs + +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
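A tiny, self-contained illustration (not part of the patch) of the `expect_test` workflow described in the new module docs above: the `expect![...]` literal is the stored snapshot, and running with `UPDATE_EXPECT=1` rewrites it in place from the actual value.

```rust
#[cfg(test)]
mod expect_sketch {
    use expect_test::expect;

    #[test]
    fn snapshot_example() {
        let actual = format!("{:?}", vec![1, 2, 3]);
        // On mismatch this fails with a diff; with UPDATE_EXPECT=1 the literal is updated.
        expect![["[1, 2, 3]"]].assert_eq(&actual);
    }
}
```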
+ +#[rustfmt::skip] +#[allow(clippy::all)] +mod recursive; +#[rustfmt::skip] +#[allow(clippy::all)] +mod all_types; +use std::collections::HashMap; +use std::path::PathBuf; + +use anyhow::Context; +use prost::Message; +use prost_reflect::{DescriptorPool, DynamicMessage, MessageDescriptor}; +use risingwave_connector_codec::common::protobuf::compile_pb; +use risingwave_connector_codec::decoder::protobuf::parser::*; +use risingwave_connector_codec::decoder::protobuf::ProtobufAccess; +use risingwave_connector_codec::decoder::Access; +use thiserror_ext::AsReport; + +use crate::utils::*; + +/// Refer to [crate level documentation](crate) for the ideas. +#[track_caller] +fn check( + pb_schema: MessageDescriptor, + pb_data: &[&[u8]], + expected_risingwave_schema: expect_test::Expect, + expected_risingwave_data: expect_test::Expect, +) { + let rw_schema = pb_schema_to_column_descs(&pb_schema); + + if let Err(e) = rw_schema { + expected_risingwave_schema.assert_eq(&e.to_report_string_pretty()); + expected_risingwave_data.assert_eq(""); + return; + } + + let rw_schema = rw_schema + .unwrap() + .iter() + .map(ColumnDesc::from) + .collect_vec(); + expected_risingwave_schema.assert_eq(&format!( + "{:#?}", + rw_schema.iter().map(ColumnDescTestDisplay).collect_vec() + )); + + let mut data_str = vec![]; + for data in pb_data { + let access = ProtobufAccess::new(DynamicMessage::decode(pb_schema.clone(), *data).unwrap()); + let mut row = vec![]; + for col in &rw_schema { + let rw_data = access.access(&[&col.name], &col.data_type); + match rw_data { + Ok(data) => row.push(format!("{:#?}", DatumCowTestDisplay(&data))), + Err(e) => row.push(format!( + "~~~~\nError at column `{}`: {}\n~~~~", + col.name, + e.to_report_string() + )), + } + } + data_str.push(format!("{}", row.iter().format("\n"))); + } + + expected_risingwave_data.assert_eq(&format!( + "{}", + data_str + .iter() + .format("\n================================================================\n") + )); +} + +fn load_message_descriptor( + file_name: &str, + message_name: &str, +) -> anyhow::Result { + let location = "tests/test_data/".to_string() + file_name; + let file_content = fs_err::read(&location).unwrap(); + let schema_bytes = if file_name.ends_with(".proto") { + compile_pb((PathBuf::from(&location), file_content), [])? 
+ } else { + file_content + }; + let pool = DescriptorPool::decode(schema_bytes.as_slice()) + .with_context(|| format!("cannot build descriptor pool from schema `{location}`"))?; + + pool.get_message_by_name(message_name).with_context(|| { + format!( + "cannot find message `{}` in schema `{}`", + message_name, location, + ) + }) +} + +#[test] +fn test_simple_schema() -> anyhow::Result<()> { + // Id: 123, + // Address: "test address", + // City: "test city", + // Zipcode: 456, + // Rate: 1.2345, + // Date: "2021-01-01" + static PRE_GEN_PROTO_DATA: &[u8] = b"\x08\x7b\x12\x0c\x74\x65\x73\x74\x20\x61\x64\x64\x72\x65\x73\x73\x1a\x09\x74\x65\x73\x74\x20\x63\x69\x74\x79\x20\xc8\x03\x2d\x19\x04\x9e\x3f\x32\x0a\x32\x30\x32\x31\x2d\x30\x31\x2d\x30\x31"; + + let message_descriptor = + load_message_descriptor("simple-schema.proto", "test.TestRecord").unwrap(); + + // validate the binary data is correct + let value = DynamicMessage::decode(message_descriptor.clone(), PRE_GEN_PROTO_DATA).unwrap(); + expect![[r#" + [ + I32( + 123, + ), + String( + "test address", + ), + String( + "test city", + ), + I64( + 456, + ), + F32( + 1.2345, + ), + String( + "2021-01-01", + ), + ] + "#]] + .assert_debug_eq(&value.fields().map(|f| f.1).collect_vec()); + + check( + message_descriptor, + &[PRE_GEN_PROTO_DATA], + expect![[r#" + [ + id(#1): Int32, + address(#2): Varchar, + city(#3): Varchar, + zipcode(#4): Int64, + rate(#5): Float32, + date(#6): Varchar, + ]"#]], + expect![[r#" + Owned(Int32(123)) + Borrowed(Utf8("test address")) + Borrowed(Utf8("test city")) + Owned(Int64(456)) + Owned(Float32(OrderedFloat(1.2345))) + Borrowed(Utf8("2021-01-01"))"#]], + ); + + Ok(()) +} + +#[test] +fn test_complex_schema() -> anyhow::Result<()> { + let message_descriptor = load_message_descriptor("complex-schema.proto", "test.User").unwrap(); + + check( + message_descriptor, + &[], + expect![[r#" + [ + id(#1): Int32, + code(#2): Varchar, + timestamp(#3): Int64, + xfas(#4): List( + Struct { + device_model_id: Int32, + device_make_id: Int32, + ip: Varchar, + }, + ), type_name: test.Xfa, + contacts(#7): Struct { + emails: List(Varchar), + phones: List(Varchar), + }, type_name: test.Contacts, field_descs: [emails(#5): List(Varchar), phones(#6): List(Varchar)], + sex(#8): Varchar, + ]"#]], + expect![""], + ); + + Ok(()) +} + +#[test] +fn test_any_schema() -> anyhow::Result<()> { + let message_descriptor = load_message_descriptor("any-schema.proto", "test.TestAny").unwrap(); + + // id: 12345 + // name { + // type_url: "type.googleapis.com/test.Int32Value" + // value: "\010\322\376\006" + // } + // Unpacked Int32Value from Any: value: 114514 + static ANY_DATA_1: &[u8] = b"\x08\xb9\x60\x12\x2b\x0a\x23\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x49\x6e\x74\x33\x32\x56\x61\x6c\x75\x65\x12\x04\x08\xd2\xfe\x06"; + + // "id": 12345, + // "any_value": { + // "type_url": "type.googleapis.com/test.AnyValue", + // "value": { + // "any_value_1": { + // "type_url": "type.googleapis.com/test.StringValue", + // "value": "114514" + // }, + // "any_value_2": { + // "type_url": "type.googleapis.com/test.Int32Value", + // "value": 114514 + // } + // } + // } + static ANY_DATA_2: &[u8] = 
b"\x08\xb9\x60\x12\x84\x01\x0a\x21\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x41\x6e\x79\x56\x61\x6c\x75\x65\x12\x5f\x0a\x30\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x56\x61\x6c\x75\x65\x12\x08\x0a\x06\x31\x31\x34\x35\x31\x34\x12\x2b\x0a\x23\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x49\x6e\x74\x33\x32\x56\x61\x6c\x75\x65\x12\x04\x08\xd2\xfe\x06"; + + // id: 12345 + // name { + // type_url: "type.googleapis.com/test.StringValue" + // value: "\n\010John Doe" + // } + static ANY_DATA_3: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x56\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; + + // // id: 12345 + // // any_value: { + // // type_url: "type.googleapis.com/test.StringXalue" + // // value: "\n\010John Doe" + // // } + static ANY_DATA_INVALID: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x58\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; + + // validate the binary data is correct + { + let value1 = DynamicMessage::decode(message_descriptor.clone(), ANY_DATA_1).unwrap(); + expect![[r#" + [ + I32( + 12345, + ), + Message( + DynamicMessage { + desc: MessageDescriptor { + name: "Any", + full_name: "google.protobuf.Any", + is_map_entry: false, + fields: [ + FieldDescriptor { + name: "type_url", + full_name: "google.protobuf.Any.type_url", + json_name: "typeUrl", + number: 1, + kind: string, + cardinality: Optional, + containing_oneof: None, + default_value: None, + is_group: false, + is_list: false, + is_map: false, + is_packed: false, + supports_presence: false, + }, + FieldDescriptor { + name: "value", + full_name: "google.protobuf.Any.value", + json_name: "value", + number: 2, + kind: bytes, + cardinality: Optional, + containing_oneof: None, + default_value: None, + is_group: false, + is_list: false, + is_map: false, + is_packed: false, + supports_presence: false, + }, + ], + oneofs: [], + }, + fields: DynamicMessageFieldSet { + fields: { + 1: Value( + String( + "type.googleapis.com/test.Int32Value", + ), + ), + 2: Value( + Bytes( + b"\x08\xd2\xfe\x06", + ), + ), + }, + }, + }, + ), + ] + "#]] + .assert_debug_eq(&value1.fields().map(|f| f.1).collect_vec()); + + let value2 = DynamicMessage::decode(message_descriptor.clone(), ANY_DATA_2).unwrap(); + expect![[r#" + [ + I32( + 12345, + ), + Message( + DynamicMessage { + desc: MessageDescriptor { + name: "Any", + full_name: "google.protobuf.Any", + is_map_entry: false, + fields: [ + FieldDescriptor { + name: "type_url", + full_name: "google.protobuf.Any.type_url", + json_name: "typeUrl", + number: 1, + kind: string, + cardinality: Optional, + containing_oneof: None, + default_value: None, + is_group: false, + is_list: false, + is_map: false, + is_packed: false, + supports_presence: false, + }, + FieldDescriptor { + name: "value", + full_name: "google.protobuf.Any.value", + json_name: "value", + number: 2, + kind: bytes, + cardinality: Optional, + containing_oneof: None, + default_value: None, + is_group: false, + is_list: false, + is_map: false, + is_packed: false, + supports_presence: false, + }, + ], + oneofs: [], + }, + fields: DynamicMessageFieldSet { + fields: { + 1: 
Value( + String( + "type.googleapis.com/test.AnyValue", + ), + ), + 2: Value( + Bytes( + b"\n0\n$type.googleapis.com/test.StringValue\x12\x08\n\x06114514\x12+\n#type.googleapis.com/test.Int32Value\x12\x04\x08\xd2\xfe\x06", + ), + ), + }, + }, + }, + ), + ] + "#]] + .assert_debug_eq(&value2.fields().map(|f| f.1).collect_vec()); + + let value3 = DynamicMessage::decode(message_descriptor.clone(), ANY_DATA_INVALID).unwrap(); + expect![[r#" + [ + I32( + 12345, + ), + Message( + DynamicMessage { + desc: MessageDescriptor { + name: "Any", + full_name: "google.protobuf.Any", + is_map_entry: false, + fields: [ + FieldDescriptor { + name: "type_url", + full_name: "google.protobuf.Any.type_url", + json_name: "typeUrl", + number: 1, + kind: string, + cardinality: Optional, + containing_oneof: None, + default_value: None, + is_group: false, + is_list: false, + is_map: false, + is_packed: false, + supports_presence: false, + }, + FieldDescriptor { + name: "value", + full_name: "google.protobuf.Any.value", + json_name: "value", + number: 2, + kind: bytes, + cardinality: Optional, + containing_oneof: None, + default_value: None, + is_group: false, + is_list: false, + is_map: false, + is_packed: false, + supports_presence: false, + }, + ], + oneofs: [], + }, + fields: DynamicMessageFieldSet { + fields: { + 1: Value( + String( + "type.googleapis.com/test.StringXalue", + ), + ), + 2: Value( + Bytes( + b"\n\x08John Doe", + ), + ), + }, + }, + }, + ), + ] + "#]] + .assert_debug_eq(&value3.fields().map(|f| f.1).collect_vec()); + } + + check( + message_descriptor, + &[ANY_DATA_1, ANY_DATA_2, ANY_DATA_3, ANY_DATA_INVALID], + expect![[r#" + [ + id(#1): Int32, + any_value(#4): Jsonb, type_name: google.protobuf.Any, field_descs: [type_url(#2): Varchar, value(#3): Bytea], + ]"#]], + expect![[r#" + Owned(Int32(12345)) + Owned(Jsonb({ + "@type": "type.googleapis.com/test.Int32Value", + "value": Number(114514), + })) + ================================================================ + Owned(Int32(12345)) + Owned(Jsonb({ + "@type": "type.googleapis.com/test.AnyValue", + "anyValue1": { + "@type": "type.googleapis.com/test.StringValue", + "value": "114514", + }, + "anyValue2": { + "@type": "type.googleapis.com/test.Int32Value", + "value": Number(114514), + }, + })) + ================================================================ + Owned(Int32(12345)) + Owned(Jsonb({ + "@type": "type.googleapis.com/test.StringValue", + "value": "John Doe", + })) + ================================================================ + Owned(Int32(12345)) + ~~~~ + Error at column `any_value`: Fail to convert protobuf Any into jsonb: message 'test.StringXalue' not found + ~~~~"#]], + ); + + Ok(()) +} + +#[test] +fn test_all_types() -> anyhow::Result<()> { + use self::all_types::all_types::*; + use self::all_types::*; + + let message_descriptor = + load_message_descriptor("all-types.proto", "all_types.AllTypes").unwrap(); + + let data = { + AllTypes { + double_field: 1.2345, + float_field: 1.2345, + int32_field: 42, + int64_field: 1234567890, + uint32_field: 98765, + uint64_field: 9876543210, + sint32_field: -12345, + sint64_field: -987654321, + fixed32_field: 1234, + fixed64_field: 5678, + sfixed32_field: -56789, + sfixed64_field: -123456, + bool_field: true, + string_field: "Hello, Prost!".to_string(), + bytes_field: b"byte data".to_vec(), + enum_field: EnumType::Option1 as i32, + nested_message_field: Some(NestedMessage { + id: 100, + name: "Nested".to_string(), + }), + repeated_int_field: vec![1, 2, 3, 4, 5], + map_field: HashMap::from_iter([ 
+ ("key1".to_string(), 1), + ("key2".to_string(), 2), + ("key3".to_string(), 3), + ]), + timestamp_field: Some(::prost_types::Timestamp { + seconds: 1630927032, + nanos: 500000000, + }), + duration_field: Some(::prost_types::Duration { + seconds: 60, + nanos: 500000000, + }), + any_field: Some(::prost_types::Any { + type_url: "type.googleapis.com/my_custom_type".to_string(), + value: b"My custom data".to_vec(), + }), + int32_value_field: Some(42), + string_value_field: Some("Hello, Wrapper!".to_string()), + example_oneof: Some(ExampleOneof::OneofInt32(123)), + map_struct_field: HashMap::from_iter([ + ( + "key1".to_string(), + NestedMessage { + id: 1, + name: "A".to_string(), + }, + ), + ( + "key2".to_string(), + NestedMessage { + id: 2, + name: "B".to_string(), + }, + ), + ]), + map_enum_field: HashMap::from_iter([ + (1, EnumType::Option1 as i32), + (2, EnumType::Option2 as i32), + ]), + } + }; + let mut data_bytes = Vec::new(); + data.encode(&mut data_bytes).unwrap(); + + check( + message_descriptor, + &[&data_bytes], + expect![[r#" + [ + double_field(#1): Float64, + float_field(#2): Float32, + int32_field(#3): Int32, + int64_field(#4): Int64, + uint32_field(#5): Int64, + uint64_field(#6): Decimal, + sint32_field(#7): Int32, + sint64_field(#8): Int64, + fixed32_field(#9): Int64, + fixed64_field(#10): Decimal, + sfixed32_field(#11): Int32, + sfixed64_field(#12): Int64, + bool_field(#13): Boolean, + string_field(#14): Varchar, + bytes_field(#15): Bytea, + enum_field(#16): Varchar, + nested_message_field(#19): Struct { + id: Int32, + name: Varchar, + }, type_name: all_types.AllTypes.NestedMessage, field_descs: [id(#17): Int32, name(#18): Varchar], + repeated_int_field(#20): List(Int32), + oneof_string(#21): Varchar, + oneof_int32(#22): Int32, + oneof_enum(#23): Varchar, + map_field(#26): Map(Varchar,Int32), type_name: all_types.AllTypes.MapFieldEntry, field_descs: [key(#24): Varchar, value(#25): Int32], + timestamp_field(#29): Struct { + seconds: Int64, + nanos: Int32, + }, type_name: google.protobuf.Timestamp, field_descs: [seconds(#27): Int64, nanos(#28): Int32], + duration_field(#32): Struct { + seconds: Int64, + nanos: Int32, + }, type_name: google.protobuf.Duration, field_descs: [seconds(#30): Int64, nanos(#31): Int32], + any_field(#35): Jsonb, type_name: google.protobuf.Any, field_descs: [type_url(#33): Varchar, value(#34): Bytea], + int32_value_field(#37): Struct { value: Int32 }, type_name: google.protobuf.Int32Value, field_descs: [value(#36): Int32], + string_value_field(#39): Struct { value: Varchar }, type_name: google.protobuf.StringValue, field_descs: [value(#38): Varchar], + map_struct_field(#44): Map(Varchar,Struct { id: Int32, name: Varchar }), type_name: all_types.AllTypes.MapStructFieldEntry, field_descs: [key(#40): Varchar, value(#43): Struct { + id: Int32, + name: Varchar, + }, type_name: all_types.AllTypes.NestedMessage, field_descs: [id(#41): Int32, name(#42): Varchar]], + map_enum_field(#47): Map(Int32,Varchar), type_name: all_types.AllTypes.MapEnumFieldEntry, field_descs: [key(#45): Int32, value(#46): Varchar], + ]"#]], + expect![[r#" + Owned(Float64(OrderedFloat(1.2345))) + Owned(Float32(OrderedFloat(1.2345))) + Owned(Int32(42)) + Owned(Int64(1234567890)) + Owned(Int64(98765)) + Owned(Decimal(Normalized(9876543210))) + Owned(Int32(-12345)) + Owned(Int64(-987654321)) + Owned(Int64(1234)) + Owned(Decimal(Normalized(5678))) + Owned(Int32(-56789)) + Owned(Int64(-123456)) + Owned(Bool(true)) + Borrowed(Utf8("Hello, Prost!")) + Borrowed(Bytea([98, 121, 116, 101, 32, 100, 
97, 116, 97])) + Owned(Utf8("OPTION1")) + Owned(StructValue( + Int32(100), + Utf8("Nested"), + )) + Owned([ + Int32(1), + Int32(2), + Int32(3), + Int32(4), + Int32(5), + ]) + Owned(Utf8("")) + Owned(Int32(123)) + Owned(Utf8("DEFAULT")) + Owned([ + StructValue( + Utf8("key1"), + Int32(1), + ), + StructValue( + Utf8("key2"), + Int32(2), + ), + StructValue( + Utf8("key3"), + Int32(3), + ), + ]) + Owned(StructValue( + Int64(1630927032), + Int32(500000000), + )) + Owned(StructValue( + Int64(60), + Int32(500000000), + )) + ~~~~ + Error at column `any_field`: Fail to convert protobuf Any into jsonb: message 'my_custom_type' not found + ~~~~ + Owned(StructValue(Int32(42))) + Owned(StructValue(Utf8("Hello, Wrapper!"))) + Owned([ + StructValue( + Utf8("key1"), + StructValue( + Int32(1), + Utf8("A"), + ), + ), + StructValue( + Utf8("key2"), + StructValue( + Int32(2), + Utf8("B"), + ), + ), + ]) + Owned([ + StructValue( + Int32(1), + Utf8("OPTION1"), + ), + StructValue( + Int32(2), + Utf8("OPTION2"), + ), + ])"#]], + ); + + Ok(()) +} + +#[test] +fn test_recursive() -> anyhow::Result<()> { + let message_descriptor = + load_message_descriptor("recursive.proto", "recursive.ComplexRecursiveMessage").unwrap(); + + check( + message_descriptor, + &[], + expect![[r#" + failed to map protobuf type + + Caused by: + circular reference detected: parent(recursive.ComplexRecursiveMessage.parent)->siblings(recursive.ComplexRecursiveMessage.Parent.siblings), conflict with parent(recursive.ComplexRecursiveMessage.parent), kind recursive.ComplexRecursiveMessage.Parent + "#]], + expect![""], + ); + + Ok(()) +} diff --git a/src/connector/src/parser/protobuf/.gitignore b/src/connector/codec/tests/integration_tests/protobuf/.gitignore similarity index 50% rename from src/connector/src/parser/protobuf/.gitignore rename to src/connector/codec/tests/integration_tests/protobuf/.gitignore index 4109deeeb3337..6e5bea6ee81ce 100644 --- a/src/connector/src/parser/protobuf/.gitignore +++ b/src/connector/codec/tests/integration_tests/protobuf/.gitignore @@ -1 +1,2 @@ recursive.rs +all_types.rs diff --git a/src/connector/codec/tests/integration_tests/utils.rs b/src/connector/codec/tests/integration_tests/utils.rs index dd375656c51e3..889dbeffc306f 100644 --- a/src/connector/codec/tests/integration_tests/utils.rs +++ b/src/connector/codec/tests/integration_tests/utils.rs @@ -40,10 +40,15 @@ impl<'a> std::fmt::Debug for DataTypeTestDisplay<'a> { f.finish()?; Ok(()) } - DataType::List(t) => f - .debug_tuple("List") - .field(&DataTypeTestDisplay(t)) - .finish(), + DataType::List(t) => { + if t.is_struct() { + f.debug_tuple("List") + .field(&DataTypeTestDisplay(t)) + .finish() + } else { + write!(f, "List({:?})", &DataTypeTestDisplay(t)) + } + } DataType::Map(m) => { write!( f, @@ -88,6 +93,14 @@ impl<'a> std::fmt::Debug for ScalarRefImplTestDisplay<'a> { .debug_list() .entries(m.inner().iter().map(DatumRefTestDisplay)) .finish(), + ScalarRefImpl::Jsonb(j) => { + let compact_str = format!("{}", j); + if compact_str.len() > 50 { + write!(f, "Jsonb({:#?})", jsonbb::ValueRef::from(j)) + } else { + write!(f, "Jsonb({:#})", j) + } + } _ => { // do not use alternative display for simple types write!(f, "{:?}", self.0) @@ -174,7 +187,13 @@ impl<'a> std::fmt::Debug for ColumnDescTestDisplay<'a> { write!(f, ", type_name: {}", type_name)?; } if !field_descs.is_empty() { - write!(f, ", field_descs: {:?}", field_descs)?; + write!( + f, + ", field_descs: [{}]", + field_descs.iter().format_with(", ", |field_desc, f| { + f(&format_args!("{:?}", 
ColumnDescTestDisplay(field_desc))) + }) + )?; } if let Some(generated_or_default_column) = generated_or_default_column { write!( diff --git a/src/connector/src/test_data/proto_recursive/recursive.pb b/src/connector/codec/tests/test_data/all-types.pb similarity index 76% rename from src/connector/src/test_data/proto_recursive/recursive.pb rename to src/connector/codec/tests/test_data/all-types.pb index 5c611c18d0d30..177976d5244ad 100644 Binary files a/src/connector/src/test_data/proto_recursive/recursive.pb and b/src/connector/codec/tests/test_data/all-types.pb differ diff --git a/src/connector/codec/tests/test_data/all-types.proto b/src/connector/codec/tests/test_data/all-types.proto new file mode 100644 index 0000000000000..5070328dbf5f3 --- /dev/null +++ b/src/connector/codec/tests/test_data/all-types.proto @@ -0,0 +1,79 @@ +syntax = "proto3"; + +import "google/protobuf/timestamp.proto"; +import "google/protobuf/duration.proto"; +import "google/protobuf/any.proto"; +import "google/protobuf/wrappers.proto"; + +package all_types; + +// all-types.pb is generated by `protoc all-types.proto -o all-types.pb --include_imports` in the current directory. + +message AllTypes { + // standard types + double double_field = 1; + float float_field = 2; + int32 int32_field = 3; + int64 int64_field = 4; + uint32 uint32_field = 5; + uint64 uint64_field = 6; + sint32 sint32_field = 7; + sint64 sint64_field = 8; + fixed32 fixed32_field = 9; + fixed64 fixed64_field = 10; + sfixed32 sfixed32_field = 11; + sfixed64 sfixed64_field = 12; + bool bool_field = 13; + string string_field = 14; + + bytes bytes_field = 15; + + // enum + enum EnumType { + DEFAULT = 0; + OPTION1 = 1; + OPTION2 = 2; + } + EnumType enum_field = 16; + + // nested message + message NestedMessage { + int32 id = 1; + string name = 2; + } + NestedMessage nested_message_field = 17; + + // repeated field + repeated int32 repeated_int_field = 18; + + // oneof field + oneof example_oneof { + string oneof_string = 19; + int32 oneof_int32 = 20; + EnumType oneof_enum = 21; + } + + // map field + map map_field = 22; + + // timestamp + google.protobuf.Timestamp timestamp_field = 23; + + // duration + google.protobuf.Duration duration_field = 24; + + // any + google.protobuf.Any any_field = 25; + + // -- Unsupported + // // struct + // import "google/protobuf/struct.proto"; + // google.protobuf.Struct struct_field = 26; + + // wrapper types + google.protobuf.Int32Value int32_value_field = 27; + google.protobuf.StringValue string_value_field = 28; + + map map_struct_field = 29; + map map_enum_field = 30; + } diff --git a/src/connector/src/test_data/any-schema.proto b/src/connector/codec/tests/test_data/any-schema.proto similarity index 99% rename from src/connector/src/test_data/any-schema.proto rename to src/connector/codec/tests/test_data/any-schema.proto index 12a367100ce7d..6bd9dcdf32b8f 100644 --- a/src/connector/src/test_data/any-schema.proto +++ b/src/connector/codec/tests/test_data/any-schema.proto @@ -35,4 +35,4 @@ message StringStringInt32Value { message Float32StringValue { float first = 1; string second = 2; -} \ No newline at end of file +} diff --git a/src/connector/src/test_data/complex-schema.proto b/src/connector/codec/tests/test_data/complex-schema.proto similarity index 100% rename from src/connector/src/test_data/complex-schema.proto rename to src/connector/codec/tests/test_data/complex-schema.proto diff --git a/src/connector/codec/tests/test_data/recursive.proto b/src/connector/codec/tests/test_data/recursive.proto new file mode 
100644 index 0000000000000..a26a6a98e172f --- /dev/null +++ b/src/connector/codec/tests/test_data/recursive.proto @@ -0,0 +1,24 @@ +syntax = "proto3"; + +package recursive; + +message ComplexRecursiveMessage { + string node_name = 1; + int32 node_id = 2; + + message Attributes { + string key = 1; + string value = 2; + } + + repeated Attributes attributes = 3; + + message Parent { + string parent_name = 1; + int32 parent_id = 2; + repeated ComplexRecursiveMessage siblings = 3; + } + + Parent parent = 4; + repeated ComplexRecursiveMessage children = 5; +} diff --git a/src/connector/src/test_data/simple-schema.proto b/src/connector/codec/tests/test_data/simple-schema.proto similarity index 100% rename from src/connector/src/test_data/simple-schema.proto rename to src/connector/codec/tests/test_data/simple-schema.proto diff --git a/src/connector/src/connector_common/mqtt_common.rs b/src/connector/src/connector_common/mqtt_common.rs index c967cf215fd2d..c883e459a49e8 100644 --- a/src/connector/src/connector_common/mqtt_common.rs +++ b/src/connector/src/connector_common/mqtt_common.rs @@ -13,6 +13,7 @@ // limitations under the License. use rumqttc::tokio_rustls::rustls; +use rumqttc::v5::mqttbytes::v5::ConnectProperties; use rumqttc::v5::mqttbytes::QoS; use rumqttc::v5::{AsyncClient, EventLoop, MqttOptions}; use serde_derive::Deserialize; @@ -71,6 +72,10 @@ pub struct MqttCommon { #[serde_as(as = "Option")] pub inflight_messages: Option, + /// The max size of messages received by the MQTT client + #[serde_as(as = "Option")] + pub max_packet_size: Option, + /// Path to CA certificate file for verifying the broker's key. #[serde(rename = "tls.client_key")] pub ca: Option, @@ -111,6 +116,10 @@ impl MqttCommon { options.set_clean_start(self.clean_start); + let mut connect_properties = ConnectProperties::new(); + connect_properties.max_packet_size = self.max_packet_size; + options.set_connect_properties(connect_properties); + if ssl { let tls_config = self.get_tls_config()?; options.set_transport(rumqttc::Transport::tls_with_config( diff --git a/src/connector/src/parser/additional_columns.rs b/src/connector/src/parser/additional_columns.rs index c30f5f74ba390..645220b401c5a 100644 --- a/src/connector/src/parser/additional_columns.rs +++ b/src/connector/src/parser/additional_columns.rs @@ -24,15 +24,15 @@ use risingwave_pb::plan_common::additional_column::ColumnType as AdditionalColum use risingwave_pb::plan_common::{ AdditionalCollectionName, AdditionalColumn, AdditionalColumnFilename, AdditionalColumnHeader, AdditionalColumnHeaders, AdditionalColumnKey, AdditionalColumnOffset, - AdditionalColumnPartition, AdditionalColumnTimestamp, AdditionalDatabaseName, - AdditionalSchemaName, AdditionalTableName, + AdditionalColumnPartition, AdditionalColumnPayload, AdditionalColumnTimestamp, + AdditionalDatabaseName, AdditionalSchemaName, AdditionalTableName, }; use crate::error::ConnectorResult; use crate::source::cdc::MONGODB_CDC_CONNECTOR; use crate::source::{ AZBLOB_CONNECTOR, GCS_CONNECTOR, KAFKA_CONNECTOR, KINESIS_CONNECTOR, OPENDAL_S3_CONNECTOR, - POSIX_FS_CONNECTOR, PULSAR_CONNECTOR, S3_CONNECTOR, + POSIX_FS_CONNECTOR, PULSAR_CONNECTOR, }; // Hidden additional columns connectors which do not support `include` syntax. 
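A standalone sketch (not part of the patch) of what the new `max_packet_size` option does, assuming the rumqttc v5 API used in the hunk above and that the option is an `Option<u32>` as the struct field suggests: it is forwarded to the broker in the MQTT v5 CONNECT properties, capping the size of packets the client will accept.

```rust
use rumqttc::v5::mqttbytes::v5::ConnectProperties;
use rumqttc::v5::MqttOptions;

fn apply_max_packet_size(options: &mut MqttOptions, max_packet_size: Option<u32>) {
    // Sent to the broker in the MQTT v5 CONNECT packet; `None` keeps the default.
    let mut props = ConnectProperties::new();
    props.max_packet_size = max_packet_size;
    options.set_connect_properties(props);
}
```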
@@ -44,21 +44,36 @@ pub static COMPATIBLE_ADDITIONAL_COLUMNS: LazyLock ColumnDesc::named_with_additional_column( + column_name, + column_id, + DataType::Jsonb, + AdditionalColumn { + column_type: Some(AdditionalColumnType::Payload(AdditionalColumnPayload {})), + }, + ), "offset" => ColumnDesc::named_with_additional_column( column_name, column_id, diff --git a/src/connector/src/parser/mod.rs b/src/connector/src/parser/mod.rs index 4b14654bf518d..2142914aa2503 100644 --- a/src/connector/src/parser/mod.rs +++ b/src/connector/src/parser/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::fmt::Debug; use std::sync::LazyLock; @@ -488,6 +489,11 @@ impl SourceStreamChunkRowWriter<'_> { .map(|ele| ScalarRefImpl::Utf8(ele.split_id)), )); } + (_, &Some(AdditionalColumnType::Payload(_))) => { + // ingest the whole payload as a single column + // do special logic in `KvEvent::access_field` + parse_field(desc) + } (_, _) => { // For normal columns, call the user provided closure. parse_field(desc) @@ -710,6 +716,7 @@ async fn into_chunk_stream_inner( len: usize, } let mut current_transaction = None; + let mut direct_cdc_event_lag_latency_metrics = HashMap::new(); #[for_await] for batch in data_stream { @@ -759,10 +766,15 @@ async fn into_chunk_stream_inner( if let SourceMeta::DebeziumCdc(msg_meta) = &msg.meta { let lag_ms = process_time_ms - msg_meta.source_ts_ms; // report to promethus - GLOBAL_SOURCE_METRICS - .direct_cdc_event_lag_latency - .with_guarded_label_values(&[&msg_meta.full_table_name]) - .observe(lag_ms as f64); + let full_table_name = msg_meta.full_table_name.clone(); + let direct_cdc_event_lag_latency = direct_cdc_event_lag_latency_metrics + .entry(full_table_name) + .or_insert_with(|| { + GLOBAL_SOURCE_METRICS + .direct_cdc_event_lag_latency + .with_guarded_label_values(&[&msg_meta.full_table_name]) + }); + direct_cdc_event_lag_latency.observe(lag_ms as f64); } let old_len = builder.len(); diff --git a/src/connector/src/parser/protobuf/mod.rs b/src/connector/src/parser/protobuf/mod.rs index bfcb0adfe1a18..462e478932ee7 100644 --- a/src/connector/src/parser/protobuf/mod.rs +++ b/src/connector/src/parser/protobuf/mod.rs @@ -14,7 +14,3 @@ mod parser; pub use parser::*; - -#[rustfmt::skip] -#[cfg(test)] -mod recursive; diff --git a/src/connector/src/parser/protobuf/parser.rs b/src/connector/src/parser/protobuf/parser.rs index ec8c747cafd5a..93eeb19cc1565 100644 --- a/src/connector/src/parser/protobuf/parser.rs +++ b/src/connector/src/parser/protobuf/parser.rs @@ -13,23 +13,14 @@ // limitations under the License. 
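A minimal sketch (not part of the patch) of the caching pattern introduced above for `direct_cdc_event_lag_latency`: resolving labels on a metric vec for every message is comparatively expensive, so the label-resolved handle is memoized per table name. Plain `prometheus` types stand in here for RisingWave's guarded metric wrappers.

```rust
use std::collections::HashMap;

use prometheus::{Histogram, HistogramVec};

fn observe_cdc_lag(
    cache: &mut HashMap<String, Histogram>,
    lag_latency_vec: &HistogramVec,
    full_table_name: &str,
    lag_ms: f64,
) {
    // Resolve the label only the first time this table is seen; reuse the handle afterwards.
    let histogram = cache
        .entry(full_table_name.to_string())
        .or_insert_with(|| lag_latency_vec.with_label_values(&[full_table_name]));
    histogram.observe(lag_ms);
}
```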
use anyhow::Context; -use itertools::Itertools; -use prost_reflect::{ - Cardinality, DescriptorPool, DynamicMessage, FieldDescriptor, FileDescriptor, Kind, - MessageDescriptor, ReflectMessage, Value, -}; -use risingwave_common::array::{ListValue, StructValue}; -use risingwave_common::types::{ - DataType, DatumCow, Decimal, JsonbVal, ScalarImpl, ToOwnedDatum, F32, F64, -}; +use prost_reflect::{DescriptorPool, DynamicMessage, FileDescriptor, MessageDescriptor}; use risingwave_common::{bail, try_match_expand}; -use risingwave_pb::plan_common::{AdditionalColumn, ColumnDesc, ColumnDescVersion}; -use thiserror::Error; -use thiserror_ext::{AsReport, Macro}; +pub use risingwave_connector_codec::decoder::protobuf::parser::*; +use risingwave_connector_codec::decoder::protobuf::ProtobufAccess; +use risingwave_pb::plan_common::ColumnDesc; use crate::error::ConnectorResult; -use crate::parser::unified::protobuf::ProtobufAccess; -use crate::parser::unified::{uncategorized, AccessError, AccessImpl, AccessResult}; +use crate::parser::unified::AccessImpl; use crate::parser::util::bytes_from_url; use crate::parser::{AccessBuilder, EncodingProperties}; use crate::schema::schema_registry::{extract_schema_id, handle_sr_list, Client, WireFormatError}; @@ -124,207 +115,10 @@ impl ProtobufParserConfig { /// Maps the protobuf schema to relational schema. pub fn map_to_columns(&self) -> ConnectorResult> { - let mut columns = Vec::with_capacity(self.message_descriptor.fields().len()); - let mut index = 0; - let mut parse_trace: Vec = vec![]; - for field in self.message_descriptor.fields() { - columns.push(Self::pb_field_to_col_desc( - &field, - &mut index, - &mut parse_trace, - )?); - } - - Ok(columns) - } - - /// Maps a protobuf field to a RW column. - fn pb_field_to_col_desc( - field_descriptor: &FieldDescriptor, - index: &mut i32, - parse_trace: &mut Vec, - ) -> ConnectorResult { - let field_type = protobuf_type_mapping(field_descriptor, parse_trace) - .context("failed to map protobuf type")?; - if let Kind::Message(m) = field_descriptor.kind() { - let field_descs = if let DataType::List { .. } = field_type { - vec![] - } else { - m.fields() - .map(|f| Self::pb_field_to_col_desc(&f, index, parse_trace)) - .try_collect()? 
- }; - *index += 1; - Ok(ColumnDesc { - column_id: *index, - name: field_descriptor.name().to_string(), - column_type: Some(field_type.to_protobuf()), - field_descs, - type_name: m.full_name().to_string(), - generated_or_default_column: None, - description: None, - additional_column_type: 0, // deprecated - additional_column: Some(AdditionalColumn { column_type: None }), - version: ColumnDescVersion::Pr13707 as i32, - }) - } else { - *index += 1; - Ok(ColumnDesc { - column_id: *index, - name: field_descriptor.name().to_string(), - column_type: Some(field_type.to_protobuf()), - additional_column: Some(AdditionalColumn { column_type: None }), - version: ColumnDescVersion::Pr13707 as i32, - ..Default::default() - }) - } + pb_schema_to_column_descs(&self.message_descriptor).map_err(|e| e.into()) } } -#[derive(Error, Debug, Macro)] -#[error("{0}")] -struct ProtobufTypeError(#[message] String); - -fn detect_loop_and_push( - trace: &mut Vec, - fd: &FieldDescriptor, -) -> std::result::Result<(), ProtobufTypeError> { - let identifier = format!("{}({})", fd.name(), fd.full_name()); - if trace.iter().any(|s| s == identifier.as_str()) { - bail_protobuf_type_error!( - "circular reference detected: {}, conflict with {}, kind {:?}", - trace.iter().format("->"), - identifier, - fd.kind(), - ); - } - trace.push(identifier); - Ok(()) -} - -pub fn from_protobuf_value<'a>( - field_desc: &FieldDescriptor, - value: &'a Value, -) -> AccessResult> { - let kind = field_desc.kind(); - - macro_rules! borrowed { - ($v:expr) => { - return Ok(DatumCow::Borrowed(Some($v.into()))) - }; - } - - let v: ScalarImpl = match value { - Value::Bool(v) => ScalarImpl::Bool(*v), - Value::I32(i) => ScalarImpl::Int32(*i), - Value::U32(i) => ScalarImpl::Int64(*i as i64), - Value::I64(i) => ScalarImpl::Int64(*i), - Value::U64(i) => ScalarImpl::Decimal(Decimal::from(*i)), - Value::F32(f) => ScalarImpl::Float32(F32::from(*f)), - Value::F64(f) => ScalarImpl::Float64(F64::from(*f)), - Value::String(s) => borrowed!(s.as_str()), - Value::EnumNumber(idx) => { - let enum_desc = kind.as_enum().ok_or_else(|| AccessError::TypeError { - expected: "enum".to_owned(), - got: format!("{kind:?}"), - value: value.to_string(), - })?; - let enum_symbol = enum_desc.get_value(*idx).ok_or_else(|| { - uncategorized!("unknown enum index {} of enum {:?}", idx, enum_desc) - })?; - ScalarImpl::Utf8(enum_symbol.name().into()) - } - Value::Message(dyn_msg) => { - if dyn_msg.descriptor().full_name() == "google.protobuf.Any" { - ScalarImpl::Jsonb(JsonbVal::from( - serde_json::to_value(dyn_msg).map_err(AccessError::ProtobufAnyToJson)?, - )) - } else { - let mut rw_values = Vec::with_capacity(dyn_msg.descriptor().fields().len()); - // fields is a btree map in descriptor - // so it's order is the same as datatype - for field_desc in dyn_msg.descriptor().fields() { - // missing field - if !dyn_msg.has_field(&field_desc) - && field_desc.cardinality() == Cardinality::Required - { - return Err(AccessError::Undefined { - name: field_desc.name().to_owned(), - path: dyn_msg.descriptor().full_name().to_owned(), - }); - } - // use default value if dyn_msg doesn't has this field - let value = dyn_msg.get_field(&field_desc); - rw_values.push(from_protobuf_value(&field_desc, &value)?.to_owned_datum()); - } - ScalarImpl::Struct(StructValue::new(rw_values)) - } - } - Value::List(values) => { - let data_type = protobuf_type_mapping(field_desc, &mut vec![]) - .map_err(|e| uncategorized!("{}", e.to_report_string()))?; - let mut builder = 
data_type.as_list().create_array_builder(values.len()); - for value in values { - builder.append(from_protobuf_value(field_desc, value)?); - } - ScalarImpl::List(ListValue::new(builder.finish())) - } - Value::Bytes(value) => borrowed!(&**value), - _ => { - return Err(AccessError::UnsupportedType { - ty: format!("{kind:?}"), - }); - } - }; - Ok(Some(v).into()) -} - -/// Maps protobuf type to RW type. -fn protobuf_type_mapping( - field_descriptor: &FieldDescriptor, - parse_trace: &mut Vec, -) -> std::result::Result { - detect_loop_and_push(parse_trace, field_descriptor)?; - let field_type = field_descriptor.kind(); - let mut t = match field_type { - Kind::Bool => DataType::Boolean, - Kind::Double => DataType::Float64, - Kind::Float => DataType::Float32, - Kind::Int32 | Kind::Sint32 | Kind::Sfixed32 => DataType::Int32, - // Fixed32 represents [0, 2^32 - 1]. It's equal to u32. - Kind::Int64 | Kind::Sint64 | Kind::Sfixed64 | Kind::Uint32 | Kind::Fixed32 => { - DataType::Int64 - } - Kind::Uint64 | Kind::Fixed64 => DataType::Decimal, - Kind::String => DataType::Varchar, - Kind::Message(m) => match m.full_name() { - // Well-Known Types are identified by their full name - "google.protobuf.Any" => DataType::Jsonb, - _ => { - let fields = m - .fields() - .map(|f| protobuf_type_mapping(&f, parse_trace)) - .try_collect()?; - let field_names = m.fields().map(|f| f.name().to_string()).collect_vec(); - DataType::new_struct(fields, field_names) - } - }, - Kind::Enum(_) => DataType::Varchar, - Kind::Bytes => DataType::Bytea, - }; - if field_descriptor.is_map() { - bail_protobuf_type_error!( - "protobuf map type (on field `{}`) is not supported", - field_descriptor.full_name() - ); - } - if field_descriptor.cardinality() == Cardinality::Repeated { - t = DataType::List(Box::new(t)) - } - _ = parse_trace.pop(); - Ok(t) -} - /// A port from the implementation of confluent's Varint Zig-zag deserialization. 
/// See `ReadVarint` in fn decode_varint_zigzag(buffer: &[u8]) -> ConnectorResult<(i32, usize)> { @@ -380,601 +174,7 @@ pub(crate) fn resolve_pb_header(payload: &[u8]) -> ConnectorResult<&[u8]> { #[cfg(test)] mod test { - use std::path::PathBuf; - - use prost::Message; - use risingwave_common::types::StructType; - use risingwave_connector_codec::decoder::AccessExt; - use risingwave_pb::catalog::StreamSourceInfo; - use risingwave_pb::data::data_type::PbTypeName; - use risingwave_pb::plan_common::{PbEncodeType, PbFormatType}; - use serde_json::json; - use super::*; - use crate::parser::protobuf::recursive::all_types::{EnumType, ExampleOneof, NestedMessage}; - use crate::parser::protobuf::recursive::AllTypes; - use crate::parser::SpecificParserConfig; - - fn schema_dir() -> String { - let dir = PathBuf::from("src/test_data"); - format!( - "file://{}", - std::fs::canonicalize(dir).unwrap().to_str().unwrap() - ) - } - - // Id: 123, - // Address: "test address", - // City: "test city", - // Zipcode: 456, - // Rate: 1.2345, - // Date: "2021-01-01" - static PRE_GEN_PROTO_DATA: &[u8] = b"\x08\x7b\x12\x0c\x74\x65\x73\x74\x20\x61\x64\x64\x72\x65\x73\x73\x1a\x09\x74\x65\x73\x74\x20\x63\x69\x74\x79\x20\xc8\x03\x2d\x19\x04\x9e\x3f\x32\x0a\x32\x30\x32\x31\x2d\x30\x31\x2d\x30\x31"; - - #[tokio::test] - async fn test_simple_schema() -> crate::error::ConnectorResult<()> { - let location = schema_dir() + "/simple-schema"; - println!("location: {}", location); - let message_name = "test.TestRecord"; - let info = StreamSourceInfo { - proto_message_name: message_name.to_string(), - row_schema_location: location.to_string(), - use_schema_registry: false, - format: PbFormatType::Plain.into(), - row_encode: PbEncodeType::Protobuf.into(), - ..Default::default() - }; - let parser_config = SpecificParserConfig::new(&info, &Default::default())?; - let conf = ProtobufParserConfig::new(parser_config.encoding_config).await?; - let value = DynamicMessage::decode(conf.message_descriptor, PRE_GEN_PROTO_DATA).unwrap(); - - assert_eq!( - value.get_field_by_name("id").unwrap().into_owned(), - Value::I32(123) - ); - assert_eq!( - value.get_field_by_name("address").unwrap().into_owned(), - Value::String("test address".to_string()) - ); - assert_eq!( - value.get_field_by_name("city").unwrap().into_owned(), - Value::String("test city".to_string()) - ); - assert_eq!( - value.get_field_by_name("zipcode").unwrap().into_owned(), - Value::I64(456) - ); - assert_eq!( - value.get_field_by_name("rate").unwrap().into_owned(), - Value::F32(1.2345) - ); - assert_eq!( - value.get_field_by_name("date").unwrap().into_owned(), - Value::String("2021-01-01".to_string()) - ); - - Ok(()) - } - - #[tokio::test] - async fn test_complex_schema() -> crate::error::ConnectorResult<()> { - let location = schema_dir() + "/complex-schema"; - let message_name = "test.User"; - - let info = StreamSourceInfo { - proto_message_name: message_name.to_string(), - row_schema_location: location.to_string(), - use_schema_registry: false, - format: PbFormatType::Plain.into(), - row_encode: PbEncodeType::Protobuf.into(), - ..Default::default() - }; - let parser_config = SpecificParserConfig::new(&info, &Default::default())?; - let conf = ProtobufParserConfig::new(parser_config.encoding_config).await?; - let columns = conf.map_to_columns().unwrap(); - - assert_eq!(columns[0].name, "id".to_string()); - assert_eq!(columns[1].name, "code".to_string()); - assert_eq!(columns[2].name, "timestamp".to_string()); - - let data_type = columns[3].column_type.as_ref().unwrap(); - 
assert_eq!(data_type.get_type_name().unwrap(), PbTypeName::List); - let inner_field_type = data_type.field_type.clone(); - assert_eq!( - inner_field_type[0].get_type_name().unwrap(), - PbTypeName::Struct - ); - let struct_inner = inner_field_type[0].field_type.clone(); - assert_eq!(struct_inner[0].get_type_name().unwrap(), PbTypeName::Int32); - assert_eq!(struct_inner[1].get_type_name().unwrap(), PbTypeName::Int32); - assert_eq!( - struct_inner[2].get_type_name().unwrap(), - PbTypeName::Varchar - ); - - assert_eq!(columns[4].name, "contacts".to_string()); - let inner_field_type = columns[4].column_type.as_ref().unwrap().field_type.clone(); - assert_eq!( - inner_field_type[0].get_type_name().unwrap(), - PbTypeName::List - ); - assert_eq!( - inner_field_type[1].get_type_name().unwrap(), - PbTypeName::List - ); - Ok(()) - } - - #[tokio::test] - async fn test_refuse_recursive_proto_message() { - let location = schema_dir() + "/proto_recursive/recursive.pb"; - let message_name = "recursive.ComplexRecursiveMessage"; - - let info = StreamSourceInfo { - proto_message_name: message_name.to_string(), - row_schema_location: location.to_string(), - use_schema_registry: false, - format: PbFormatType::Plain.into(), - row_encode: PbEncodeType::Protobuf.into(), - ..Default::default() - }; - let parser_config = SpecificParserConfig::new(&info, &Default::default()).unwrap(); - let conf = ProtobufParserConfig::new(parser_config.encoding_config) - .await - .unwrap(); - let columns = conf.map_to_columns(); - // expect error message: - // "Err(Protocol error: circular reference detected: - // parent(recursive.ComplexRecursiveMessage.parent)->siblings(recursive. - // ComplexRecursiveMessage.Parent.siblings), conflict with - // parent(recursive.ComplexRecursiveMessage.parent), kind - // recursive.ComplexRecursiveMessage.Parent" - assert!(columns.is_err()); - } - - async fn create_recursive_pb_parser_config( - location: &str, - message_name: &str, - ) -> ProtobufParserConfig { - let location = schema_dir() + location; - - let info = StreamSourceInfo { - proto_message_name: message_name.to_string(), - row_schema_location: location.to_string(), - use_schema_registry: false, - format: PbFormatType::Plain.into(), - row_encode: PbEncodeType::Protobuf.into(), - ..Default::default() - }; - let parser_config = SpecificParserConfig::new(&info, &Default::default()).unwrap(); - - ProtobufParserConfig::new(parser_config.encoding_config) - .await - .unwrap() - } - - #[tokio::test] - async fn test_all_types_create_source() { - let conf = create_recursive_pb_parser_config( - "/proto_recursive/recursive.pb", - "recursive.AllTypes", - ) - .await; - - // Ensure that the parser can recognize the schema. 
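// A minimal sketch of the widening rules checked by the expectations below,
// using hypothetical local enums instead of `prost_reflect::Kind` and
// RisingWave's `DataType`: unsigned 32-bit values widen to Int64 and unsigned
// 64-bit values widen to Decimal so every protobuf value stays representable,
// and a repeated field wraps its element type in a list.
#[derive(Debug, PartialEq)]
enum PbKind { Bool, Double, Float, Int32, Uint32, Int64, Uint64, String, Bytes, Enum }

#[derive(Debug, PartialEq)]
enum RwType { Boolean, Float64, Float32, Int32, Int64, Decimal, Varchar, Bytea, List(Box<RwType>) }

fn map_scalar(kind: &PbKind, repeated: bool) -> RwType {
    let t = match kind {
        PbKind::Bool => RwType::Boolean,
        PbKind::Double => RwType::Float64,
        PbKind::Float => RwType::Float32,
        PbKind::Int32 => RwType::Int32,
        // u32 does not fit in i32, so widen to Int64.
        PbKind::Uint32 | PbKind::Int64 => RwType::Int64,
        // u64 does not fit in i64, so widen to Decimal.
        PbKind::Uint64 => RwType::Decimal,
        PbKind::String | PbKind::Enum => RwType::Varchar,
        PbKind::Bytes => RwType::Bytea,
    };
    if repeated { RwType::List(Box::new(t)) } else { t }
}

fn main() {
    assert_eq!(map_scalar(&PbKind::Uint64, false), RwType::Decimal);
    assert_eq!(map_scalar(&PbKind::Int32, true), RwType::List(Box::new(RwType::Int32)));
}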
- let columns = conf - .map_to_columns() - .unwrap() - .into_iter() - .map(|c| DataType::from(&c.column_type.unwrap())) - .collect_vec(); - assert_eq!( - columns, - vec![ - DataType::Float64, // double_field - DataType::Float32, // float_field - DataType::Int32, // int32_field - DataType::Int64, // int64_field - DataType::Int64, // uint32_field - DataType::Decimal, // uint64_field - DataType::Int32, // sint32_field - DataType::Int64, // sint64_field - DataType::Int64, // fixed32_field - DataType::Decimal, // fixed64_field - DataType::Int32, // sfixed32_field - DataType::Int64, // sfixed64_field - DataType::Boolean, // bool_field - DataType::Varchar, // string_field - DataType::Bytea, // bytes_field - DataType::Varchar, // enum_field - DataType::Struct(StructType::new(vec![ - ("id", DataType::Int32), - ("name", DataType::Varchar) - ])), // nested_message_field - DataType::List(DataType::Int32.into()), // repeated_int_field - DataType::Varchar, // oneof_string - DataType::Int32, // oneof_int32 - DataType::Varchar, // oneof_enum - DataType::Struct(StructType::new(vec![ - ("seconds", DataType::Int64), - ("nanos", DataType::Int32) - ])), // timestamp_field - DataType::Struct(StructType::new(vec![ - ("seconds", DataType::Int64), - ("nanos", DataType::Int32) - ])), // duration_field - DataType::Jsonb, // any_field - DataType::Struct(StructType::new(vec![("value", DataType::Int32)])), /* int32_value_field */ - DataType::Struct(StructType::new(vec![("value", DataType::Varchar)])), /* string_value_field */ - ] - ) - } - - #[tokio::test] - async fn test_all_types_data_parsing() { - let m = create_all_types_message(); - let mut payload = Vec::new(); - m.encode(&mut payload).unwrap(); - - let conf = create_recursive_pb_parser_config( - "/proto_recursive/recursive.pb", - "recursive.AllTypes", - ) - .await; - let mut access_builder = ProtobufAccessBuilder::new(conf).unwrap(); - let access = access_builder.generate_accessor(payload).await.unwrap(); - if let AccessImpl::Protobuf(a) = access { - assert_all_types_eq(&a, &m); - } else { - panic!("unexpected") - } - } - - fn assert_all_types_eq(a: &ProtobufAccess, m: &AllTypes) { - type S = ScalarImpl; - - pb_eq(a, "double_field", S::Float64(m.double_field.into())); - pb_eq(a, "float_field", S::Float32(m.float_field.into())); - pb_eq(a, "int32_field", S::Int32(m.int32_field)); - pb_eq(a, "int64_field", S::Int64(m.int64_field)); - pb_eq(a, "uint32_field", S::Int64(m.uint32_field.into())); - pb_eq(a, "uint64_field", S::Decimal(m.uint64_field.into())); - pb_eq(a, "sint32_field", S::Int32(m.sint32_field)); - pb_eq(a, "sint64_field", S::Int64(m.sint64_field)); - pb_eq(a, "fixed32_field", S::Int64(m.fixed32_field.into())); - pb_eq(a, "fixed64_field", S::Decimal(m.fixed64_field.into())); - pb_eq(a, "sfixed32_field", S::Int32(m.sfixed32_field)); - pb_eq(a, "sfixed64_field", S::Int64(m.sfixed64_field)); - pb_eq(a, "bool_field", S::Bool(m.bool_field)); - pb_eq(a, "string_field", S::Utf8(m.string_field.as_str().into())); - pb_eq(a, "bytes_field", S::Bytea(m.bytes_field.clone().into())); - pb_eq(a, "enum_field", S::Utf8("OPTION1".into())); - pb_eq( - a, - "nested_message_field", - S::Struct(StructValue::new(vec![ - Some(ScalarImpl::Int32(100)), - Some(ScalarImpl::Utf8("Nested".into())), - ])), - ); - pb_eq( - a, - "repeated_int_field", - S::List(ListValue::from_iter(m.repeated_int_field.clone())), - ); - pb_eq( - a, - "timestamp_field", - S::Struct(StructValue::new(vec![ - Some(ScalarImpl::Int64(1630927032)), - Some(ScalarImpl::Int32(500000000)), - ])), - ); - pb_eq( - a, 
- "duration_field", - S::Struct(StructValue::new(vec![ - Some(ScalarImpl::Int64(60)), - Some(ScalarImpl::Int32(500000000)), - ])), - ); - pb_eq( - a, - "int32_value_field", - S::Struct(StructValue::new(vec![Some(ScalarImpl::Int32(42))])), - ); - pb_eq( - a, - "string_value_field", - S::Struct(StructValue::new(vec![Some(ScalarImpl::Utf8( - m.string_value_field.as_ref().unwrap().as_str().into(), - ))])), - ); - pb_eq(a, "oneof_string", S::Utf8("".into())); - pb_eq(a, "oneof_int32", S::Int32(123)); - pb_eq(a, "oneof_enum", S::Utf8("DEFAULT".into())); - } - - fn pb_eq(a: &ProtobufAccess, field_name: &str, value: ScalarImpl) { - let dummy_type = DataType::Varchar; - let d = a.access_owned(&[field_name], &dummy_type).unwrap().unwrap(); - assert_eq!(d, value, "field: {} value: {:?}", field_name, d); - } - - fn create_all_types_message() -> AllTypes { - AllTypes { - double_field: 1.2345, - float_field: 1.2345, - int32_field: 42, - int64_field: 1234567890, - uint32_field: 98765, - uint64_field: 9876543210, - sint32_field: -12345, - sint64_field: -987654321, - fixed32_field: 1234, - fixed64_field: 5678, - sfixed32_field: -56789, - sfixed64_field: -123456, - bool_field: true, - string_field: "Hello, Prost!".to_string(), - bytes_field: b"byte data".to_vec(), - enum_field: EnumType::Option1 as i32, - nested_message_field: Some(NestedMessage { - id: 100, - name: "Nested".to_string(), - }), - repeated_int_field: vec![1, 2, 3, 4, 5], - timestamp_field: Some(::prost_types::Timestamp { - seconds: 1630927032, - nanos: 500000000, - }), - duration_field: Some(::prost_types::Duration { - seconds: 60, - nanos: 500000000, - }), - any_field: Some(::prost_types::Any { - type_url: "type.googleapis.com/my_custom_type".to_string(), - value: b"My custom data".to_vec(), - }), - int32_value_field: Some(42), - string_value_field: Some("Hello, Wrapper!".to_string()), - example_oneof: Some(ExampleOneof::OneofInt32(123)), - } - } - - // id: 12345 - // name { - // type_url: "type.googleapis.com/test.StringValue" - // value: "\n\010John Doe" - // } - static ANY_GEN_PROTO_DATA: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x56\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; - - #[tokio::test] - async fn test_any_schema() -> crate::error::ConnectorResult<()> { - let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; - - println!("Current conf: {:#?}", conf); - println!("---------------------------"); - - let value = - DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA).unwrap(); - - println!("Test ANY_GEN_PROTO_DATA, current value: {:#?}", value); - println!("---------------------------"); - - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = from_protobuf_value(&field, &Value::Message(value)) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); - - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } - - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "@type": 
"type.googleapis.com/test.StringValue", - "value": "John Doe" - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), - } - } - - Ok(()) - } - - // id: 12345 - // name { - // type_url: "type.googleapis.com/test.Int32Value" - // value: "\010\322\376\006" - // } - // Unpacked Int32Value from Any: value: 114514 - static ANY_GEN_PROTO_DATA_1: &[u8] = b"\x08\xb9\x60\x12\x2b\x0a\x23\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x49\x6e\x74\x33\x32\x56\x61\x6c\x75\x65\x12\x04\x08\xd2\xfe\x06"; - - #[tokio::test] - async fn test_any_schema_1() -> crate::error::ConnectorResult<()> { - let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; - - println!("Current conf: {:#?}", conf); - println!("---------------------------"); - - let value = - DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_1).unwrap(); - - println!("Current Value: {:#?}", value); - println!("---------------------------"); - - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = from_protobuf_value(&field, &Value::Message(value)) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_GEN_PROTO_DATA: {:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); - - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } - - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "@type": "type.googleapis.com/test.Int32Value", - "value": 114514 - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), - } - } - - Ok(()) - } - - // "id": 12345, - // "any_value": { - // "type_url": "type.googleapis.com/test.AnyValue", - // "value": { - // "any_value_1": { - // "type_url": "type.googleapis.com/test.StringValue", - // "value": "114514" - // }, - // "any_value_2": { - // "type_url": "type.googleapis.com/test.Int32Value", - // "value": 114514 - // } - // } - // } - static ANY_RECURSIVE_GEN_PROTO_DATA: &[u8] = b"\x08\xb9\x60\x12\x84\x01\x0a\x21\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x41\x6e\x79\x56\x61\x6c\x75\x65\x12\x5f\x0a\x30\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x56\x61\x6c\x75\x65\x12\x08\x0a\x06\x31\x31\x34\x35\x31\x34\x12\x2b\x0a\x23\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x49\x6e\x74\x33\x32\x56\x61\x6c\x75\x65\x12\x04\x08\xd2\xfe\x06"; - - #[tokio::test] - async fn test_any_recursive() -> crate::error::ConnectorResult<()> { - let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; - - println!("Current conf: {:#?}", conf); - println!("---------------------------"); - - let value = DynamicMessage::decode( - conf.message_descriptor.clone(), - ANY_RECURSIVE_GEN_PROTO_DATA, - ) - .unwrap(); - - println!("Current Value: {:#?}", value); - println!("---------------------------"); - - // This is of no use - let field = value.fields().next().unwrap().0; - - if let Some(ret) = from_protobuf_value(&field, &Value::Message(value)) - .unwrap() - .to_owned_datum() - { - println!("Decoded Value for ANY_RECURSIVE_GEN_PROTO_DATA: 
{:#?}", ret); - println!("---------------------------"); - - let ScalarImpl::Struct(struct_value) = ret else { - panic!("Expected ScalarImpl::Struct"); - }; - - let fields = struct_value.fields(); - - match fields[0].clone() { - Some(ScalarImpl::Int32(v)) => { - println!("Successfully decode field[0]"); - assert_eq!(v, 12345); - } - _ => panic!("Expected ScalarImpl::Int32"), - } - - match fields[1].clone() { - Some(ScalarImpl::Jsonb(jv)) => { - assert_eq!( - jv, - JsonbVal::from(json!({ - "@type": "type.googleapis.com/test.AnyValue", - "anyValue1": { - "@type": "type.googleapis.com/test.StringValue", - "value": "114514", - }, - "anyValue2": { - "@type": "type.googleapis.com/test.Int32Value", - "value": 114514, - } - })) - ); - } - _ => panic!("Expected ScalarImpl::Jsonb"), - } - } - - Ok(()) - } - - // id: 12345 - // any_value: { - // type_url: "type.googleapis.com/test.StringXalue" - // value: "\n\010John Doe" - // } - static ANY_GEN_PROTO_DATA_INVALID: &[u8] = b"\x08\xb9\x60\x12\x32\x0a\x24\x74\x79\x70\x65\x2e\x67\x6f\x6f\x67\x6c\x65\x61\x70\x69\x73\x2e\x63\x6f\x6d\x2f\x74\x65\x73\x74\x2e\x53\x74\x72\x69\x6e\x67\x58\x61\x6c\x75\x65\x12\x0a\x0a\x08\x4a\x6f\x68\x6e\x20\x44\x6f\x65"; - - #[tokio::test] - async fn test_any_invalid() -> crate::error::ConnectorResult<()> { - let conf = create_recursive_pb_parser_config("/any-schema.pb", "test.TestAny").await; - - let value = - DynamicMessage::decode(conf.message_descriptor.clone(), ANY_GEN_PROTO_DATA_INVALID) - .unwrap(); - - // The top-level `Value` is not a proto field, but we need a dummy one. - let field = value.fields().next().unwrap().0; - - let err = from_protobuf_value(&field, &Value::Message(value)).unwrap_err(); - - let expected = expect_test::expect![[r#" - Fail to convert protobuf Any into jsonb - - Caused by: - message 'test.StringXalue' not found - "#]]; - expected.assert_eq(err.to_report_string_pretty().as_str()); - - Ok(()) - } #[test] fn test_decode_varint_zigzag() { diff --git a/src/connector/src/parser/unified/json.rs b/src/connector/src/parser/unified/json.rs index ca709e2eebc73..8ee8f9fe9386f 100644 --- a/src/connector/src/parser/unified/json.rs +++ b/src/connector/src/parser/unified/json.rs @@ -646,6 +646,7 @@ impl<'a> JsonAccess<'a> { impl Access for JsonAccess<'_> { fn access<'a>(&'a self, path: &[&str], type_expected: &DataType) -> AccessResult> { let mut value = &self.value; + for (idx, &key) in path.iter().enumerate() { if let Some(sub_value) = if self.options.ignoring_keycase { json_object_get_case_insensitive(value, key) diff --git a/src/connector/src/parser/unified/kv_event.rs b/src/connector/src/parser/unified/kv_event.rs index 7e52d2f4c3c24..6ab7925b9bb48 100644 --- a/src/connector/src/parser/unified/kv_event.rs +++ b/src/connector/src/parser/unified/kv_event.rs @@ -79,6 +79,9 @@ where pub fn access_field(&self, desc: &SourceColumnDesc) -> AccessResult> { match desc.additional_column.column_type { Some(AdditionalColumnType::Key(_)) => self.access_key(&[&desc.name], &desc.data_type), + // hack here: Get the whole payload as a single column + // use a special mark empty slice as path to represent the whole payload + Some(AdditionalColumnType::Payload(_)) => self.access_value(&[], &desc.data_type), None => self.access_value(&[&desc.name], &desc.data_type), _ => unreachable!(), } diff --git a/src/connector/src/parser/unified/mod.rs b/src/connector/src/parser/unified/mod.rs index fdfe3aae6aaee..adf32df572307 100644 --- a/src/connector/src/parser/unified/mod.rs +++ b/src/connector/src/parser/unified/mod.rs @@ -17,11 
+17,11 @@ use auto_impl::auto_impl; use risingwave_common::types::{DataType, DatumCow}; use risingwave_connector_codec::decoder::avro::AvroAccess; -pub use risingwave_connector_codec::decoder::{uncategorized, Access, AccessError, AccessResult}; +use risingwave_connector_codec::decoder::protobuf::ProtobufAccess; +pub use risingwave_connector_codec::decoder::{Access, AccessError, AccessResult}; use self::bytes::BytesAccess; use self::json::JsonAccess; -use self::protobuf::ProtobufAccess; use crate::parser::unified::debezium::MongoJsonAccess; use crate::source::SourceColumnDesc; @@ -30,7 +30,6 @@ pub mod debezium; pub mod json; pub mod kv_event; pub mod maxwell; -pub mod protobuf; pub mod util; pub enum AccessImpl<'a> { diff --git a/src/connector/src/schema/protobuf.rs b/src/connector/src/schema/protobuf.rs index d140af83c853f..634d692066ac1 100644 --- a/src/connector/src/schema/protobuf.rs +++ b/src/connector/src/schema/protobuf.rs @@ -13,9 +13,10 @@ // limitations under the License. use std::collections::BTreeMap; +use std::path::PathBuf; -use itertools::Itertools as _; use prost_reflect::{DescriptorPool, FileDescriptor, MessageDescriptor}; +use risingwave_connector_codec::common::protobuf::compile_pb; use super::loader::{LoadedSchema, SchemaLoader}; use super::schema_registry::Subject; @@ -98,91 +99,29 @@ pub async fn fetch_from_registry( impl LoadedSchema for FileDescriptor { fn compile(primary: Subject, references: Vec) -> Result { let primary_name = primary.name.clone(); - match compile_pb(primary, references) { - Err(e) => Err(SchemaFetchError::SchemaCompile(e.into())), - Ok(b) => { - let pool = DescriptorPool::decode(b.as_slice()) - .map_err(|e| SchemaFetchError::SchemaCompile(e.into()))?; - pool.get_file_by_name(&primary_name).ok_or_else(|| { - SchemaFetchError::SchemaCompile( - anyhow::anyhow!("{primary_name} lost after compilation").into(), - ) - }) - } - } - } -} - -macro_rules! embed_wkts { - [$( $path:literal ),+ $(,)?] 
=> { - &[$( - ( - concat!("google/protobuf/", $path), - include_bytes!(concat!(env!("PROTO_INCLUDE"), "/google/protobuf/", $path)).as_slice(), + let compiled_pb = compile_pb_subject(primary, references)?; + let pool = DescriptorPool::decode(compiled_pb.as_slice()) + .map_err(|e| SchemaFetchError::SchemaCompile(e.into()))?; + pool.get_file_by_name(&primary_name).ok_or_else(|| { + SchemaFetchError::SchemaCompile( + anyhow::anyhow!("{primary_name} lost after compilation").into(), ) - ),+] - }; -} -const WELL_KNOWN_TYPES: &[(&str, &[u8])] = embed_wkts![ - "any.proto", - "api.proto", - "compiler/plugin.proto", - "descriptor.proto", - "duration.proto", - "empty.proto", - "field_mask.proto", - "source_context.proto", - "struct.proto", - "timestamp.proto", - "type.proto", - "wrappers.proto", -]; - -#[derive(Debug, thiserror::Error)] -pub enum PbCompileError { - #[error("build_file_descriptor_set failed\n{}", errs.iter().map(|e| format!("\t{e}")).join("\n"))] - Build { - errs: Vec, - }, - #[error("serialize descriptor set failed")] - Serialize, + }) + } } -pub fn compile_pb( +fn compile_pb_subject( primary_subject: Subject, dependency_subjects: Vec, -) -> Result, PbCompileError> { - use std::iter; - use std::path::Path; - - use protobuf_native::compiler::{ - SimpleErrorCollector, SourceTreeDescriptorDatabase, VirtualSourceTree, - }; - use protobuf_native::MessageLite; - - let mut source_tree = VirtualSourceTree::new(); - for subject in iter::once(&primary_subject).chain(dependency_subjects.iter()) { - source_tree.as_mut().add_file( - Path::new(&subject.name), - subject.schema.content.as_bytes().to_vec(), - ); - } - for (path, bytes) in WELL_KNOWN_TYPES { - source_tree - .as_mut() - .add_file(Path::new(path), bytes.to_vec()); - } - - let mut error_collector = SimpleErrorCollector::new(); - // `db` needs to be dropped before we can iterate on `error_collector`. 
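// A minimal sketch of the decode-and-lookup step used by `LoadedSchema::compile`
// above, assuming `compiled_bytes` already holds an encoded `FileDescriptorSet`
// (for example the output of `compile_pb`); "schema.proto" and
// "test.TestRecord" are placeholder names, not real schema identifiers.
use prost_reflect::{DescriptorPool, MessageDescriptor};

fn lookup_message(compiled_bytes: &[u8]) -> anyhow::Result<MessageDescriptor> {
    // Decode the pool from the serialized descriptor set.
    let pool = DescriptorPool::decode(compiled_bytes)?;
    // Ensure the primary file survived compilation, mirroring the
    // "lost after compilation" check above.
    pool.get_file_by_name("schema.proto")
        .ok_or_else(|| anyhow::anyhow!("schema.proto lost after compilation"))?;
    // Resolve the message descriptor that payloads will be decoded against.
    pool.get_message_by_name("test.TestRecord")
        .ok_or_else(|| anyhow::anyhow!("message test.TestRecord not found"))
}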
- let fds = { - let mut db = SourceTreeDescriptorDatabase::new(source_tree.as_mut()); - db.as_mut().record_errors_to(error_collector.as_mut()); - db.as_mut() - .build_file_descriptor_set(&[Path::new(&primary_subject.name)]) - } - .map_err(|_| PbCompileError::Build { - errs: error_collector.as_mut().collect(), - })?; - fds.serialize().map_err(|_| PbCompileError::Serialize) +) -> Result, SchemaFetchError> { + compile_pb( + ( + PathBuf::from(&primary_subject.name), + primary_subject.schema.content.as_bytes().to_vec(), + ), + dependency_subjects + .into_iter() + .map(|s| (PathBuf::from(&s.name), s.schema.content.as_bytes().to_vec())), + ) + .map_err(|e| SchemaFetchError::SchemaCompile(e.into())) } diff --git a/src/connector/src/sink/big_query.rs b/src/connector/src/sink/big_query.rs index 235b1ff5b6539..85e8ba0187c99 100644 --- a/src/connector/src/sink/big_query.rs +++ b/src/connector/src/sink/big_query.rs @@ -342,7 +342,9 @@ impl BigQuerySink { DataType::Int256 => Err(SinkError::BigQuery(anyhow::anyhow!( "Bigquery cannot support Int256" ))), - DataType::Map(_) => todo!(), + DataType::Map(_) => Err(SinkError::BigQuery(anyhow::anyhow!( + "Bigquery cannot support Map" + ))), } } @@ -392,7 +394,11 @@ impl BigQuerySink { "Bigquery cannot support Int256" ))) } - DataType::Map(_) => todo!(), + DataType::Map(_) => { + return Err(SinkError::BigQuery(anyhow::anyhow!( + "Bigquery cannot support Map" + ))) + } }; Ok(tfs) } diff --git a/src/connector/src/sink/clickhouse.rs b/src/connector/src/sink/clickhouse.rs index 6b3e78f6a7b9d..21a7c56dc8154 100644 --- a/src/connector/src/sink/clickhouse.rs +++ b/src/connector/src/sink/clickhouse.rs @@ -25,7 +25,6 @@ use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; use risingwave_common::row::Row; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::{DataType, Decimal, ScalarRefImpl, Serial}; use serde::ser::{SerializeSeq, SerializeStruct}; use serde::Serialize; @@ -38,12 +37,10 @@ use with_options::WithOptions; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::writer::SinkWriter; use super::{DummySinkCommitCoordinator, SinkWriterParam}; use crate::error::ConnectorResult; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::{ Result, Sink, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, }; @@ -428,6 +425,7 @@ impl ClickHouseSink { fields_type: &DataType, ck_column: &SystemColumn, ) -> Result<()> { + // FIXME: the "contains" based implementation is wrong let is_match = match fields_type { risingwave_common::types::DataType::Boolean => Ok(ck_column.r#type.contains("Bool")), risingwave_common::types::DataType::Int16 => Ok(ck_column.r#type.contains("UInt16") @@ -497,29 +495,6 @@ impl Sink for ClickHouseSink { const SINK_NAME: &'static str = CLICKHOUSE_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: Clickhouse config 
`commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { // For upsert clickhouse sink, the primary key must be defined. if !self.is_append_only && self.pk_indices.is_empty() { diff --git a/src/connector/src/sink/decouple_checkpoint_log_sink.rs b/src/connector/src/sink/decouple_checkpoint_log_sink.rs index 61a2f0f70fd05..59e3335eb36db 100644 --- a/src/connector/src/sink/decouple_checkpoint_log_sink.rs +++ b/src/connector/src/sink/decouple_checkpoint_log_sink.rs @@ -20,10 +20,12 @@ use async_trait::async_trait; use crate::sink::log_store::{LogStoreReadItem, TruncateOffset}; use crate::sink::writer::SinkWriter; use crate::sink::{LogSinker, Result, SinkLogReader, SinkMetrics}; -pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL: u64 = 10; +pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE: u64 = 10; +pub const DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE: u64 = 1; +pub const COMMIT_CHECKPOINT_INTERVAL: &str = "commit_checkpoint_interval"; pub fn default_commit_checkpoint_interval() -> u64 { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE } /// The `LogSinker` implementation used for commit-decoupled sinks (such as `Iceberg`, `DeltaLake` and `StarRocks`). diff --git a/src/connector/src/sink/deltalake.rs b/src/connector/src/sink/deltalake.rs index 2dedffa3469e3..494adb2dd6fed 100644 --- a/src/connector/src/sink/deltalake.rs +++ b/src/connector/src/sink/deltalake.rs @@ -31,7 +31,6 @@ use risingwave_common::array::StreamChunk; use risingwave_common::bail; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::DataType; use risingwave_common::util::iter_util::ZipEqFast; use risingwave_pb::connector_service::sink_metadata::Metadata::Serialized; @@ -41,11 +40,9 @@ use serde_derive::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use with_options::WithOptions; -use super::catalog::desc::SinkDesc; use super::coordinate::CoordinatedSinkWriter; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::writer::SinkWriter; use super::{ @@ -285,29 +282,6 @@ impl Sink for DeltaLakeSink { const SINK_NAME: &'static str = DELTALAKE_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: DeltaLake config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result { let inner = DeltaLakeSinkWriter::new( self.config.clone(), diff --git a/src/connector/src/sink/dynamodb.rs b/src/connector/src/sink/dynamodb.rs index 6d73bf2d478c8..c8c3c598e6319 100644 --- a/src/connector/src/sink/dynamodb.rs +++ 
b/src/connector/src/sink/dynamodb.rs @@ -24,7 +24,7 @@ use dynamodb::types::{ }; use maplit::hashmap; use risingwave_common::array::{Op, RowRef, StreamChunk}; -use risingwave_common::catalog::{Field, Schema}; +use risingwave_common::catalog::Schema; use risingwave_common::row::Row as _; use risingwave_common::types::{DataType, ScalarRefImpl, ToText}; use risingwave_common::util::iter_util::ZipEqDebug; @@ -345,16 +345,13 @@ impl DynamoDbFormatter { row.iter() .zip_eq_debug((self.schema.clone()).into_fields()) .map(|(scalar, field)| { - map_data_type(scalar, &field.data_type()).map(|attr| (field.name, attr)) + map_data(scalar, &field.data_type()).map(|attr| (field.name, attr)) }) .collect() } } -fn map_data_type( - scalar_ref: Option>, - data_type: &DataType, -) -> Result { +fn map_data(scalar_ref: Option>, data_type: &DataType) -> Result { let Some(scalar_ref) = scalar_ref else { return Ok(AttributeValue::Null(true)); }; @@ -381,24 +378,25 @@ fn map_data_type( let list_attr = scalar_ref .into_list() .iter() - .map(|x| map_data_type(x, datatype)) + .map(|x| map_data(x, datatype)) .collect::>>()?; AttributeValue::L(list_attr) } DataType::Struct(st) => { let mut map = HashMap::with_capacity(st.len()); - for (sub_datum_ref, sub_field) in - scalar_ref.into_struct().iter_fields_ref().zip_eq_debug( - st.iter() - .map(|(name, dt)| Field::with_name(dt.clone(), name)), - ) + for (sub_datum_ref, (name, data_type)) in scalar_ref + .into_struct() + .iter_fields_ref() + .zip_eq_debug(st.iter()) { - let attr = map_data_type(sub_datum_ref, &sub_field.data_type())?; - map.insert(sub_field.name.clone(), attr); + let attr = map_data(sub_datum_ref, data_type)?; + map.insert(name.to_string(), attr); } AttributeValue::M(map) } - DataType::Map(_) => todo!(), + DataType::Map(_m) => { + return Err(SinkError::DynamoDb(anyhow!("map is not supported yet"))); + } }; Ok(attr) } diff --git a/src/connector/src/sink/encoder/avro.rs b/src/connector/src/sink/encoder/avro.rs index 4a2060f0a8c6c..1a9218572814f 100644 --- a/src/connector/src/sink/encoder/avro.rs +++ b/src/connector/src/sink/encoder/avro.rs @@ -455,7 +455,7 @@ fn encode_field( return no_match_err(); } DataType::Map(_) => { - // TODO: + // TODO(map): support map return no_match_err(); } }; diff --git a/src/connector/src/sink/encoder/bson.rs b/src/connector/src/sink/encoder/bson.rs index c401d0575a12b..5f7908ed4cedd 100644 --- a/src/connector/src/sink/encoder/bson.rs +++ b/src/connector/src/sink/encoder/bson.rs @@ -188,6 +188,7 @@ fn datum_to_bson(field: &Field, datum: DatumRef<'_>) -> Bson { subtype: BinarySubtype::Generic, bytes: v.into(), }), + // TODO(map): support map _ => { if let Ok(suppressed_count) = LOG_SUPPERSSER.check() { tracing::warn!( diff --git a/src/connector/src/sink/encoder/json.rs b/src/connector/src/sink/encoder/json.rs index 6dc8809f42933..7691b3de5f447 100644 --- a/src/connector/src/sink/encoder/json.rs +++ b/src/connector/src/sink/encoder/json.rs @@ -346,6 +346,7 @@ fn datum_to_json_object( } } } + // TODO(map): support map (data_type, scalar_ref) => { return Err(ArrayError::internal( format!("datum_to_json_object: unsupported data type: field name: {:?}, logical type: {:?}, physical type: {:?}", field.name, data_type, scalar_ref), diff --git a/src/connector/src/sink/encoder/proto.rs b/src/connector/src/sink/encoder/proto.rs index 8046606b5690c..88fb445b1c3ba 100644 --- a/src/connector/src/sink/encoder/proto.rs +++ b/src/connector/src/sink/encoder/proto.rs @@ -421,7 +421,7 @@ fn encode_field( return no_match_err(); } DataType::Map(_) => { - 
// TODO: + // TODO(map): support map return no_match_err(); } }; @@ -440,10 +440,10 @@ mod tests { #[test] fn test_encode_proto_ok() { let pool_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/test_data/proto_recursive/recursive.pb"); + .join("codec/tests/test_data/all-types.pb"); let pool_bytes = std::fs::read(pool_path).unwrap(); let pool = prost_reflect::DescriptorPool::decode(pool_bytes.as_ref()).unwrap(); - let descriptor = pool.get_message_by_name("recursive.AllTypes").unwrap(); + let descriptor = pool.get_message_by_name("all_types.AllTypes").unwrap(); let schema = Schema::new(vec![ Field::with_name(DataType::Boolean, "bool_field"), Field::with_name(DataType::Varchar, "string_field"), @@ -495,7 +495,7 @@ mod tests { // Hint: write the binary output to a file `test.binpb`, and view it with `protoc`: // ``` // protoc --decode_raw < test.binpb - // protoc --decode=recursive.AllTypes recursive.proto < test.binpb + // protoc --decode=all_types.AllTypes all-types.proto < test.binpb // ``` [ 9, 0, 0, 0, 0, 0, 0, 17, 64, 21, 0, 0, 96, 64, 24, 22, 32, 23, 56, 48, 93, 26, 0, @@ -509,10 +509,10 @@ mod tests { #[test] fn test_encode_proto_repeated() { let pool_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/test_data/proto_recursive/recursive.pb"); - let pool_bytes = std::fs::read(pool_path).unwrap(); + .join("codec/tests/test_data/all-types.pb"); + let pool_bytes = fs_err::read(pool_path).unwrap(); let pool = prost_reflect::DescriptorPool::decode(pool_bytes.as_ref()).unwrap(); - let message_descriptor = pool.get_message_by_name("recursive.AllTypes").unwrap(); + let message_descriptor = pool.get_message_by_name("all_types.AllTypes").unwrap(); let schema = Schema::new(vec![Field::with_name( DataType::List(DataType::List(DataType::Int32.into()).into()), @@ -561,10 +561,10 @@ mod tests { #[test] fn test_encode_proto_err() { let pool_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) - .join("src/test_data/proto_recursive/recursive.pb"); + .join("codec/tests/test_data/all-types.pb"); let pool_bytes = std::fs::read(pool_path).unwrap(); let pool = prost_reflect::DescriptorPool::decode(pool_bytes.as_ref()).unwrap(); - let message_descriptor = pool.get_message_by_name("recursive.AllTypes").unwrap(); + let message_descriptor = pool.get_message_by_name("all_types.AllTypes").unwrap(); let err = validate_fields( std::iter::once(("not_exists", &DataType::Int16)), diff --git a/src/connector/src/sink/encoder/template.rs b/src/connector/src/sink/encoder/template.rs index 1903be667d781..c34e75e4ad435 100644 --- a/src/connector/src/sink/encoder/template.rs +++ b/src/connector/src/sink/encoder/template.rs @@ -22,7 +22,8 @@ use risingwave_common::types::ToText; use super::{Result, RowEncoder}; use crate::sink::SinkError; -/// Encode a row according to a specified string template `user_id:{user_id}` +/// Encode a row according to a specified string template `user_id:{user_id}`. +/// Data is encoded to string with [`ToText`]. pub struct TemplateEncoder { schema: Schema, col_indices: Option>, diff --git a/src/connector/src/sink/encoder/text.rs b/src/connector/src/sink/encoder/text.rs index 734ac8bd6a425..369f4212fea6b 100644 --- a/src/connector/src/sink/encoder/text.rs +++ b/src/connector/src/sink/encoder/text.rs @@ -17,6 +17,7 @@ use risingwave_common::types::{DataType, ToText}; use super::RowEncoder; +/// Encode with [`ToText`]. Only used to encode key. 
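// A minimal sketch of the `{column}` placeholder substitution that
// `TemplateEncoder` performs, with the row flattened to a column-name ->
// rendered-text map instead of a real `Row`/`Schema` (the real encoder renders
// each datum with `ToText`).
use std::collections::HashMap;

fn render_template(template: &str, row: &HashMap<&str, String>) -> String {
    let mut out = template.to_string();
    for (column, text) in row {
        // Replace every `{column}` occurrence with the rendered datum.
        out = out.replace(&format!("{{{column}}}"), text);
    }
    out
}

fn main() {
    let row = HashMap::from([("user_id", "42".to_string()), ("event", "click".to_string())]);
    assert_eq!(render_template("user_id:{user_id}", &row), "user_id:42");
    assert_eq!(render_template("{user_id}/{event}", &row), "42/click");
}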
pub struct TextEncoder { pub schema: Schema, // the column must contain only one element diff --git a/src/connector/src/sink/file_sink/opendal_sink.rs b/src/connector/src/sink/file_sink/opendal_sink.rs index 1fd461015b4ba..65ec46f494345 100644 --- a/src/connector/src/sink/file_sink/opendal_sink.rs +++ b/src/connector/src/sink/file_sink/opendal_sink.rs @@ -97,9 +97,6 @@ impl Sink for FileSink { const SINK_NAME: &'static str = S::SINK_NAME; async fn validate(&self) -> Result<()> { - risingwave_common::license::Feature::FileSink - .check_available() - .map_err(|e| anyhow::anyhow!(e))?; if !self.is_append_only { return Err(SinkError::Config(anyhow!( "File sink only supports append-only mode at present. \ diff --git a/src/connector/src/sink/iceberg/jni_catalog.rs b/src/connector/src/sink/iceberg/jni_catalog.rs index b80a6a305870f..6529ea733428d 100644 --- a/src/connector/src/sink/iceberg/jni_catalog.rs +++ b/src/connector/src/sink/iceberg/jni_catalog.rs @@ -288,7 +288,7 @@ impl CatalogV2 for JniCatalog { "Failed to crete iceberg table.", ) .with_source(e) - }) + })? } /// Load table from the catalog. @@ -338,7 +338,7 @@ impl CatalogV2 for JniCatalog { "Failed to load iceberg table.", ) .with_source(e) - }) + })? } /// Drop a table from the catalog. diff --git a/src/connector/src/sink/iceberg/mod.rs b/src/connector/src/sink/iceberg/mod.rs index 1b135cd4d3b40..9e87694539f0c 100644 --- a/src/connector/src/sink/iceberg/mod.rs +++ b/src/connector/src/sink/iceberg/mod.rs @@ -43,11 +43,10 @@ use icelake::io_v2::{ DataFileWriterBuilder, EqualityDeltaWriterBuilder, IcebergWriterBuilder, DELETE_OP, INSERT_OP, }; use icelake::transaction::Transaction; -use icelake::types::{data_file_from_json, data_file_to_json, Any, DataFile, COLUMN_ID_META_KEY}; +use icelake::types::{data_file_from_json, data_file_to_json, Any, DataFile}; use icelake::{Table, TableIdentifier}; use itertools::Itertools; -use parquet::arrow::PARQUET_FIELD_ID_META_KEY; -use risingwave_common::array::arrow::IcebergArrowConvert; +use risingwave_common::array::arrow::{IcebergArrowConvert, IcebergCreateTableArrowConvert}; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bail; use risingwave_common::bitmap::Bitmap; @@ -65,10 +64,8 @@ use with_options::WithOptions; use self::mock_catalog::MockCatalog; use self::prometheus::monitored_base_file_writer::MonitoredBaseFileWriterBuilder; use self::prometheus::monitored_position_delete_writer::MonitoredPositionDeleteWriterBuilder; -use super::catalog::desc::SinkDesc; use super::decouple_checkpoint_log_sink::{ default_commit_checkpoint_interval, DecoupleCheckpointLogSinkerOf, - DEFAULT_COMMIT_CHECKPOINT_INTERVAL, }; use super::{ Sink, SinkError, SinkWriterParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, @@ -76,7 +73,7 @@ use super::{ use crate::error::ConnectorResult; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::writer::SinkWriter; -use crate::sink::{Result, SinkCommitCoordinator, SinkDecouple, SinkParam}; +use crate::sink::{Result, SinkCommitCoordinator, SinkParam}; use crate::{ deserialize_bool_from_string, deserialize_optional_bool_from_string, deserialize_optional_string_seq_from_string, @@ -672,7 +669,7 @@ impl IcebergConfig { .file_io(storage_catalog.file_io().clone()) // Only support readonly table for storage catalog now. .readonly(true) - .build()) + .build()?) 
} _ => self.load_table_v2().await, } @@ -747,30 +744,20 @@ impl IcebergSink { bail!("database name must be set if you want to create table") }; + let iceberg_create_table_arrow_convert = IcebergCreateTableArrowConvert::default(); // convert risingwave schema -> arrow schema -> iceberg schema let arrow_fields = self .param .columns .iter() .map(|column| { - let mut arrow_field = IcebergArrowConvert + Ok(iceberg_create_table_arrow_convert .to_arrow_field(&column.name, &column.data_type) .map_err(|e| SinkError::Iceberg(anyhow!(e))) .context(format!( "failed to convert {}: {} to arrow type", &column.name, &column.data_type - ))?; - let mut metadata = HashMap::new(); - metadata.insert( - PARQUET_FIELD_ID_META_KEY.to_string(), - column.column_id.get_id().to_string(), - ); - metadata.insert( - COLUMN_ID_META_KEY.to_string(), - column.column_id.get_id().to_string(), - ); - arrow_field.set_metadata(metadata); - Ok(arrow_field) + ))?) }) .collect::>>()?; let arrow_schema = arrow_schema_iceberg::Schema::new(arrow_fields); @@ -843,31 +830,6 @@ impl Sink for IcebergSink { const SINK_NAME: &'static str = ICEBERG_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - desc.properties - .get("commit_checkpoint_interval") - .map(|interval| { - interval - .parse::() - .unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - }); - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if let Some(commit_checkpoint_interval) = commit_checkpoint_interval - && commit_checkpoint_interval > 1 - { - return Err(SinkError::Config(anyhow!( - "config conflict: Iceberg config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { if "glue".eq_ignore_ascii_case(self.config.catalog_type()) { risingwave_common::license::Feature::IcebergSinkWithGlue @@ -1399,7 +1361,7 @@ mod test { use risingwave_common::catalog::Field; - use crate::sink::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL; + use crate::sink::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE; use crate::sink::iceberg::IcebergConfig; use crate::source::DataType; @@ -1482,7 +1444,7 @@ mod test { .into_iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - commit_checkpoint_interval: DEFAULT_COMMIT_CHECKPOINT_INTERVAL, + commit_checkpoint_interval: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE, create_table_if_not_exists: false, }; diff --git a/src/connector/src/sink/iceberg/storage_catalog.rs b/src/connector/src/sink/iceberg/storage_catalog.rs index 01adb510882a2..18e2ff0e036ff 100644 --- a/src/connector/src/sink/iceberg/storage_catalog.rs +++ b/src/connector/src/sink/iceberg/storage_catalog.rs @@ -249,11 +249,11 @@ impl Catalog for StorageCatalog { let version_hint_output = self.file_io.new_output(&version_hint_path)?; version_hint_output.write("1".into()).await?; - Ok(Table::builder() + Table::builder() .metadata(table_metadata) .identifier(table_ident) .file_io(self.file_io.clone()) - .build()) + .build() } /// Load table from the catalog. 
@@ -283,13 +283,13 @@ impl Catalog for StorageCatalog { let metadata_file_content = metadata_file.read().await?; let table_metadata = serde_json::from_slice::(&metadata_file_content)?; - Ok(Table::builder() + Table::builder() .metadata(table_metadata) .identifier(table.clone()) .file_io(self.file_io.clone()) // Only support readonly table for storage catalog now. .readonly(true) - .build()) + .build() } /// Drop a table from the catalog. diff --git a/src/connector/src/sink/mod.rs b/src/connector/src/sink/mod.rs index dafbc856207a9..b453af53cca41 100644 --- a/src/connector/src/sink/mod.rs +++ b/src/connector/src/sink/mod.rs @@ -53,6 +53,13 @@ use ::deltalake::DeltaTableError; use ::redis::RedisError; use anyhow::anyhow; use async_trait::async_trait; +use clickhouse::CLICKHOUSE_SINK; +use decouple_checkpoint_log_sink::{ + COMMIT_CHECKPOINT_INTERVAL, DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE, + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE, +}; +use deltalake::DELTALAKE_SINK; +use iceberg::ICEBERG_SINK; use opendal::Error as OpendalError; use risingwave_common::array::ArrayError; use risingwave_common::bitmap::Bitmap; @@ -66,6 +73,7 @@ use risingwave_pb::catalog::PbSinkType; use risingwave_pb::connector_service::{PbSinkParam, SinkMetadata, TableSchema}; use risingwave_rpc_client::error::RpcError; use risingwave_rpc_client::MetaClient; +use starrocks::STARROCKS_SINK; use thiserror::Error; use thiserror_ext::AsReport; pub use tracing; @@ -366,13 +374,54 @@ impl SinkWriterParam { } } +fn is_sink_support_commit_checkpoint_interval(sink_name: &str) -> bool { + matches!( + sink_name, + ICEBERG_SINK | CLICKHOUSE_SINK | STARROCKS_SINK | DELTALAKE_SINK + ) +} pub trait Sink: TryFrom { const SINK_NAME: &'static str; type LogSinker: LogSinker; type Coordinator: SinkCommitCoordinator; + fn set_default_commit_checkpoint_interval( + desc: &mut SinkDesc, + user_specified: &SinkDecouple, + ) -> Result<()> { + if is_sink_support_commit_checkpoint_interval(Self::SINK_NAME) { + match desc.properties.get(COMMIT_CHECKPOINT_INTERVAL) { + Some(commit_checkpoint_interval) => { + let commit_checkpoint_interval = commit_checkpoint_interval + .parse::() + .map_err(|e| SinkError::Config(anyhow!(e)))?; + if matches!(user_specified, SinkDecouple::Disable) + && commit_checkpoint_interval > 1 + { + return Err(SinkError::Config(anyhow!("config conflict: `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled"))); + } + } + None => match user_specified { + SinkDecouple::Default | SinkDecouple::Enable => { + desc.properties.insert( + COMMIT_CHECKPOINT_INTERVAL.to_string(), + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE.to_string(), + ); + } + SinkDecouple::Disable => { + desc.properties.insert( + COMMIT_CHECKPOINT_INTERVAL.to_string(), + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITHOUT_SINK_DECOUPLE.to_string(), + ); + } + }, + } + } + Ok(()) + } + /// `user_specified` is the value of `sink_decouple` config. 
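// A minimal standalone sketch of the decision implemented by
// `set_default_commit_checkpoint_interval` above, over a plain property map and
// a simplified decouple setting: an explicit `commit_checkpoint_interval` > 1
// conflicts with disabled sink decouple, and a missing value is defaulted to 10
// (decoupled) or 1 (not decoupled).
use std::collections::BTreeMap;

#[derive(Clone, Copy)]
enum Decouple { Default, Enable, Disable }

fn apply_default_interval(
    props: &mut BTreeMap<String, String>,
    user_specified: Decouple,
) -> Result<(), String> {
    const KEY: &str = "commit_checkpoint_interval";
    match props.get(KEY) {
        Some(raw) => {
            let interval: u64 = raw.parse().map_err(|e| format!("invalid interval: {e}"))?;
            if matches!(user_specified, Decouple::Disable) && interval > 1 {
                return Err("config conflict: `commit_checkpoint_interval` larger than 1 \
                            requires sink decouple to be enabled"
                    .to_string());
            }
        }
        None => {
            let default = match user_specified {
                Decouple::Default | Decouple::Enable => 10,
                Decouple::Disable => 1,
            };
            props.insert(KEY.to_string(), default.to_string());
        }
    }
    Ok(())
}

fn main() {
    let mut props = BTreeMap::new();
    apply_default_interval(&mut props, Decouple::Disable).unwrap();
    assert_eq!(props.get("commit_checkpoint_interval").map(String::as_str), Some("1"));
}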
- fn is_sink_decouple(_desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(user_specified: &SinkDecouple) -> Result { match user_specified { SinkDecouple::Default | SinkDecouple::Enable => Ok(true), SinkDecouple::Disable => Ok(false), diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs index 4988a00b95645..aa8ca0625d05f 100644 --- a/src/connector/src/sink/remote.rs +++ b/src/connector/src/sink/remote.rs @@ -59,7 +59,6 @@ use tracing::warn; use super::elasticsearch::{is_es_sink, StreamChunkConverter, ES_OPTION_DELIMITER}; use crate::error::ConnectorResult; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::log_store::{LogStoreReadItem, LogStoreResult, TruncateOffset}; use crate::sink::writer::{LogSinkerOf, SinkWriter, SinkWriterExt}; @@ -115,7 +114,7 @@ def_remote_sink!(); pub trait RemoteSinkTrait: Send + Sync + 'static { const SINK_NAME: &'static str; - fn default_sink_decouple(_desc: &SinkDesc) -> bool { + fn default_sink_decouple() -> bool { true } } @@ -143,9 +142,9 @@ impl Sink for RemoteSink { const SINK_NAME: &'static str = R::SINK_NAME; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(user_specified: &SinkDecouple) -> Result { match user_specified { - SinkDecouple::Default => Ok(R::default_sink_decouple(desc)), + SinkDecouple::Default => Ok(R::default_sink_decouple()), SinkDecouple::Enable => Ok(true), SinkDecouple::Disable => Ok(false), } diff --git a/src/connector/src/sink/starrocks.rs b/src/connector/src/sink/starrocks.rs index 21a4fc371b940..5c3e724721d18 100644 --- a/src/connector/src/sink/starrocks.rs +++ b/src/connector/src/sink/starrocks.rs @@ -24,7 +24,6 @@ use mysql_async::Opts; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::Schema; -use risingwave_common::session_config::sink_decouple::SinkDecouple; use risingwave_common::types::DataType; use risingwave_pb::connector_service::sink_metadata::Metadata::Serialized; use risingwave_pb::connector_service::sink_metadata::SerializedMetadata; @@ -38,7 +37,7 @@ use tokio::task::JoinHandle; use url::form_urlencoded; use with_options::WithOptions; -use super::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL; +use super::decouple_checkpoint_log_sink::DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE; use super::doris_starrocks_connector::{ HeaderBuilder, InserterInner, StarrocksTxnRequestBuilder, STARROCKS_DELETE_SIGN, STARROCKS_SUCCESS_STATUS, @@ -48,7 +47,6 @@ use super::{ SinkCommitCoordinator, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT, }; -use crate::sink::catalog::desc::SinkDesc; use crate::sink::coordinate::CoordinatedSinkWriter; use crate::sink::decouple_checkpoint_log_sink::DecoupleCheckpointLogSinkerOf; use crate::sink::{Result, Sink, SinkWriter, SinkWriterParam}; @@ -118,7 +116,7 @@ pub struct StarrocksConfig { } fn default_commit_checkpoint_interval() -> u64 { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL + DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE } impl StarrocksConfig { @@ -264,29 +262,6 @@ impl Sink for StarrocksSink { const SINK_NAME: &'static str = STARROCKS_SINK; - fn is_sink_decouple(desc: &SinkDesc, user_specified: &SinkDecouple) -> Result { - let commit_checkpoint_interval = - if let Some(interval) = desc.properties.get("commit_checkpoint_interval") { - interval - .parse::() - 
.unwrap_or(DEFAULT_COMMIT_CHECKPOINT_INTERVAL) - } else { - DEFAULT_COMMIT_CHECKPOINT_INTERVAL - }; - - match user_specified { - SinkDecouple::Default | SinkDecouple::Enable => Ok(true), - SinkDecouple::Disable => { - if commit_checkpoint_interval > 1 { - return Err(SinkError::Config(anyhow!( - "config conflict: Starrocks config `commit_checkpoint_interval` larger than 1 means that sink decouple must be enabled, but session config sink_decouple is disabled" - ))); - } - Ok(false) - } - } - } - async fn validate(&self) -> Result<()> { if !self.is_append_only && self.pk_indices.is_empty() { return Err(SinkError::Config(anyhow!( diff --git a/src/connector/src/sink/trivial.rs b/src/connector/src/sink/trivial.rs index 5c5e093c8e0f0..e19f99943338c 100644 --- a/src/connector/src/sink/trivial.rs +++ b/src/connector/src/sink/trivial.rs @@ -17,7 +17,6 @@ use std::marker::PhantomData; use async_trait::async_trait; use risingwave_common::session_config::sink_decouple::SinkDecouple; -use super::catalog::desc::SinkDesc; use crate::sink::log_store::{LogStoreReadItem, TruncateOffset}; use crate::sink::{ DummySinkCommitCoordinator, LogSinker, Result, Sink, SinkError, SinkLogReader, SinkParam, @@ -67,7 +66,7 @@ impl Sink for TrivialSink { const SINK_NAME: &'static str = T::SINK_NAME; // Disable sink decoupling for all trivial sinks because it introduces overhead without any benefit - fn is_sink_decouple(_desc: &SinkDesc, _user_specified: &SinkDecouple) -> Result { + fn is_sink_decouple(_user_specified: &SinkDecouple) -> Result { Ok(false) } diff --git a/src/connector/src/source/cdc/external/mod.rs b/src/connector/src/source/cdc/external/mod.rs index be1c891b8d078..7a73f9b9bce98 100644 --- a/src/connector/src/source/cdc/external/mod.rs +++ b/src/connector/src/source/cdc/external/mod.rs @@ -237,7 +237,12 @@ pub struct ExternalTableConfig { /// Choices include `disabled`, `preferred`, and `required`. /// This field is optional. #[serde(rename = "ssl.mode", default = "Default::default")] - pub sslmode: SslMode, + #[serde(alias = "debezium.database.sslmode")] + pub ssl_mode: SslMode, + + #[serde(rename = "ssl.root.cert")] + #[serde(alias = "debezium.database.sslrootcert")] + pub ssl_root_cert: Option, } impl ExternalTableConfig { @@ -253,7 +258,7 @@ impl ExternalTableConfig { } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, PartialEq, Deserialize)] #[serde(rename_all = "lowercase")] pub enum SslMode { #[serde(alias = "disable")] @@ -262,6 +267,14 @@ pub enum SslMode { Preferred, #[serde(alias = "require")] Required, + /// verify that the server is trustworthy by checking the certificate chain + /// up to the root certificate stored on the client. + #[serde(alias = "verify-ca")] + VerifyCa, + /// Besides verify the certificate, will also verify that the serverhost name + /// matches the name stored in the server certificate. 
+ #[serde(alias = "verify-full")] + VerifyFull, } impl Default for SslMode { @@ -277,6 +290,8 @@ impl fmt::Display for SslMode { SslMode::Disabled => "disabled", SslMode::Preferred => "preferred", SslMode::Required => "required", + SslMode::VerifyCa => "verify-ca", + SslMode::VerifyFull => "verify-full", }) } } diff --git a/src/connector/src/source/cdc/external/mysql.rs b/src/connector/src/source/cdc/external/mysql.rs index 0e7ec02cfac27..59971f8761068 100644 --- a/src/connector/src/source/cdc/external/mysql.rs +++ b/src/connector/src/source/cdc/external/mysql.rs @@ -85,9 +85,12 @@ impl MySqlExternalTable { .host(&config.host) .port(config.port.parse::().unwrap()) .database(&config.database) - .ssl_mode(match config.sslmode { + .ssl_mode(match config.ssl_mode { SslMode::Disabled | SslMode::Preferred => sqlx::mysql::MySqlSslMode::Disabled, SslMode::Required => sqlx::mysql::MySqlSslMode::Required, + _ => { + return Err(anyhow!("unsupported SSL mode").into()); + } }); let connection = MySqlPool::connect_with(options).await?; @@ -308,9 +311,10 @@ impl MySqlExternalTableReader { .tcp_port(config.port.parse::().unwrap()) .db_name(Some(config.database)); - opts_builder = match config.sslmode { + opts_builder = match config.ssl_mode { SslMode::Disabled | SslMode::Preferred => opts_builder.ssl_opts(None), - SslMode::Required => { + // verify-ca and verify-full are same as required for mysql now + SslMode::Required | SslMode::VerifyCa | SslMode::VerifyFull => { let ssl_without_verify = mysql_async::SslOpts::default() .with_danger_accept_invalid_certs(true) .with_danger_skip_domain_validation(true); @@ -529,7 +533,8 @@ mod tests { database: "mydb".to_string(), schema: "".to_string(), table: "part".to_string(), - sslmode: Default::default(), + ssl_mode: Default::default(), + ssl_root_cert: None, }; let table = MySqlExternalTable::connect(config).await.unwrap(); diff --git a/src/connector/src/source/cdc/external/postgres.rs b/src/connector/src/source/cdc/external/postgres.rs index ca0caf46d6125..9123c7451b74e 100644 --- a/src/connector/src/source/cdc/external/postgres.rs +++ b/src/connector/src/source/cdc/external/postgres.rs @@ -86,18 +86,26 @@ pub struct PostgresExternalTable { impl PostgresExternalTable { pub async fn connect(config: ExternalTableConfig) -> ConnectorResult { tracing::debug!("connect to postgres external table"); - let options = PgConnectOptions::new() + let mut options = PgConnectOptions::new() .username(&config.username) .password(&config.password) .host(&config.host) .port(config.port.parse::().unwrap()) .database(&config.database) - .ssl_mode(match config.sslmode { + .ssl_mode(match config.ssl_mode { SslMode::Disabled => PgSslMode::Disable, SslMode::Preferred => PgSslMode::Prefer, SslMode::Required => PgSslMode::Require, + SslMode::VerifyCa => PgSslMode::VerifyCa, + SslMode::VerifyFull => PgSslMode::VerifyFull, }); + if config.ssl_mode == SslMode::VerifyCa || config.ssl_mode == SslMode::VerifyFull { + if let Some(ref root_cert) = config.ssl_root_cert { + options = options.ssl_root_cert(root_cert.as_str()); + } + } + let connection = PgPool::connect_with(options).await?; let schema_discovery = SchemaDiscovery::new(connection, config.schema.as_str()); // fetch column schema and primary key @@ -288,8 +296,14 @@ impl PostgresExternalTableReader { .port(config.port.parse::().unwrap()) .dbname(&config.database); + let (_verify_ca, verify_hostname) = match config.ssl_mode { + SslMode::VerifyCa => (true, false), + SslMode::VerifyFull => (true, true), + _ => (false, false), + }; + 
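// A minimal sketch of how the `ssl.mode` / `ssl.root.cert` options above can be
// deserialized with serde aliases, using a trimmed-down config struct; the field
// names and aliases mirror the diff, while the surrounding connector plumbing
// and the `Default` handling are omitted.
use serde::Deserialize;

#[derive(Debug, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum SslMode {
    #[serde(alias = "disable")]
    Disabled,
    #[serde(alias = "prefer")]
    Preferred,
    #[serde(alias = "require")]
    Required,
    #[serde(alias = "verify-ca")]
    VerifyCa,
    #[serde(alias = "verify-full")]
    VerifyFull,
}

#[derive(Debug, Deserialize)]
struct TlsConfig {
    #[serde(rename = "ssl.mode", alias = "debezium.database.sslmode", default)]
    ssl_mode: Option<SslMode>,
    #[serde(rename = "ssl.root.cert", alias = "debezium.database.sslrootcert", default)]
    ssl_root_cert: Option<String>,
}

fn main() {
    let cfg: TlsConfig =
        serde_json::from_str(r#"{"ssl.mode": "verify-full", "ssl.root.cert": "/tmp/ca.pem"}"#)
            .unwrap();
    // verify-full checks both the CA chain and that the certificate matches the host name,
    // so a root certificate is typically provided alongside it.
    assert_eq!(cfg.ssl_mode, Some(SslMode::VerifyFull));
    assert_eq!(cfg.ssl_root_cert.as_deref(), Some("/tmp/ca.pem"));
}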
#[cfg(not(madsim))] - let connector = match config.sslmode { + let connector = match config.ssl_mode { SslMode::Disabled => { pg_config.ssl_mode(tokio_postgres::config::SslMode::Disable); MaybeMakeTlsConnector::NoTls(NoTls) @@ -315,6 +329,24 @@ impl PostgresExternalTableReader { builder.set_verify(SslVerifyMode::NONE); MaybeMakeTlsConnector::Tls(MakeTlsConnector::new(builder.build())) } + + SslMode::VerifyCa | SslMode::VerifyFull => { + pg_config.ssl_mode(tokio_postgres::config::SslMode::Require); + let mut builder = SslConnector::builder(SslMethod::tls())?; + if let Some(ssl_root_cert) = config.ssl_root_cert { + builder.set_ca_file(ssl_root_cert).map_err(|e| { + anyhow!(format!("bad ssl root cert error: {}", e.to_report_string())) + })?; + } + let mut connector = MakeTlsConnector::new(builder.build()); + if !verify_hostname { + connector.set_callback(|config, _| { + config.set_verify_hostname(false); + Ok(()) + }); + } + MaybeMakeTlsConnector::Tls(connector) + } }; #[cfg(madsim)] let connector = NoTls; @@ -482,7 +514,8 @@ mod tests { database: "mydb".to_string(), schema: "public".to_string(), table: "mytest".to_string(), - sslmode: Default::default(), + ssl_mode: Default::default(), + ssl_root_cert: None, }; let table = PostgresExternalTable::connect(config).await.unwrap(); diff --git a/src/connector/src/source/cdc/source/reader.rs b/src/connector/src/source/cdc/source/reader.rs index b29ef1312bbd9..e2fc405cd6297 100644 --- a/src/connector/src/source/cdc/source/reader.rs +++ b/src/connector/src/source/cdc/source/reader.rs @@ -213,15 +213,15 @@ impl CdcSplitReader { let mut rx = self.rx; let source_id = self.source_id.to_string(); let metrics = self.source_ctx.metrics.clone(); + let connector_source_rows_received_metrics = metrics + .connector_source_rows_received + .with_guarded_label_values(&[source_type.as_str_name(), &source_id]); while let Some(result) = rx.recv().await { match result { Ok(GetEventStreamResponse { events, .. }) => { tracing::trace!("receive {} cdc events ", events.len()); - metrics - .connector_source_rows_received - .with_guarded_label_values(&[source_type.as_str_name(), &source_id]) - .inc_by(events.len() as u64); + connector_source_rows_received_metrics.inc_by(events.len() as u64); let msgs = events.into_iter().map(SourceMessage::from).collect_vec(); yield msgs; } diff --git a/src/connector/src/source/common.rs b/src/connector/src/source/common.rs index 3acb85a87150e..80aacff2899c7 100644 --- a/src/connector/src/source/common.rs +++ b/src/connector/src/source/common.rs @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
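// A minimal sketch of the metric-handle hoisting applied above: resolve the
// labelled counter once (or once per split via a small cache) instead of
// re-resolving the labels for every message. This uses the plain `prometheus`
// crate rather than RisingWave's guarded wrappers.
use std::collections::HashMap;

use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

fn main() {
    let rows_received: IntCounterVec = register_int_counter_vec!(
        "connector_source_rows_received",
        "rows received per source and split",
        &["source_id", "split_id"]
    )
    .unwrap();

    // Hoist: look up the labelled child once per split, then reuse it in the hot loop.
    let mut per_split: HashMap<String, IntCounter> = HashMap::new();
    for (split_id, batch_len) in [("0", 128u64), ("1", 64), ("0", 256)] {
        let counter = per_split
            .entry(split_id.to_string())
            .or_insert_with(|| rows_received.with_label_values(&["42", split_id]));
        counter.inc_by(batch_len);
    }
}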
+use std::collections::HashMap; use futures::{Stream, StreamExt, TryStreamExt}; use futures_async_stream::try_stream; @@ -33,6 +34,8 @@ pub(crate) async fn into_chunk_stream( let source_id = source_ctx.source_id.to_string(); let source_name = source_ctx.source_name.to_string(); let metrics = source_ctx.metrics.clone(); + let mut partition_input_count = HashMap::new(); + let mut partition_bytes_count = HashMap::new(); // add metrics to the data stream let data_stream = data_stream @@ -40,22 +43,38 @@ pub(crate) async fn into_chunk_stream( let mut by_split_id = std::collections::HashMap::new(); for msg in data_batch { + let split_id: String = msg.split_id.as_ref().to_string(); by_split_id - .entry(msg.split_id.as_ref()) + .entry(split_id.clone()) .or_insert_with(Vec::new) .push(msg); + partition_input_count + .entry(split_id.clone()) + .or_insert_with(|| { + metrics.partition_input_count.with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id.clone(), + &source_name, + &fragment_id, + ]) + }); + partition_bytes_count + .entry(split_id.clone()) + .or_insert_with(|| { + metrics.partition_input_bytes.with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]) + }); } - for (split_id, msgs) in by_split_id { - metrics - .partition_input_count - .with_guarded_label_values(&[ - &actor_id, - &source_id, - split_id, - &source_name, - &fragment_id, - ]) + partition_input_count + .get_mut(&split_id) + .unwrap() .inc_by(msgs.len() as u64); let sum_bytes = msgs @@ -63,15 +82,9 @@ pub(crate) async fn into_chunk_stream( .flat_map(|msg| msg.payload.as_ref().map(|p| p.len() as u64)) .sum(); - metrics - .partition_input_bytes - .with_guarded_label_values(&[ - &actor_id, - &source_id, - split_id, - &source_name, - &fragment_id, - ]) + partition_input_count + .get_mut(&split_id) + .unwrap() .inc_by(sum_bytes); } }) diff --git a/src/connector/src/source/datagen/source/reader.rs b/src/connector/src/source/datagen/source/reader.rs index e6c6db6af5a71..33c0c4ea29261 100644 --- a/src/connector/src/source/datagen/source/reader.rs +++ b/src/connector/src/source/datagen/source/reader.rs @@ -18,6 +18,7 @@ use anyhow::Context; use async_trait::async_trait; use futures::{Stream, StreamExt, TryStreamExt}; use risingwave_common::field_generator::{FieldGeneratorImpl, VarcharProperty}; +use risingwave_common_estimate_size::EstimateSize; use thiserror_ext::AsReport; use super::generator::DatagenEventGenerator; @@ -156,20 +157,30 @@ impl SplitReader for DatagenSplitReader { let source_name = self.source_ctx.source_name.to_string(); let split_id = self.split_id.to_string(); let metrics = self.source_ctx.metrics.clone(); + let partition_input_count_metric = + metrics.partition_input_count.with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]); + let partition_input_bytes_metric = + metrics.partition_input_bytes.with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]); + spawn_data_generation_stream( self.generator .into_native_stream() .inspect_ok(move |stream_chunk| { - metrics - .partition_input_count - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(stream_chunk.cardinality() as u64); + partition_input_count_metric.inc_by(stream_chunk.cardinality() as u64); + partition_input_bytes_metric + .inc_by(stream_chunk.estimated_size() as u64); }), BUFFER_SIZE, ) diff --git 
a/src/connector/src/source/filesystem/opendal_source/mod.rs b/src/connector/src/source/filesystem/opendal_source/mod.rs index cea4972def92c..c0b4898758a79 100644 --- a/src/connector/src/source/filesystem/opendal_source/mod.rs +++ b/src/connector/src/source/filesystem/opendal_source/mod.rs @@ -47,12 +47,7 @@ pub struct FsSourceCommon { #[serde(rename = "refresh.interval.sec")] #[serde_as(as = "Option")] pub refresh_interval_sec: Option, - - #[serde(rename = "recursive_scan", default)] - #[serde_as(as = "Option")] - pub recursive_scan: Option, } - #[derive(Clone, Debug, Deserialize, PartialEq, WithOptions)] pub struct GcsProperties { #[serde(rename = "gcs.bucket_name")] diff --git a/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs b/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs index a9cb4b6c3f7f0..c9788aed28e69 100644 --- a/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs +++ b/src/connector/src/source/filesystem/opendal_source/opendal_enumerator.rs @@ -66,13 +66,13 @@ impl SplitEnumerator for OpendalEnumerator { } impl OpendalEnumerator { - pub async fn list(&self, recursive_scan: bool) -> ConnectorResult { + pub async fn list(&self) -> ConnectorResult { let prefix = self.prefix.as_deref().unwrap_or("/"); let object_lister = self .op .lister_with(prefix) - .recursive(recursive_scan) + .recursive(true) .metakey(Metakey::ContentLength | Metakey::LastModified) .await?; let stream = stream::unfold(object_lister, |mut object_lister| async move { @@ -108,5 +108,9 @@ impl OpendalEnumerator { pub fn get_matcher(&self) -> &Option { &self.matcher } + + pub fn get_prefix(&self) -> &str { + self.prefix.as_deref().unwrap_or("/") + } } pub type ObjectMetadataIter = BoxStream<'static, ConnectorResult>; diff --git a/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs b/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs index 5757452d2b4cd..1cfc9c1355167 100644 --- a/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs +++ b/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs @@ -176,6 +176,16 @@ impl OpendalReader { let mut offset: usize = split.offset; let mut batch_size: usize = 0; let mut batch = Vec::new(); + let partition_input_bytes_metrics = source_ctx + .metrics + .partition_input_bytes + .with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]); let stream = ReaderStream::with_capacity(buf_reader, STREAM_READER_CAPACITY); #[for_await] for read in stream { @@ -193,34 +203,14 @@ impl OpendalReader { batch.push(msg); if batch.len() >= max_chunk_size { - source_ctx - .metrics - .partition_input_bytes - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(batch_size as u64); + partition_input_bytes_metrics.inc_by(batch_size as u64); let yield_batch = std::mem::take(&mut batch); batch_size = 0; yield yield_batch; } } if !batch.is_empty() { - source_ctx - .metrics - .partition_input_bytes - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(batch_size as u64); + partition_input_bytes_metrics.inc_by(batch_size as u64); yield batch; } } diff --git a/src/connector/src/source/filesystem/s3/source/reader.rs b/src/connector/src/source/filesystem/s3/source/reader.rs index 7e02102686d00..910c98c1a5dae 100644 --- a/src/connector/src/source/filesystem/s3/source/reader.rs +++ 
b/src/connector/src/source/filesystem/s3/source/reader.rs @@ -106,6 +106,16 @@ impl S3FileReader { let mut offset: usize = split.offset; let mut batch_size: usize = 0; let mut batch = Vec::new(); + let partition_input_bytes_metrics = source_ctx + .metrics + .partition_input_bytes + .with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]); #[for_await] for read in stream { let bytes = read?; @@ -121,34 +131,14 @@ impl S3FileReader { batch_size += len; batch.push(msg); if batch.len() >= max_chunk_size { - source_ctx - .metrics - .partition_input_bytes - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(batch_size as u64); + partition_input_bytes_metrics.inc_by(batch_size as u64); let yield_batch = std::mem::take(&mut batch); batch_size = 0; yield yield_batch; } } if !batch.is_empty() { - source_ctx - .metrics - .partition_input_bytes - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(batch_size as u64); + partition_input_bytes_metrics.inc_by(batch_size as u64); yield batch; } } diff --git a/src/connector/src/source/iceberg/mod.rs b/src/connector/src/source/iceberg/mod.rs index d65929faafba1..845ffb66804d3 100644 --- a/src/connector/src/source/iceberg/mod.rs +++ b/src/connector/src/source/iceberg/mod.rs @@ -21,6 +21,7 @@ use async_trait::async_trait; use futures_async_stream::for_await; use iceberg::scan::FileScanTask; use iceberg::spec::TableMetadata; +use iceberg::table::Table; use itertools::Itertools; pub use parquet_file_reader::*; use risingwave_common::bail; @@ -28,7 +29,7 @@ use risingwave_common::catalog::Schema; use risingwave_common::types::JsonbVal; use serde::{Deserialize, Serialize}; -use crate::error::ConnectorResult; +use crate::error::{ConnectorError, ConnectorResult}; use crate::parser::ParserConfig; use crate::sink::iceberg::IcebergConfig; use crate::source::{ @@ -144,6 +145,7 @@ pub struct IcebergSplit { pub snapshot_id: i64, pub table_meta: TableMetadataJsonStr, pub files: Vec, + pub eq_delete_files: Vec, } impl SplitMetaData for IcebergSplit { @@ -206,6 +208,7 @@ impl IcebergSplitEnumerator { bail!("Batch parallelism is 0. Cannot split the iceberg files."); } let table = self.config.load_table_v2().await?; + let current_snapshot = table.metadata().current_snapshot(); if current_snapshot.is_none() { // If there is no snapshot, we will return a mock `IcebergSplit` with empty files. @@ -214,6 +217,7 @@ impl IcebergSplitEnumerator { snapshot_id: 0, // unused table_meta: TableMetadataJsonStr::serialize(table.metadata()), files: vec![], + eq_delete_files: vec![], }]); } @@ -228,10 +232,13 @@ impl IcebergSplitEnumerator { let snapshot = table .metadata() .snapshots() - .filter(|snapshot| snapshot.timestamp().timestamp_millis() <= timestamp) - .max_by_key(|snapshot| snapshot.timestamp().timestamp_millis()); + .map(|snapshot| snapshot.timestamp().map(|ts| ts.timestamp_millis())) + .collect::, _>>()? 
+ .into_iter() + .filter(|&snapshot_millis| snapshot_millis <= timestamp) + .max_by_key(|&snapshot_millis| snapshot_millis); match snapshot { - Some(snapshot) => snapshot.snapshot_id(), + Some(snapshot) => snapshot, None => { // convert unix time to human readable time let time = chrono::DateTime::from_timestamp_millis(timestamp); @@ -248,12 +255,15 @@ impl IcebergSplitEnumerator { current_snapshot.unwrap().snapshot_id() } }; - let mut files = vec![]; + let require_names = Self::get_require_field_names(&table, snapshot_id, schema).await?; + + let mut data_files = vec![]; + let mut eq_delete_files = vec![]; let scan = table .scan() .snapshot_id(snapshot_id) - .select(schema.names()) + .select(require_names) .build() .map_err(|e| anyhow!(e))?; @@ -261,16 +271,27 @@ impl IcebergSplitEnumerator { #[for_await] for task in file_scan_stream { - let task = task.map_err(|e| anyhow!(e))?; - files.push(IcebergFileScanTaskJsonStr::serialize(&task)); + let mut task: FileScanTask = task.map_err(|e| anyhow!(e))?; + match task.data_file_content { + iceberg::spec::DataContentType::Data => { + data_files.push(IcebergFileScanTaskJsonStr::serialize(&task)); + } + iceberg::spec::DataContentType::EqualityDeletes => { + task.project_field_ids = task.equality_ids.clone(); + eq_delete_files.push(IcebergFileScanTaskJsonStr::serialize(&task)); + } + iceberg::spec::DataContentType::PositionDeletes => { + bail!("Position delete file is not supported") + } + } } let table_meta = TableMetadataJsonStr::serialize(table.metadata()); let split_num = batch_parallelism; // evenly split the files into splits based on the parallelism. - let split_size = files.len() / split_num; - let remaining = files.len() % split_num; + let split_size = data_files.len() / split_num; + let remaining = data_files.len() % split_num; let mut splits = vec![]; for i in 0..split_num { let start = i * split_size; @@ -279,20 +300,62 @@ impl IcebergSplitEnumerator { split_id: i as i64, snapshot_id, table_meta: table_meta.clone(), - files: files[start..end].to_vec(), + files: data_files[start..end].to_vec(), + eq_delete_files: eq_delete_files.clone(), }; splits.push(split); } for i in 0..remaining { splits[i] .files - .push(files[split_num * split_size + i].clone()); + .push(data_files[split_num * split_size + i].clone()); } Ok(splits .into_iter() .filter(|split| !split.files.is_empty()) .collect_vec()) } + + async fn get_require_field_names( + table: &Table, + snapshot_id: i64, + rw_schema: Schema, + ) -> ConnectorResult> { + let scan = table + .scan() + .snapshot_id(snapshot_id) + .build() + .map_err(|e| anyhow!(e))?; + let file_scan_stream = scan.plan_files().await.map_err(|e| anyhow!(e))?; + let schema = scan.snapshot().schema(table.metadata())?; + let mut equality_ids = vec![]; + #[for_await] + for task in file_scan_stream { + let task: FileScanTask = task.map_err(|e| anyhow!(e))?; + if task.data_file_content == iceberg::spec::DataContentType::EqualityDeletes { + if equality_ids.is_empty() { + equality_ids = task.equality_ids; + } else if equality_ids != task.equality_ids { + bail!("The schema of iceberg equality delete file must be consistent"); + } + } + } + let delete_columns = equality_ids + .into_iter() + .map(|id| match schema.name_by_field_id(id) { + Some(name) => Ok::(name.to_string()), + None => bail!("Delete field id {} not found in schema", id), + }) + .collect::>>()?; + let mut require_field_names: Vec<_> = rw_schema.names().to_vec(); + // Add the delete columns to the required field names + for names in delete_columns { + if 
!require_field_names.contains(&names) { + require_field_names.push(names); + } + } + Ok(require_field_names) + } } #[derive(Debug)] diff --git a/src/connector/src/source/kafka/enumerator/client.rs b/src/connector/src/source/kafka/enumerator/client.rs index 5551c12b433b3..a425de418ef4a 100644 --- a/src/connector/src/source/kafka/enumerator/client.rs +++ b/src/connector/src/source/kafka/enumerator/client.rs @@ -17,10 +17,12 @@ use std::time::Duration; use anyhow::{anyhow, Context}; use async_trait::async_trait; +use prometheus::core::{AtomicI64, GenericGauge}; use rdkafka::consumer::{BaseConsumer, Consumer}; use rdkafka::error::KafkaResult; use rdkafka::{Offset, TopicPartitionList}; use risingwave_common::bail; +use risingwave_common::metrics::LabelGuardedMetric; use crate::error::ConnectorResult; use crate::source::base::SplitEnumerator; @@ -49,6 +51,7 @@ pub struct KafkaSplitEnumerator { stop_offset: KafkaEnumeratorOffset, sync_call_timeout: Duration, + high_watermark_metrics: HashMap, 2>>, } impl KafkaSplitEnumerator {} @@ -124,6 +127,7 @@ impl SplitEnumerator for KafkaSplitEnumerator { start_offset: scan_start_offset, stop_offset: KafkaEnumeratorOffset::None, sync_call_timeout: properties.common.sync_call_timeout, + high_watermark_metrics: HashMap::new(), }) } @@ -160,7 +164,10 @@ impl SplitEnumerator for KafkaSplitEnumerator { } impl KafkaSplitEnumerator { - async fn get_watermarks(&self, partitions: &[i32]) -> KafkaResult> { + async fn get_watermarks( + &mut self, + partitions: &[i32], + ) -> KafkaResult> { let mut map = HashMap::new(); for partition in partitions { let (low, high) = self @@ -358,15 +365,20 @@ impl KafkaSplitEnumerator { } #[inline] - fn report_high_watermark(&self, partition: i32, offset: i64) { - self.context - .metrics - .high_watermark - .with_guarded_label_values(&[ - &self.context.info.source_id.to_string(), - &partition.to_string(), - ]) - .set(offset); + fn report_high_watermark(&mut self, partition: i32, offset: i64) { + let high_watermark_metrics = + self.high_watermark_metrics + .entry(partition) + .or_insert_with(|| { + self.context + .metrics + .high_watermark + .with_guarded_label_values(&[ + &self.context.info.source_id.to_string(), + &partition.to_string(), + ]) + }); + high_watermark_metrics.set(offset); } pub async fn check_reachability(&self) -> bool { diff --git a/src/connector/src/source/kafka/source/reader.rs b/src/connector/src/source/kafka/source/reader.rs index 72d4c36377c81..d58f1b70dd9fc 100644 --- a/src/connector/src/source/kafka/source/reader.rs +++ b/src/connector/src/source/kafka/source/reader.rs @@ -21,10 +21,12 @@ use anyhow::Context; use async_trait::async_trait; use futures::StreamExt; use futures_async_stream::try_stream; +use prometheus::core::{AtomicI64, GenericGauge}; use rdkafka::config::RDKafkaLogLevel; use rdkafka::consumer::{Consumer, StreamConsumer}; use rdkafka::error::KafkaError; use rdkafka::{ClientConfig, Message, Offset, TopicPartitionList}; +use risingwave_common::metrics::LabelGuardedMetric; use risingwave_pb::plan_common::additional_column::ColumnType as AdditionalColumnType; use crate::error::ConnectorResult as Result; @@ -185,21 +187,6 @@ impl SplitReader for KafkaSplitReader { } } -impl KafkaSplitReader { - fn report_latest_message_id(&self, split_id: &str, offset: i64) { - self.source_ctx - .metrics - .latest_message_id - .with_guarded_label_values(&[ - // source name is not available here - &self.source_ctx.source_id.to_string(), - &self.source_ctx.actor_id.to_string(), - split_id, - ]) - .set(offset); - } -} - 
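// Sketch of the memoization used in the Kafka enumerator hunk above, again
// with plain prometheus types standing in for the guarded wrappers: the
// labelled gauge is created on first use per partition and cached, so
// repeated watermark reports only pay for `set`.
// (Relies on edition-2021 disjoint closure captures, as the real code does.)
use std::collections::HashMap;

use prometheus::{IntGauge, IntGaugeVec, Opts};

struct WatermarkReporter {
    source_id: String,
    high_watermark: IntGaugeVec,
    cache: HashMap<i32, IntGauge>,
}

impl WatermarkReporter {
    fn new(source_id: String) -> Self {
        let high_watermark = IntGaugeVec::new(
            Opts::new("high_watermark", "latest high watermark per partition"),
            &["source_id", "partition"],
        )
        .unwrap();
        Self {
            source_id,
            high_watermark,
            cache: HashMap::new(),
        }
    }

    fn report(&mut self, partition: i32, offset: i64) {
        let gauge = self.cache.entry(partition).or_insert_with(|| {
            self.high_watermark
                .with_label_values(&[&self.source_id, &partition.to_string()])
        });
        gauge.set(offset);
    }
}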
impl KafkaSplitReader { #[try_stream(ok = Vec, error = crate::error::ConnectorError)] async fn into_data_stream(self) { @@ -236,6 +223,11 @@ impl KafkaSplitReader { ) }); + let mut latest_message_id_metrics: HashMap< + String, + LabelGuardedMetric, 3>, + > = HashMap::new(); + #[for_await] 'for_outer_loop: for msgs in self.consumer.stream().ready_chunks(max_chunk_size) { let msgs: Vec<_> = msgs @@ -250,7 +242,20 @@ impl KafkaSplitReader { for (partition, offset) in split_msg_offsets { let split_id = partition.to_string(); - self.report_latest_message_id(&split_id, offset); + latest_message_id_metrics + .entry(split_id.clone()) + .or_insert_with(|| { + self.source_ctx + .metrics + .latest_message_id + .with_guarded_label_values(&[ + // source name is not available here + &self.source_ctx.source_id.to_string(), + &self.source_ctx.actor_id.to_string(), + &split_id, + ]) + }) + .set(offset); } for msg in msgs { diff --git a/src/connector/src/source/nexmark/source/reader.rs b/src/connector/src/source/nexmark/source/reader.rs index ebcbc0b0aaf32..aea85c5c551cf 100644 --- a/src/connector/src/source/nexmark/source/reader.rs +++ b/src/connector/src/source/nexmark/source/reader.rs @@ -115,31 +115,30 @@ impl SplitReader for NexmarkSplitReader { let split_id = self.split_id.clone(); let metrics = self.source_ctx.metrics.clone(); + let partition_input_count_metric = + metrics.partition_input_count.with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]); + let partition_input_bytes_metric = + metrics.partition_input_bytes.with_guarded_label_values(&[ + &actor_id, + &source_id, + &split_id, + &source_name, + &fragment_id, + ]); + // Will buffer at most 4 event chunks. const BUFFER_SIZE: usize = 4; spawn_data_generation_stream( self.into_native_stream() .inspect_ok(move |chunk: &StreamChunk| { - metrics - .partition_input_count - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(chunk.cardinality() as u64); - metrics - .partition_input_bytes - .with_guarded_label_values(&[ - &actor_id, - &source_id, - &split_id, - &source_name, - &fragment_id, - ]) - .inc_by(chunk.estimated_size() as u64); + partition_input_count_metric.inc_by(chunk.cardinality() as u64); + partition_input_bytes_metric.inc_by(chunk.estimated_size() as u64); }), BUFFER_SIZE, ) diff --git a/src/connector/src/source/pulsar/mod.rs b/src/connector/src/source/pulsar/mod.rs index 5d6d111b13bff..ffbc3be495bf9 100644 --- a/src/connector/src/source/pulsar/mod.rs +++ b/src/connector/src/source/pulsar/mod.rs @@ -74,6 +74,16 @@ pub struct PulsarProperties { #[serde(rename = "iceberg.bucket", default)] pub iceberg_bucket: Option, + /// Specify a custom consumer group id prefix for the source. + /// Defaults to `rw-consumer`. + /// + /// Notes: + /// - Each job (materialized view) will have multiple subscriptions and + /// contains a generated suffix in the subscription name. + /// The subscription name will be `{subscription_name_prefix}-{fragment_id}-{actor_id}`. 
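// Illustrative helper (assumed, not taken from the patch) showing how the
// subscription name described above is assembled; `rw-consumer` is the
// default prefix used when `subscription.name.prefix` is not set.
fn subscription_name(prefix: Option<&str>, fragment_id: u32, actor_id: u32) -> String {
    format!(
        "{}-{}-{}",
        prefix.unwrap_or("rw-consumer"),
        fragment_id,
        actor_id
    )
}

// e.g. subscription_name(Some("my-prefix"), 12, 7) == "my-prefix-12-7"
//      subscription_name(None, 12, 7)              == "rw-consumer-12-7"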
+ #[serde(rename = "subscription.name.prefix")] + pub subscription_name_prefix: Option, + #[serde(flatten)] pub unknown_fields: HashMap, } diff --git a/src/connector/src/source/pulsar/source/reader.rs b/src/connector/src/source/pulsar/source/reader.rs index 212c459388b25..20f6872474e88 100644 --- a/src/connector/src/source/pulsar/source/reader.rs +++ b/src/connector/src/source/pulsar/source/reader.rs @@ -42,6 +42,8 @@ use crate::source::{ SplitMetaData, SplitReader, }; +const PULSAR_DEFAULT_SUBSCRIPTION_PREFIX: &str = "rw-consumer"; + pub enum PulsarSplitReader { Broker(PulsarBrokerReader), Iceberg(PulsarIcebergReader), @@ -174,8 +176,12 @@ impl SplitReader for PulsarBrokerReader { .with_topic(&topic) .with_subscription_type(SubType::Exclusive) .with_subscription(format!( - "rw-consumer-{}-{}", - source_ctx.fragment_id, source_ctx.actor_id + "{}-{}-{}", + props + .subscription_name_prefix + .unwrap_or(PULSAR_DEFAULT_SUBSCRIPTION_PREFIX.to_string()), + source_ctx.fragment_id, + source_ctx.actor_id )); let builder = match split.start_offset.clone() { diff --git a/src/connector/src/source/reader/reader.rs b/src/connector/src/source/reader/reader.rs index 9a7cb1e440e9f..b3a1cb5380d8c 100644 --- a/src/connector/src/source/reader/reader.rs +++ b/src/connector/src/source/reader/reader.rs @@ -93,47 +93,27 @@ impl SourceReader { match config { ConnectorProperties::Gcs(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_gcs_source(*prop)?; - Ok(build_opendal_fs_list_stream( - lister, - list_interval_sec, - recursive_scan, - )) + Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) } ConnectorProperties::OpendalS3(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_s3_source(prop.s3_properties, prop.assume_role)?; - Ok(build_opendal_fs_list_stream( - lister, - list_interval_sec, - recursive_scan, - )) + Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) } ConnectorProperties::Azblob(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_azblob_source(*prop)?; - Ok(build_opendal_fs_list_stream( - lister, - list_interval_sec, - recursive_scan, - )) + Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) } ConnectorProperties::PosixFs(prop) => { list_interval_sec = get_list_interval_sec(prop.fs_common.refresh_interval_sec); - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); let lister: OpendalEnumerator = OpendalEnumerator::new_posix_fs_source(*prop)?; - Ok(build_opendal_fs_list_stream( - lister, - list_interval_sec, - recursive_scan, - )) + Ok(build_opendal_fs_list_stream(lister, list_interval_sec)) } other => bail!("Unsupported source: {:?}", other), } @@ -284,11 +264,10 @@ impl SourceReader { async fn build_opendal_fs_list_stream( lister: OpendalEnumerator, list_interval_sec: u64, - recursive_scan: bool, ) { loop { let matcher = lister.get_matcher(); - let mut object_metadata_iter = lister.list(recursive_scan).await?; + let mut object_metadata_iter = lister.list().await?; while let Some(list_res) = object_metadata_iter.next().await { match list_res { @@ -300,7 
+279,6 @@ async fn build_opendal_fs_list_stream( { yield res } else { - // Currrntly due to the lack of prefix list, we just skip the unmatched files. continue; } } @@ -315,12 +293,9 @@ async fn build_opendal_fs_list_stream( } #[try_stream(boxed, ok = OpendalFsSplit, error = crate::error::ConnectorError)] -pub async fn build_opendal_fs_list_for_batch( - lister: OpendalEnumerator, - recursive_scan: bool, -) { +pub async fn build_opendal_fs_list_for_batch(lister: OpendalEnumerator) { let matcher = lister.get_matcher(); - let mut object_metadata_iter = lister.list(recursive_scan).await?; + let mut object_metadata_iter = lister.list().await?; while let Some(list_res) = object_metadata_iter.next().await { match list_res { diff --git a/src/connector/src/test_data/any-schema.pb b/src/connector/src/test_data/any-schema.pb deleted file mode 100644 index 977f64cec3775..0000000000000 --- a/src/connector/src/test_data/any-schema.pb +++ /dev/null @@ -1,30 +0,0 @@ - -ä -google/protobuf/any.protogoogle.protobuf"6 -Any -type_url ( RtypeUrl -value ( RvalueBv -com.google.protobufBAnyProtoPZ,google.golang.org/protobuf/types/known/anypb¢GPBªGoogle.Protobuf.WellKnownTypesbproto3 -á -any-schema.prototestgoogle/protobuf/any.proto"L -TestAny -id (Rid1 - any_value ( 2.google.protobuf.AnyRanyValue"# - StringValue -value ( Rvalue"" - -Int32Value -value (Rvalue"v -AnyValue4 - any_value_1 ( 2.google.protobuf.AnyR anyValue14 - any_value_2 ( 2.google.protobuf.AnyR anyValue2"@ -StringInt32Value -first ( Rfirst -second (Rsecond"Ž -StringStringInt32Value -first ( Rfirst. -second ( 2.test.StringInt32ValueRsecond. -third ( 2.test.Float32StringValueRthird"B -Float32StringValue -first (Rfirst -second ( Rsecondbproto3 \ No newline at end of file diff --git a/src/connector/src/test_data/complex-schema b/src/connector/src/test_data/complex-schema deleted file mode 100644 index ff7cd64120883..0000000000000 Binary files a/src/connector/src/test_data/complex-schema and /dev/null differ diff --git a/src/connector/src/test_data/proto_recursive/recursive.proto b/src/connector/src/test_data/proto_recursive/recursive.proto deleted file mode 100644 index 93f177055788c..0000000000000 --- a/src/connector/src/test_data/proto_recursive/recursive.proto +++ /dev/null @@ -1,95 +0,0 @@ -syntax = "proto3"; - -import "google/protobuf/timestamp.proto"; -import "google/protobuf/duration.proto"; -import "google/protobuf/any.proto"; -import "google/protobuf/wrappers.proto"; - -package recursive; - -message ComplexRecursiveMessage { - string node_name = 1; - int32 node_id = 2; - - message Attributes { - string key = 1; - string value = 2; - } - - repeated Attributes attributes = 3; - - message Parent { - string parent_name = 1; - int32 parent_id = 2; - repeated ComplexRecursiveMessage siblings = 3; - } - - Parent parent = 4; - repeated ComplexRecursiveMessage children = 5; -} - -message AllTypes { - // standard types - double double_field = 1; - float float_field = 2; - int32 int32_field = 3; - int64 int64_field = 4; - uint32 uint32_field = 5; - uint64 uint64_field = 6; - sint32 sint32_field = 7; - sint64 sint64_field = 8; - fixed32 fixed32_field = 9; - fixed64 fixed64_field = 10; - sfixed32 sfixed32_field = 11; - sfixed64 sfixed64_field = 12; - bool bool_field = 13; - string string_field = 14; - - bytes bytes_field = 15; - - // enum - enum EnumType { - DEFAULT = 0; - OPTION1 = 1; - OPTION2 = 2; - } - EnumType enum_field = 16; - - // nested message - message NestedMessage { - int32 id = 1; - string name = 2; - } - NestedMessage nested_message_field = 
17; - - // repeated field - repeated int32 repeated_int_field = 18; - - // oneof field - oneof example_oneof { - string oneof_string = 19; - int32 oneof_int32 = 20; - EnumType oneof_enum = 21; - } - - // // map field - // map map_field = 22; - - // timestamp - google.protobuf.Timestamp timestamp_field = 23; - - // duration - google.protobuf.Duration duration_field = 24; - - // any - google.protobuf.Any any_field = 25; - - // -- Unsupported - // // struct - // import "google/protobuf/struct.proto"; - // google.protobuf.Struct struct_field = 26; - - // wrapper types - google.protobuf.Int32Value int32_value_field = 27; - google.protobuf.StringValue string_value_field = 28; -} \ No newline at end of file diff --git a/src/connector/src/test_data/simple-schema b/src/connector/src/test_data/simple-schema deleted file mode 100644 index 97686ce9c478d..0000000000000 --- a/src/connector/src/test_data/simple-schema +++ /dev/null @@ -1,11 +0,0 @@ - -² -simple-schema.prototest"Œ - -TestRecord -id (Rid -address ( Raddress -city ( Rcity -zipcode (Rzipcode -rate (Rrate -date ( Rdatebproto3 \ No newline at end of file diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index e8a8efff68801..72e800e82a48b 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -115,7 +115,7 @@ ClickHouseConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: r#type field_type: String required: true @@ -143,7 +143,7 @@ DeltaLakeConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: r#type field_type: String required: true @@ -339,7 +339,7 @@ IcebergConfig: field_type: u64 comments: Commit every n(>0) checkpoints, default is 10. required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: create_table_if_not_exists field_type: bool required: false @@ -717,6 +717,10 @@ MqttConfig: field_type: usize comments: The maximum number of inflight messages. Defaults to 100 required: false + - name: max_packet_size + field_type: u32 + comments: The max size of messages received by the MQTT client + required: false - name: tls.client_key field_type: String comments: Path to CA certificate file for verifying the broker's key. @@ -1021,7 +1025,7 @@ StarrocksConfig: also, in this time, the `sink_decouple` option should be enabled as well. 
Defaults to 10 if commit_checkpoint_interval <= 0 required: false - default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL + default: DEFAULT_COMMIT_CHECKPOINT_INTERVAL_WITH_SINK_DECOUPLE - name: starrocks.partial_update field_type: String comments: Enable partial update diff --git a/src/connector/with_options_source.yaml b/src/connector/with_options_source.yaml index 695a2aeaa1c14..20ee03949396f 100644 --- a/src/connector/with_options_source.yaml +++ b/src/connector/with_options_source.yaml @@ -24,10 +24,6 @@ AzblobProperties: - name: refresh.interval.sec field_type: u64 required: false - - name: recursive_scan - field_type: bool - required: false - default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -79,10 +75,6 @@ GcsProperties: - name: refresh.interval.sec field_type: u64 required: false - - name: recursive_scan - field_type: bool - required: false - default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -493,6 +485,10 @@ MqttProperties: field_type: usize comments: The maximum number of inflight messages. Defaults to 100 required: false + - name: max_packet_size + field_type: u32 + comments: The max size of messages received by the MQTT client + required: false - name: tls.client_key field_type: String comments: Path to CA certificate file for verifying the broker's key. @@ -848,10 +844,6 @@ OpendalS3Properties: - name: refresh.interval.sec field_type: u64 required: false - - name: recursive_scan - field_type: bool - required: false - default: Default::default PosixFsProperties: fields: - name: posix_fs.root @@ -866,10 +858,6 @@ PosixFsProperties: - name: refresh.interval.sec field_type: u64 required: false - - name: recursive_scan - field_type: bool - required: false - default: Default::default - name: compression_format field_type: CompressionFormat required: false @@ -1016,6 +1004,17 @@ PulsarProperties: field_type: String required: false default: Default::default + - name: subscription.name.prefix + field_type: String + comments: |- + Specify a custom consumer group id prefix for the source. + Defaults to `rw-consumer`. + + Notes: + - Each job (materialized view) will have multiple subscriptions and + contains a generated suffix in the subscription name. + The subscription name will be `{subscription_name_prefix}-{fragment_id}-{actor_id}`. 
+ required: false S3Properties: fields: - name: s3.region_name diff --git a/src/ctl/src/cmd_impl/hummock/sst_dump.rs b/src/ctl/src/cmd_impl/hummock/sst_dump.rs index 03a7b35a85192..51b776ad1b2c2 100644 --- a/src/ctl/src/cmd_impl/hummock/sst_dump.rs +++ b/src/ctl/src/cmd_impl/hummock/sst_dump.rs @@ -228,14 +228,6 @@ pub async fn sst_dump_via_sstable_store( println!("Bloom Filter Size: {}", sstable_meta.bloom_filter.len()); println!("Key Count: {}", sstable_meta.key_count); println!("Version: {}", sstable_meta.version); - println!( - "Monotonoic Deletes Count: {}", - sstable_meta.monotonic_tombstone_events.len() - ); - for monotonic_delete in &sstable_meta.monotonic_tombstone_events { - println!("\tevent key: {:?}", monotonic_delete.event_key); - println!("\tnew epoch: {:?}", monotonic_delete.new_epoch); - } println!("Block Count: {}", sstable.block_count()); for i in 0..sstable.block_count() { diff --git a/src/ctl/src/cmd_impl/table/scan.rs b/src/ctl/src/cmd_impl/table/scan.rs index f5cee710a40fc..3002a3585fb41 100644 --- a/src/ctl/src/cmd_impl/table/scan.rs +++ b/src/ctl/src/cmd_impl/table/scan.rs @@ -55,19 +55,15 @@ pub fn print_table_catalog(table: &TableCatalog) { } pub async fn make_state_table(hummock: S, table: &TableCatalog) -> StateTable { - StateTable::new_with_distribution( + StateTable::from_table_catalog( + &table.to_internal_table_prost(), hummock, - table.id, - table - .columns() - .iter() - .map(|x| x.column_desc.clone()) - .collect(), - table.pk().iter().map(|x| x.order_type).collect(), - table.pk().iter().map(|x| x.column_index).collect(), - // TODO(var-vnode): use vnode count from table desc - TableDistribution::all(table.distribution_key().to_vec(), VirtualNode::COUNT), // scan all vnodes - Some(table.value_indices.clone()), + Some( + // scan all vnodes + TableDistribution::all(table.distribution_key().to_vec(), VirtualNode::COUNT) + .vnodes() + .clone(), + ), ) .await } diff --git a/src/expr/impl/Cargo.toml b/src/expr/impl/Cargo.toml index e493037c200b7..c0e506889ef77 100644 --- a/src/expr/impl/Cargo.toml +++ b/src/expr/impl/Cargo.toml @@ -51,7 +51,7 @@ itertools = { workspace = true } jsonbb = { workspace = true } linkme = { version = "0.3", features = ["used_linker"] } md5 = "0.7" -moka = { version = "0.12", features = ["sync"] } +moka = { version = "0.12.0", features = ["sync"] } num-traits = "0.2" openssl = "0.10" regex = "1" diff --git a/src/expr/impl/src/aggregate/approx_percentile.rs b/src/expr/impl/src/aggregate/approx_percentile.rs index 33e2a9969cdc9..98cf1bd3969fa 100644 --- a/src/expr/impl/src/aggregate/approx_percentile.rs +++ b/src/expr/impl/src/aggregate/approx_percentile.rs @@ -18,6 +18,7 @@ use std::ops::Range; use bytes::{Buf, Bytes}; use risingwave_common::array::*; +use risingwave_common::bail; use risingwave_common::row::Row; use risingwave_common::types::*; use risingwave_common_estimate_size::EstimateSize; @@ -38,6 +39,12 @@ fn build(agg: &AggCall) -> Result> { .literal() .map(|x| (*x.as_float64()).into()) .unwrap(); + if relative_error <= 0.0 || relative_error >= 1.0 { + bail!( + "relative_error must be in the range (0, 1), got {}", + relative_error + ) + } let base = (1.0 + relative_error) / (1.0 - relative_error); Ok(Box::new(ApproxPercentile { quantile, base })) } @@ -156,7 +163,8 @@ impl AggregateFunction for ApproxPercentile { // approximate quantile bucket on the fly. 
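// Hedged sketch of the validation added in `build` above: relative_error must
// lie strictly inside (0, 1), otherwise the bucket base (1 + e) / (1 - e) is
// not a usable growth factor (it would be <= 1 for e <= 0, and unbounded or
// negative for e >= 1).
fn bucket_base(relative_error: f64) -> Result<f64, String> {
    if relative_error <= 0.0 || relative_error >= 1.0 {
        return Err(format!(
            "relative_error must be in the range (0, 1), got {}",
            relative_error
        ));
    }
    Ok((1.0 + relative_error) / (1.0 - relative_error))
}

// e.g. bucket_base(0.01) ~= 1.0202, while bucket_base(0.0) and bucket_base(1.0)
// are rejected, matching the binder- and expr-level checks in this diff.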
async fn get_result(&self, state: &AggregateState) -> Result { let state = state.downcast_ref::(); - let quantile_count = (state.count as f64 * self.quantile).floor() as u64; + let quantile_count = + ((state.count.saturating_sub(1)) as f64 * self.quantile).floor() as u64; let mut acc_count = 0; for (bucket_id, count) in state.neg_buckets.iter().rev() { acc_count += count; diff --git a/src/expr/impl/src/scalar/array.rs b/src/expr/impl/src/scalar/array.rs index 7b7d272000597..863bd5eba62fe 100644 --- a/src/expr/impl/src/scalar/array.rs +++ b/src/expr/impl/src/scalar/array.rs @@ -153,10 +153,7 @@ fn map_contains(map: MapRef<'_>, key: ScalarRefImpl<'_>) -> Result int4")] fn map_length>(map: MapRef<'_>) -> Result { - map.inner() - .len() - .try_into() - .map_err(|_| ExprError::NumericOverflow) + map.len().try_into().map_err(|_| ExprError::NumericOverflow) } /// If both `m1` and `m2` have a value with the same key, then the output map contains the value from `m2`. diff --git a/src/expr/impl/src/udf/external.rs b/src/expr/impl/src/udf/external.rs index 5c400df26c179..0d6ba0e409386 100644 --- a/src/expr/impl/src/udf/external.rs +++ b/src/expr/impl/src/udf/external.rs @@ -25,6 +25,7 @@ use ginepro::{LoadBalancedChannel, ResolutionStrategy}; use risingwave_common::array::arrow::{ToArrow, UdfArrowConvert}; use risingwave_common::util::addr::HostAddr; use thiserror_ext::AsReport; +use tokio::runtime::Runtime; use super::*; @@ -174,9 +175,16 @@ fn get_or_create_flight_client(link: &str) -> Result> { // reuse existing client Ok(client) } else { + static RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("rw-udf") + .enable_all() + .build() + .expect("failed to build udf runtime") + }); // create new client let client = Arc::new(tokio::task::block_in_place(|| { - tokio::runtime::Handle::current().block_on(async { + RUNTIME.block_on(async { let channel = connect_tonic(link).await?; Ok(Client::new(channel).await?) as Result<_> }) diff --git a/src/frontend/planner_test/tests/testdata/input/agg.yaml b/src/frontend/planner_test/tests/testdata/input/agg.yaml index de22c4b5ee91b..daf6df4049416 100644 --- a/src/frontend/planner_test/tests/testdata/input/agg.yaml +++ b/src/frontend/planner_test/tests/testdata/input/agg.yaml @@ -1100,4 +1100,22 @@ expected_outputs: - logical_plan - batch_plan - - stream_plan \ No newline at end of file + - stream_plan +- name: test approx percentile with invalid relative_error + sql: | + CREATE TABLE t (v1 int); + SELECT approx_percentile(0.5, 0.0) WITHIN GROUP (order by v1) from t; + expected_outputs: + - binder_error +- name: test approx percentile with invalid relative_error 0.0 + sql: | + CREATE TABLE t (v1 int); + SELECT approx_percentile(0.5, 0.0) WITHIN GROUP (order by v1) from t; + expected_outputs: + - binder_error +- name: test approx percentile with invalid relative_error 1.0 with group by. 
+ sql: | + CREATE TABLE t (v1 int, v2 int); + SELECT approx_percentile(0.0, 1.0) WITHIN GROUP (order by v1) from t group by v2; + expected_outputs: + - binder_error \ No newline at end of file diff --git a/src/frontend/planner_test/tests/testdata/input/topn.yaml b/src/frontend/planner_test/tests/testdata/input/topn.yaml index 676ac327efc90..96036d4fb3cdf 100644 --- a/src/frontend/planner_test/tests/testdata/input/topn.yaml +++ b/src/frontend/planner_test/tests/testdata/input/topn.yaml @@ -16,3 +16,28 @@ SELECT * FROM t1_mv ORDER BY a DESC LIMIT 50 OFFSET 50; expected_outputs: - batch_plan +- sql: | + WITH c1(k, v) AS ( + VALUES + (1, 'foo'), + (2, 'bar') + ), + c2 AS ( + SELECT + *, + row_number() over ( + PARTITION by k + ORDER BY 1 + ) AS rn + FROM + c1 + ) + SELECT + count(*) + FROM + c2 + WHERE + rn <= 1; + expected_outputs: + - logical_plan + - optimized_logical_plan_for_batch diff --git a/src/frontend/planner_test/tests/testdata/output/agg.yaml b/src/frontend/planner_test/tests/testdata/output/agg.yaml index da2a391a8c603..424b58996fcbc 100644 --- a/src/frontend/planner_test/tests/testdata/output/agg.yaml +++ b/src/frontend/planner_test/tests/testdata/output/agg.yaml @@ -2198,3 +2198,30 @@ └─StreamLocalApproxPercentile { percentile_col: $expr1, quantile: 0.5:Float64, relative_error: 0.01:Float64 } └─StreamProject { exprs: [t.v1::Float64 as $expr1, t._row_id] } └─StreamTableScan { table: t, columns: [t.v1, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } +- name: test approx percentile with invalid relative_error + sql: | + CREATE TABLE t (v1 int); + SELECT approx_percentile(0.5, 0.0) WITHIN GROUP (order by v1) from t; + binder_error: | + Failed to bind expression: approx_percentile(0.5, 0.0) WITHIN GROUP (ORDER BY v1) + + Caused by: + relative_error=0 does not satisfy 0.0 < relative_error < 1.0 +- name: test approx percentile with invalid relative_error 0.0 + sql: | + CREATE TABLE t (v1 int); + SELECT approx_percentile(0.5, 0.0) WITHIN GROUP (order by v1) from t; + binder_error: | + Failed to bind expression: approx_percentile(0.5, 0.0) WITHIN GROUP (ORDER BY v1) + + Caused by: + relative_error=0 does not satisfy 0.0 < relative_error < 1.0 +- name: test approx percentile with invalid relative_error 1.0 with group by. 
+ sql: | + CREATE TABLE t (v1 int, v2 int); + SELECT approx_percentile(0.0, 1.0) WITHIN GROUP (order by v1) from t group by v2; + binder_error: | + Failed to bind expression: approx_percentile(0.0, 1.0) WITHIN GROUP (ORDER BY v1) + + Caused by: + relative_error=1 does not satisfy 0.0 < relative_error < 1.0 diff --git a/src/frontend/planner_test/tests/testdata/output/topn.yaml b/src/frontend/planner_test/tests/testdata/output/topn.yaml index 52e15c35a8f7f..42a7e61cda5a4 100644 --- a/src/frontend/planner_test/tests/testdata/output/topn.yaml +++ b/src/frontend/planner_test/tests/testdata/output/topn.yaml @@ -26,3 +26,41 @@ └─BatchExchange { order: [], dist: Single } └─BatchLimit { limit: 100, offset: 0 } └─BatchScan { table: t1_mv, columns: [t1_mv.pk, t1_mv.a, t1_mv.b, t1_mv.c, t1_mv.d], limit: 100, distribution: SomeShard } +- sql: | + WITH c1(k, v) AS ( + VALUES + (1, 'foo'), + (2, 'bar') + ), + c2 AS ( + SELECT + *, + row_number() over ( + PARTITION by k + ORDER BY 1 + ) AS rn + FROM + c1 + ) + SELECT + count(*) + FROM + c2 + WHERE + rn <= 1; + logical_plan: |- + LogicalProject { exprs: [count] } + └─LogicalAgg { aggs: [count] } + └─LogicalProject { exprs: [] } + └─LogicalFilter { predicate: (row_number <= 1:Int32) } + └─LogicalShare { id: 5 } + └─LogicalProject { exprs: [*VALUES*_0.column_0, *VALUES*_0.column_1, row_number] } + └─LogicalOverWindow { window_functions: [row_number() OVER(PARTITION BY *VALUES*_0.column_0 ORDER BY 1:Int32 ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─LogicalProject { exprs: [*VALUES*_0.column_0, *VALUES*_0.column_1, 1:Int32] } + └─LogicalShare { id: 1 } + └─LogicalValues { rows: [[1:Int32, 'foo':Varchar], [2:Int32, 'bar':Varchar]], schema: Schema { fields: [*VALUES*_0.column_0:Int32, *VALUES*_0.column_1:Varchar] } } + optimized_logical_plan_for_batch: |- + LogicalAgg { aggs: [count] } + └─LogicalTopN { order: [1:Int32 ASC], limit: 1, offset: 0, group_key: [*VALUES*_0.column_0] } + └─LogicalProject { exprs: [*VALUES*_0.column_0, 1:Int32] } + └─LogicalValues { rows: [[1:Int32], [2:Int32]], schema: Schema { fields: [*VALUES*_0.column_0:Int32] } } diff --git a/src/frontend/src/binder/expr/function/aggregate.rs b/src/frontend/src/binder/expr/function/aggregate.rs index a9067205f77b0..1899846f7a450 100644 --- a/src/frontend/src/binder/expr/function/aggregate.rs +++ b/src/frontend/src/binder/expr/function/aggregate.rs @@ -13,8 +13,8 @@ // limitations under the License. 
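// Small numeric illustration of the rewrite shown above: a predicate
// `rn <= limit` over `row_number() OVER (PARTITION BY k ...)` keeps at most
// `limit` rows per group, so the result size is bounded by
// min(limit * #groups, input_rows). The helper below is hypothetical and just
// states that bound, which is also what the cardinality visitor computes
// later in this diff.
fn group_topn_upper_bound(limit: usize, group_count: usize, input_rows: usize) -> usize {
    limit.saturating_mul(group_count).min(input_rows)
}

// With the two-row VALUES source above (2 groups, limit 1):
// group_topn_upper_bound(1, 2, 2) == 2, i.e. at most one row per distinct k.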
use itertools::Itertools; -use risingwave_common::bail_not_implemented; use risingwave_common::types::{DataType, ScalarImpl}; +use risingwave_common::{bail, bail_not_implemented}; use risingwave_expr::aggregate::{agg_kinds, AggKind, PbAggKind}; use risingwave_sqlparser::ast::{self, FunctionArgExpr}; @@ -158,6 +158,17 @@ impl Binder { 2 => { let relative_error = &mut direct_args[1]; decimal_to_float64(relative_error, kind)?; + if let Some(relative_error) = relative_error.as_literal() + && let Some(relative_error) = relative_error.get_data() + { + let relative_error = relative_error.as_float64().0; + if relative_error <= 0.0 || relative_error >= 1.0 { + bail!( + "relative_error={} does not satisfy 0.0 < relative_error < 1.0", + relative_error, + ) + } + } } 1 => { let relative_error: ExprImpl = Literal::new( diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs index e2bbcb486b926..3c60236f96e66 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_iceberg_snapshots.rs @@ -17,6 +17,7 @@ use std::ops::Deref; use iceberg::table::Table; use jsonbb::{Value, ValueRef}; use risingwave_common::types::{Fields, JsonbVal, Timestamptz}; +use risingwave_connector::error::ConnectorResult; use risingwave_connector::sink::iceberg::IcebergConfig; use risingwave_connector::source::ConnectorProperties; use risingwave_connector::WithPropertiesExt; @@ -62,25 +63,32 @@ async fn read(reader: &SysCatalogReaderImpl) -> Result> let iceberg_config: IcebergConfig = iceberg_properties.to_iceberg_config(); let table: Table = iceberg_config.load_table_v2().await?; - result.extend(table.metadata().snapshots().map(|snapshot| { - RwIcebergSnapshots { - source_id: source.id as i32, - schema_name: schema_name.clone(), - source_name: source.name.clone(), - sequence_number: snapshot.sequence_number(), - snapshot_id: snapshot.snapshot_id(), - timestamp_ms: Timestamptz::from_millis(snapshot.timestamp().timestamp_millis()), - manifest_list: snapshot.manifest_list().to_string(), - summary: Value::object( - snapshot - .summary() - .other - .iter() - .map(|(k, v)| (k.as_str(), ValueRef::String(v))), - ) - .into(), - } - })); + let snapshots: ConnectorResult> = table + .metadata() + .snapshots() + .map(|snapshot| { + Ok(RwIcebergSnapshots { + source_id: source.id as i32, + schema_name: schema_name.clone(), + source_name: source.name.clone(), + sequence_number: snapshot.sequence_number(), + snapshot_id: snapshot.snapshot_id(), + timestamp_ms: Timestamptz::from_millis( + snapshot.timestamp()?.timestamp_millis(), + ), + manifest_list: snapshot.manifest_list().to_string(), + summary: Value::object( + snapshot + .summary() + .other + .iter() + .map(|(k, v)| (k.as_str(), ValueRef::String(v))), + ) + .into(), + }) + }) + .collect(); + result.extend(snapshots?); } } Ok(result) diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index 5186c8322095d..f1535fa769b28 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -66,7 +66,7 @@ use risingwave_sqlparser::ast::{ get_delimiter, AstString, ColumnDef, ConnectorSchema, CreateSourceStatement, Encode, Format, ObjectName, ProtobufSchema, SourceWatermark, TableConstraint, }; -use risingwave_sqlparser::parser::IncludeOption; +use risingwave_sqlparser::parser::{IncludeOption, IncludeOptionItem}; use 
thiserror_ext::AsReport; use super::RwPgResponse; @@ -594,8 +594,43 @@ fn bind_columns_from_source_for_cdc( Ok((Some(columns), stream_source_info)) } +// check the additional column compatibility with the format and encode +fn check_additional_column_compatibility( + column_def: &IncludeOptionItem, + source_schema: Option<&ConnectorSchema>, +) -> Result<()> { + // only allow header column have inner field + if column_def.inner_field.is_some() + && !column_def + .column_type + .real_value() + .eq_ignore_ascii_case("header") + { + return Err(RwError::from(ProtocolError(format!( + "Only header column can have inner field, but got {:?}", + column_def.column_type.real_value(), + )))); + } + + // Payload column only allowed when encode is JSON + if let Some(schema) = source_schema + && column_def + .column_type + .real_value() + .eq_ignore_ascii_case("payload") + && !matches!(schema.row_encode, Encode::Json) + { + return Err(RwError::from(ProtocolError(format!( + "INCLUDE payload is only allowed when using ENCODE JSON, but got ENCODE {:?}", + schema.row_encode + )))); + } + Ok(()) +} + /// add connector-spec columns to the end of column catalog pub fn handle_addition_columns( + source_schema: Option<&ConnectorSchema>, with_properties: &BTreeMap, mut additional_columns: IncludeOption, columns: &mut Vec, @@ -619,17 +654,7 @@ pub fn handle_addition_columns( .unwrap(); // there must be at least one column in the column catalog while let Some(item) = additional_columns.pop() { - { - // only allow header column have inner field - if item.inner_field.is_some() - && !item.column_type.real_value().eq_ignore_ascii_case("header") - { - return Err(RwError::from(ProtocolError(format!( - "Only header column can have inner field, but got {:?}", - item.column_type.real_value(), - )))); - } - } + check_additional_column_compatibility(&item, source_schema)?; let data_type_name: Option = item .header_inner_expect_type @@ -1512,6 +1537,7 @@ pub async fn bind_create_source_or_table_with_connector( // add additional columns before bind pk, because `format upsert` requires the key column handle_addition_columns( + Some(&source_schema), &with_properties, include_column_options, &mut columns, diff --git a/src/frontend/src/handler/create_table.rs b/src/frontend/src/handler/create_table.rs index 6b3da5d001e60..e7b2b44226657 100644 --- a/src/frontend/src/handler/create_table.rs +++ b/src/frontend/src/handler/create_table.rs @@ -771,6 +771,7 @@ pub(crate) fn gen_create_table_plan_for_cdc_table( // append additional columns to the end handle_addition_columns( + None, &connect_properties, include_column_options, &mut columns, diff --git a/src/frontend/src/optimizer/plan_node/stream_sink.rs b/src/frontend/src/optimizer/plan_node/stream_sink.rs index 2717c454e6435..3e34475c8d4bb 100644 --- a/src/frontend/src/optimizer/plan_node/stream_sink.rs +++ b/src/frontend/src/optimizer/plan_node/stream_sink.rs @@ -212,7 +212,7 @@ impl StreamSink { partition_info: Option, ) -> Result { let columns = derive_columns(input.schema(), out_names, &user_cols)?; - let (input, sink) = Self::derive_sink_desc( + let (input, mut sink) = Self::derive_sink_desc( input, user_distributed_by, name, @@ -241,8 +241,11 @@ impl StreamSink { if connector == TABLE_SINK && sink.target_table.is_none() { unsupported_sink(TABLE_SINK) } else { + SinkType::set_default_commit_checkpoint_interval( + &mut sink, + &input.ctx().session_ctx().config().sink_decouple(), + )?; SinkType::is_sink_decouple( - &sink, &input.ctx().session_ctx().config().sink_decouple(), ) } diff 
--git a/src/frontend/src/optimizer/plan_visitor/cardinality_visitor.rs b/src/frontend/src/optimizer/plan_visitor/cardinality_visitor.rs index 07459b59b1d5f..b17a8318d2b1a 100644 --- a/src/frontend/src/optimizer/plan_visitor/cardinality_visitor.rs +++ b/src/frontend/src/optimizer/plan_visitor/cardinality_visitor.rs @@ -109,12 +109,22 @@ impl PlanVisitor for CardinalityVisitor { fn visit_logical_top_n(&mut self, plan: &plan_node::LogicalTopN) -> Cardinality { let input = self.visit(plan.input()); - match plan.limit_attr() { + let each_group = match plan.limit_attr() { TopNLimit::Simple(limit) => input.sub(plan.offset() as usize).min(limit as usize), TopNLimit::WithTies(limit) => { assert_eq!(plan.offset(), 0, "ties with offset is not supported yet"); input.min((limit as usize)..) } + }; + + if plan.group_key().is_empty() { + each_group + } else { + let group_number = input.min(1..); + each_group + .mul(group_number) + // the output cardinality will never be more than the input, thus `.min(input)` + .min(input) } } diff --git a/src/frontend/src/scheduler/plan_fragmenter.rs b/src/frontend/src/scheduler/plan_fragmenter.rs index 63b6eef38da71..6777f9373b841 100644 --- a/src/frontend/src/scheduler/plan_fragmenter.rs +++ b/src/frontend/src/scheduler/plan_fragmenter.rs @@ -310,11 +310,9 @@ impl SourceScanInfo { Ok(SourceScanInfo::Complete(split_info)) } ConnectorProperties::OpendalS3(prop) => { - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); - let lister: OpendalEnumerator = OpendalEnumerator::new_s3_source(prop.s3_properties, prop.assume_role)?; - let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); + let stream = build_opendal_fs_list_for_batch(lister); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res @@ -325,22 +323,18 @@ impl SourceScanInfo { Ok(SourceScanInfo::Complete(res)) } ConnectorProperties::Gcs(prop) => { - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); - let lister: OpendalEnumerator = OpendalEnumerator::new_gcs_source(*prop)?; - let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); + let stream = build_opendal_fs_list_for_batch(lister); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res.into_iter().map(SplitImpl::Gcs).collect_vec(); Ok(SourceScanInfo::Complete(res)) } ConnectorProperties::Azblob(prop) => { - let recursive_scan = prop.fs_common.recursive_scan.unwrap_or_default(); - let lister: OpendalEnumerator = OpendalEnumerator::new_azblob_source(*prop)?; - let stream = build_opendal_fs_list_for_batch(lister, recursive_scan); + let stream = build_opendal_fs_list_for_batch(lister); let batch_res: Vec<_> = stream.try_collect().await?; let res = batch_res.into_iter().map(SplitImpl::Azblob).collect_vec(); diff --git a/src/license/src/feature.rs b/src/license/src/feature.rs index 583ef93a45863..0b888986db5c2 100644 --- a/src/license/src/feature.rs +++ b/src/license/src/feature.rs @@ -57,7 +57,6 @@ macro_rules! for_all_features { { SqlServerCdcSource, Paid, "CDC source connector for Sql Server." }, { CdcAutoSchemaChange, Paid, "Auto replicate upstream DDL to CDC Table." }, { IcebergSinkWithGlue, Paid, "Delivering data to Iceberg with Glue catalog." 
}, - { FileSink, Paid, "Delivering data to object storage."}, } }; } diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index 577a0bef25360..c18ad5d0f2b3b 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -46,9 +46,7 @@ use super::trace::TracedEpoch; use crate::barrier::{GlobalBarrierManagerContext, InflightSubscriptionInfo}; use crate::manager::{DdlType, InflightFragmentInfo, MetadataManager, StreamingJob, WorkerId}; use crate::model::{ActorId, DispatcherId, FragmentId, TableFragments, TableParallelism}; -use crate::stream::{ - build_actor_connector_splits, validate_assignment, SplitAssignment, ThrottleConfig, -}; +use crate::stream::{build_actor_connector_splits, SplitAssignment, ThrottleConfig}; use crate::MetaResult; /// [`Reschedule`] is for the [`Command::RescheduleFragment`], which is used for rescheduling actors @@ -524,14 +522,9 @@ impl Command { } Command::SourceSplitAssignment(change) => { - let mut checked_assignment = change.clone(); - checked_assignment - .iter_mut() - .for_each(|(_, assignment)| validate_assignment(assignment)); - let mut diff = HashMap::new(); - for actor_splits in checked_assignment.values() { + for actor_splits in change.values() { diff.extend(actor_splits.clone()); } @@ -579,12 +572,7 @@ impl Command { }) .collect(); let added_actors = table_fragments.actor_ids(); - - let mut checked_split_assignment = split_assignment.clone(); - checked_split_assignment - .iter_mut() - .for_each(|(_, assignment)| validate_assignment(assignment)); - let actor_splits = checked_split_assignment + let actor_splits = split_assignment .values() .flat_map(build_actor_connector_splits) .collect(); @@ -790,10 +778,7 @@ impl Command { let mut actor_splits = HashMap::new(); for reschedule in reschedules.values() { - let mut checked_assignment = reschedule.actor_splits.clone(); - validate_assignment(&mut checked_assignment); - - for (actor_id, splits) in &checked_assignment { + for (actor_id, splits) in &reschedule.actor_splits { actor_splits.insert( *actor_id as ActorId, ConnectorSplits { diff --git a/src/meta/src/barrier/creating_job/mod.rs b/src/meta/src/barrier/creating_job/mod.rs index 6b22bf0004017..0e2a948b6eb4b 100644 --- a/src/meta/src/barrier/creating_job/mod.rs +++ b/src/meta/src/barrier/creating_job/mod.rs @@ -21,7 +21,6 @@ use std::mem::take; use std::sync::Arc; use std::time::Duration; -use itertools::Itertools; use prometheus::HistogramTimer; use risingwave_common::metrics::{LabelGuardedHistogram, LabelGuardedIntGauge}; use risingwave_common::util::epoch::Epoch; @@ -29,7 +28,7 @@ use risingwave_pb::common::WorkerNode; use risingwave_pb::ddl_service::DdlProgress; use risingwave_pb::hummock::HummockVersionStats; use risingwave_pb::stream_plan::barrier_mutation::Mutation; -use risingwave_pb::stream_service::{BarrierCompleteResponse, BuildActorInfo}; +use risingwave_pb::stream_service::BarrierCompleteResponse; use tracing::{debug, info}; use crate::barrier::command::CommandContext; @@ -96,24 +95,7 @@ impl CreatingStreamingJobControl { graph_info: InflightGraphInfo::new(fragment_info), backfill_epoch, pending_non_checkpoint_barriers: vec![], - initial_barrier_info: Some(( - actors_to_create - .into_iter() - .map(|(worker_id, actors)| { - ( - worker_id, - actors - .into_iter() - .map(|actor| BuildActorInfo { - actor: Some(actor), - related_subscriptions: Default::default(), - }) - .collect_vec(), - ) - }) - .collect(), - initial_mutation, - )), + initial_barrier_info: Some((actors_to_create, 
initial_mutation)), }, upstream_lag: metrics .snapshot_backfill_lag @@ -267,6 +249,8 @@ impl CreatingStreamingJobControl { graph_info, Some(graph_info), new_actors, + vec![], + vec![], )?; self.barrier_control.enqueue_epoch( prev_epoch.value().0, @@ -331,6 +315,8 @@ impl CreatingStreamingJobControl { graph_info, Some(graph_info), None, + vec![], + vec![], )?; self.barrier_control.enqueue_epoch( command_ctx.prev_epoch.value().0, @@ -377,6 +363,8 @@ impl CreatingStreamingJobControl { Some(graph_info) }, None, + vec![], + vec![], )?; let prev_epoch = command_ctx.prev_epoch.value().0; self.barrier_control.enqueue_epoch( diff --git a/src/meta/src/barrier/creating_job/status.rs b/src/meta/src/barrier/creating_job/status.rs index 59047a5255e0c..6f205d7ced99f 100644 --- a/src/meta/src/barrier/creating_job/status.rs +++ b/src/meta/src/barrier/creating_job/status.rs @@ -19,8 +19,8 @@ use std::sync::Arc; use risingwave_common::util::epoch::Epoch; use risingwave_pb::hummock::HummockVersionStats; use risingwave_pb::stream_plan::barrier_mutation::Mutation; +use risingwave_pb::stream_plan::StreamActor; use risingwave_pb::stream_service::barrier_complete_response::CreateMviewProgress; -use risingwave_pb::stream_service::BuildActorInfo; use crate::barrier::command::CommandContext; use crate::barrier::info::InflightGraphInfo; @@ -41,7 +41,7 @@ pub(super) enum CreatingStreamingJobStatus { pending_non_checkpoint_barriers: Vec, /// Info of the first barrier: (`actors_to_create`, `mutation`) /// Take the mutation out when injecting the first barrier - initial_barrier_info: Option<(HashMap>, Mutation)>, + initial_barrier_info: Option<(HashMap>, Mutation)>, }, ConsumingLogStore { graph_info: InflightGraphInfo, @@ -60,7 +60,7 @@ pub(super) struct CreatingJobInjectBarrierInfo { pub curr_epoch: TracedEpoch, pub prev_epoch: TracedEpoch, pub kind: BarrierKind, - pub new_actors: Option>>, + pub new_actors: Option>>, pub mutation: Option, } diff --git a/src/meta/src/barrier/mod.rs b/src/meta/src/barrier/mod.rs index fe1e04b1b7b5c..c1dfcaba9650d 100644 --- a/src/meta/src/barrier/mod.rs +++ b/src/meta/src/barrier/mod.rs @@ -833,7 +833,7 @@ impl GlobalBarrierManager { .on_new_worker_node_map(self.active_streaming_nodes.current()); self.checkpoint_control.creating_streaming_job_controls.values().for_each(|job| job.on_new_worker_node_map(self.active_streaming_nodes.current())); if let ActiveStreamingWorkerChange::Add(node) | ActiveStreamingWorkerChange::Update(node) = changed_worker { - self.control_stream_manager.add_worker(node).await; + self.control_stream_manager.add_worker(node, &self.state.inflight_subscription_info).await; } } diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs index 1863e86319c82..f74e6a23aa746 100644 --- a/src/meta/src/barrier/recovery.rs +++ b/src/meta/src/barrier/recovery.rs @@ -26,8 +26,7 @@ use risingwave_pb::meta::subscribe_response::{Info, Operation}; use risingwave_pb::meta::table_fragments::State; use risingwave_pb::meta::{PausedReason, Recovery}; use risingwave_pb::stream_plan::barrier_mutation::Mutation; -use risingwave_pb::stream_plan::AddMutation; -use risingwave_pb::stream_service::BuildActorInfo; +use risingwave_pb::stream_plan::{AddMutation, StreamActor}; use thiserror_ext::AsReport; use tokio::time::Instant; use tokio_retry::strategy::{jitter, ExponentialBackoff}; @@ -40,7 +39,6 @@ use crate::barrier::rpc::ControlStreamManager; use crate::barrier::schedule::ScheduledBarriers; use crate::barrier::state::BarrierManagerState; use 
crate::barrier::{BarrierKind, GlobalBarrierManager, GlobalBarrierManagerContext}; -use crate::controller::catalog::ReleaseContext; use crate::manager::{ActiveStreamingWorkerNodes, MetadataManager, WorkerId}; use crate::model::{MetadataModel, MigrationPlan, TableFragments, TableParallelism}; use crate::stream::{build_actor_connector_splits, RescheduleOptions, TableResizePolicy}; @@ -100,8 +98,7 @@ impl GlobalBarrierManagerContext { } MetadataManager::V2(mgr) => { mgr.catalog_controller.clean_dirty_subscription().await?; - let ReleaseContext { source_ids, .. } = - mgr.catalog_controller.clean_dirty_creating_jobs().await?; + let source_ids = mgr.catalog_controller.clean_dirty_creating_jobs().await?; // unregister cleaned sources. self.source_manager @@ -347,9 +344,21 @@ impl GlobalBarrierManager { let mut control_stream_manager = ControlStreamManager::new(self.context.clone()); + let subscription_info = InflightSubscriptionInfo { + mv_depended_subscriptions: self + .context + .metadata_manager + .get_mv_depended_subscriptions() + .await?, + }; + let reset_start_time = Instant::now(); control_stream_manager - .reset(version_id, active_streaming_nodes.current()) + .reset( + version_id, + &subscription_info, + active_streaming_nodes.current(), + ) .await .inspect_err(|err| { warn!(error = %err.as_report(), "reset compute nodes failed"); @@ -358,18 +367,10 @@ impl GlobalBarrierManager { self.context.sink_manager.reset().await; - let subscription_info = InflightSubscriptionInfo { - mv_depended_subscriptions: self - .context - .metadata_manager - .get_mv_depended_subscriptions() - .await?, - }; - // update and build all actors. let node_actors = self .context - .load_all_actors(&info, &subscription_info, &active_streaming_nodes) + .load_all_actors(&info, &active_streaming_nodes) .await .inspect_err(|err| { warn!(error = %err.as_report(), "update actors failed"); @@ -398,6 +399,8 @@ impl GlobalBarrierManager { &info, Some(&info), Some(node_actors), + vec![], + vec![], )?; debug!(?node_to_collect, "inject initial barrier"); while !node_to_collect.is_empty() { @@ -1095,18 +1098,14 @@ impl GlobalBarrierManagerContext { async fn load_all_actors( &self, info: &InflightGraphInfo, - subscription_info: &InflightSubscriptionInfo, active_nodes: &ActiveStreamingWorkerNodes, - ) -> MetaResult>> { + ) -> MetaResult>> { if info.actor_map.is_empty() { tracing::debug!("no actor to update, skipping."); return Ok(HashMap::new()); } - let all_node_actors = self - .metadata_manager - .all_node_actors(false, &subscription_info.mv_depended_subscriptions) - .await?; + let all_node_actors = self.metadata_manager.all_node_actors(false).await?; // Check if any actors were dropped after info resolved. 
if all_node_actors.iter().any(|(node_id, node_actors)| { diff --git a/src/meta/src/barrier/rpc.rs b/src/meta/src/barrier/rpc.rs index a3a2898c2782f..0177259486b72 100644 --- a/src/meta/src/barrier/rpc.rs +++ b/src/meta/src/barrier/rpc.rs @@ -27,13 +27,11 @@ use risingwave_common::util::tracing::TracingContext; use risingwave_hummock_sdk::HummockVersionId; use risingwave_pb::common::{ActorInfo, WorkerNode}; use risingwave_pb::stream_plan::barrier_mutation::Mutation; -use risingwave_pb::stream_plan::{Barrier, BarrierMutation}; -use risingwave_pb::stream_service::build_actor_info::SubscriptionIds; +use risingwave_pb::stream_plan::{Barrier, BarrierMutation, StreamActor, SubscriptionUpstreamInfo}; use risingwave_pb::stream_service::streaming_control_stream_request::RemovePartialGraphRequest; use risingwave_pb::stream_service::{ streaming_control_stream_request, streaming_control_stream_response, BarrierCompleteResponse, - BuildActorInfo, InjectBarrierRequest, StreamingControlStreamRequest, - StreamingControlStreamResponse, + InjectBarrierRequest, StreamingControlStreamRequest, StreamingControlStreamResponse, }; use rw_futures_util::pending_on_none; use thiserror_ext::AsReport; @@ -44,7 +42,7 @@ use tracing::{error, info, warn}; use uuid::Uuid; use super::command::CommandContext; -use super::{BarrierKind, GlobalBarrierManagerContext, TracedEpoch}; +use super::{BarrierKind, GlobalBarrierManagerContext, InflightSubscriptionInfo, TracedEpoch}; use crate::barrier::info::InflightGraphInfo; use crate::manager::WorkerId; use crate::{MetaError, MetaResult}; @@ -113,7 +111,11 @@ impl ControlStreamManager { } } - pub(super) async fn add_worker(&mut self, node: WorkerNode) { + pub(super) async fn add_worker( + &mut self, + node: WorkerNode, + subscription: &InflightSubscriptionInfo, + ) { if self.nodes.contains_key(&node.id) { warn!(id = node.id, host = ?node.host, "node already exists"); return; @@ -132,7 +134,11 @@ impl ControlStreamManager { for i in 1..=MAX_RETRY { match self .context - .new_control_stream_node(node.clone(), version_id) + .new_control_stream_node( + node.clone(), + version_id, + &subscription.mv_depended_subscriptions, + ) .await { Ok((stream_node, response_stream)) => { @@ -157,12 +163,17 @@ impl ControlStreamManager { pub(super) async fn reset( &mut self, version_id: HummockVersionId, + subscriptions: &InflightSubscriptionInfo, nodes: &HashMap, ) -> MetaResult<()> { let nodes = try_join_all(nodes.iter().map(|(worker_id, node)| async { let node = self .context - .new_control_stream_node(node.clone(), version_id) + .new_control_stream_node( + node.clone(), + version_id, + &subscriptions.mv_depended_subscriptions, + ) .await?; Result::<_, MetaError>::Ok((*worker_id, node)) })) @@ -264,49 +275,27 @@ impl ControlStreamManager { pre_applied_graph_info: &InflightGraphInfo, applied_graph_info: Option<&InflightGraphInfo>, ) -> MetaResult> { + let mutation = command_ctx.to_mutation(); + let subscriptions_to_add = if let Some(Mutation::Add(add)) = &mutation { + add.subscriptions_to_add.clone() + } else { + vec![] + }; + let subscriptions_to_remove = if let Some(Mutation::DropSubscriptions(drop)) = &mutation { + drop.info.clone() + } else { + vec![] + }; self.inject_barrier( None, - command_ctx.to_mutation(), + mutation, (&command_ctx.curr_epoch, &command_ctx.prev_epoch), &command_ctx.kind, pre_applied_graph_info, applied_graph_info, - command_ctx - .command - .actors_to_create() - .map(|actors_to_create| { - actors_to_create - .into_iter() - .map(|(worker_id, actors)| { - ( - worker_id, - 
actors - .into_iter() - .map(|actor| BuildActorInfo { - actor: Some(actor), - // TODO: consider subscriber of backfilling mv - related_subscriptions: command_ctx - .subscription_info - .mv_depended_subscriptions - .iter() - .map(|(table_id, subscriptions)| { - ( - table_id.table_id, - SubscriptionIds { - subscription_ids: subscriptions - .keys() - .cloned() - .collect(), - }, - ) - }) - .collect(), - }) - .collect_vec(), - ) - }) - .collect() - }), + command_ctx.command.actors_to_create(), + subscriptions_to_add, + subscriptions_to_remove, ) } @@ -318,7 +307,9 @@ impl ControlStreamManager { kind: &BarrierKind, pre_applied_graph_info: &InflightGraphInfo, applied_graph_info: Option<&InflightGraphInfo>, - mut new_actors: Option>>, + mut new_actors: Option>>, + subscriptions_to_add: Vec, + subscriptions_to_remove: Vec, ) -> MetaResult> { fail_point!("inject_barrier_err", |_| risingwave_common::bail!( "inject_barrier_err" @@ -352,7 +343,7 @@ impl ControlStreamManager { .flatten() .flat_map(|(worker_id, actor_infos)| { actor_infos.iter().map(|actor_info| ActorInfo { - actor_id: actor_info.actor.as_ref().unwrap().actor_id, + actor_id: actor_info.actor_id, host: self .nodes .get(worker_id) @@ -410,6 +401,8 @@ impl ControlStreamManager { .flatten() .flatten() .collect(), + subscriptions_to_add: subscriptions_to_add.clone(), + subscriptions_to_remove: subscriptions_to_remove.clone(), }, ), ), @@ -469,6 +462,7 @@ impl GlobalBarrierManagerContext { &self, node: WorkerNode, initial_version_id: HummockVersionId, + mv_depended_subscriptions: &HashMap>, ) -> MetaResult<( ControlStreamNode, BoxStream<'static, risingwave_rpc_client::error::Result>, @@ -478,7 +472,7 @@ impl GlobalBarrierManagerContext { .stream_client_pool() .get(&node) .await? - .start_streaming_control(initial_version_id) + .start_streaming_control(initial_version_id, mv_depended_subscriptions) .await?; Ok(( ControlStreamNode { diff --git a/src/meta/src/barrier/state.rs b/src/meta/src/barrier/state.rs index 6453f6930cb05..fa2ead1b1df05 100644 --- a/src/meta/src/barrier/state.rs +++ b/src/meta/src/barrier/state.rs @@ -28,7 +28,7 @@ pub struct BarrierManagerState { /// Inflight running actors info. pub(crate) inflight_graph_info: InflightGraphInfo, - inflight_subscription_info: InflightSubscriptionInfo, + pub(crate) inflight_subscription_info: InflightSubscriptionInfo, /// Whether the cluster is paused and the reason. paused_reason: Option, diff --git a/src/meta/src/controller/catalog.rs b/src/meta/src/controller/catalog.rs index ffab5160b5d9f..c9c3210dd6c67 100644 --- a/src/meta/src/controller/catalog.rs +++ b/src/meta/src/controller/catalog.rs @@ -40,7 +40,7 @@ use risingwave_pb::catalog::subscription::SubscriptionState; use risingwave_pb::catalog::table::PbTableType; use risingwave_pb::catalog::{ PbComment, PbConnection, PbDatabase, PbFunction, PbIndex, PbSchema, PbSecret, PbSink, PbSource, - PbSubscription, PbTable, PbView, + PbStreamJobStatus, PbSubscription, PbTable, PbView, }; use risingwave_pb::meta::cancel_creating_jobs_request::PbCreatingJobInfo; use risingwave_pb::meta::list_object_dependencies_response::PbObjectDependencies; @@ -728,11 +728,11 @@ impl CatalogController { } /// `clean_dirty_creating_jobs` cleans up creating jobs that are creating in Foreground mode or in Initial status. 
- pub async fn clean_dirty_creating_jobs(&self) -> MetaResult { + pub async fn clean_dirty_creating_jobs(&self) -> MetaResult> { let inner = self.inner.write().await; let txn = inner.db.begin().await?; - let mut dirty_objs: Vec = streaming_job::Entity::find() + let dirty_job_objs: Vec = streaming_job::Entity::find() .select_only() .column(streaming_job::Column::JobId) .columns([ @@ -755,36 +755,46 @@ impl CatalogController { let changed = Self::clean_dirty_sink_downstreams(&txn).await?; - if dirty_objs.is_empty() { + if dirty_job_objs.is_empty() { if changed { txn.commit().await?; } - return Ok(ReleaseContext::default()); + return Ok(vec![]); } - self.log_cleaned_dirty_jobs(&dirty_objs, &txn).await?; + self.log_cleaned_dirty_jobs(&dirty_job_objs, &txn).await?; - let dirty_job_ids = dirty_objs.iter().map(|obj| obj.oid).collect::>(); + let dirty_job_ids = dirty_job_objs.iter().map(|obj| obj.oid).collect::>(); // Filter out dummy objs for replacement. // todo: we'd better introduce a new dummy object type for replacement. - let all_dirty_table_ids = dirty_objs + let all_dirty_table_ids = dirty_job_objs .iter() .filter(|obj| obj.obj_type == ObjectType::Table) .map(|obj| obj.oid) .collect_vec(); - let dirty_table_ids: HashSet = Table::find() + let dirty_table_type_map: HashMap = Table::find() .select_only() .column(table::Column::TableId) + .column(table::Column::TableType) .filter(table::Column::TableId.is_in(all_dirty_table_ids)) - .into_tuple::() + .into_tuple::<(ObjectId, TableType)>() .all(&txn) .await? .into_iter() .collect(); - dirty_objs - .retain(|obj| obj.obj_type != ObjectType::Table || dirty_table_ids.contains(&obj.oid)); + + // Only notify delete for failed materialized views. + let dirty_mview_objs = dirty_job_objs + .into_iter() + .filter(|obj| { + matches!( + dirty_table_type_map.get(&obj.oid), + Some(TableType::MaterializedView) + ) + }) + .collect_vec(); let associated_source_ids: Vec = Table::find() .select_only() @@ -797,15 +807,16 @@ impl CatalogController { .into_tuple() .all(&txn) .await?; - let dirty_source_objs: Vec = Object::find() - .filter(object::Column::Oid.is_in(associated_source_ids.clone())) - .into_partial_model() + + let dirty_state_table_ids: Vec = Table::find() + .select_only() + .column(table::Column::TableId) + .filter(table::Column::BelongsToJobId.is_in(dirty_job_ids.clone())) + .into_tuple() .all(&txn) .await?; - dirty_objs.extend(dirty_source_objs); - let mut dirty_state_table_ids = vec![]; - let to_drop_internal_table_objs: Vec = Object::find() + let dirty_mview_internal_table_objs = Object::find() .select_only() .columns([ object::Column::Oid, @@ -814,17 +825,15 @@ impl CatalogController { object::Column::DatabaseId, ]) .join(JoinType::InnerJoin, object::Relation::Table.def()) - .filter(table::Column::BelongsToJobId.is_in(dirty_job_ids.clone())) + .filter(table::Column::BelongsToJobId.is_in(dirty_mview_objs.iter().map(|obj| obj.oid))) .into_partial_model() .all(&txn) .await?; - dirty_state_table_ids.extend(to_drop_internal_table_objs.iter().map(|obj| obj.oid)); - dirty_objs.extend(to_drop_internal_table_objs); let to_delete_objs: HashSet = dirty_job_ids .clone() .into_iter() - .chain(dirty_state_table_ids.clone().into_iter()) + .chain(dirty_state_table_ids.into_iter()) .chain(associated_source_ids.clone().into_iter()) .collect(); @@ -836,17 +845,18 @@ impl CatalogController { txn.commit().await?; - let relation_group = build_relation_group(dirty_objs); + let relation_group = build_relation_group( + dirty_mview_objs + .into_iter() + 
.chain(dirty_mview_internal_table_objs.into_iter()) + .collect_vec(), + ); let _version = self .notify_frontend(NotificationOperation::Delete, relation_group) .await; - Ok(ReleaseContext { - state_table_ids: dirty_state_table_ids, - source_ids: associated_source_ids, - ..Default::default() - }) + Ok(associated_source_ids) } async fn log_cleaned_dirty_jobs( @@ -3137,12 +3147,16 @@ impl CatalogControllerInner { Ok(table_ids) } - /// `list_tables` return all `CREATED` tables and internal tables that belong to `CREATED` streaming jobs. + /// `list_tables` return all `CREATED` tables, `CREATING` materialized views and internal tables that belong to them. async fn list_tables(&self) -> MetaResult> { let table_objs = Table::find() .find_also_related(Object) .join(JoinType::LeftJoin, object::Relation::StreamingJob.def()) - .filter(streaming_job::Column::JobStatus.eq(JobStatus::Created)) + .filter( + streaming_job::Column::JobStatus + .eq(JobStatus::Created) + .or(table::Column::TableType.eq(TableType::MaterializedView)), + ) .all(&self.db) .await?; @@ -3154,12 +3168,18 @@ impl CatalogControllerInner { .all(&self.db) .await?; + let job_ids: HashSet = table_objs + .iter() + .map(|(t, _)| t.table_id) + .chain(created_streaming_job_ids.iter().cloned()) + .collect(); + let internal_table_objs = Table::find() .find_also_related(Object) .filter( table::Column::TableType .eq(TableType::Internal) - .and(table::Column::BelongsToJobId.is_in(created_streaming_job_ids)), + .and(table::Column::BelongsToJobId.is_in(job_ids)), ) .all(&self.db) .await?; @@ -3167,7 +3187,19 @@ impl CatalogControllerInner { Ok(table_objs .into_iter() .chain(internal_table_objs.into_iter()) - .map(|(table, obj)| ObjectModel(table, obj.unwrap()).into()) + .map(|(table, obj)| { + // Correctly set the stream job status for creating materialized views and internal tables. + let is_created = created_streaming_job_ids.contains(&table.table_id) + || (table.table_type == TableType::Internal + && created_streaming_job_ids.contains(&table.belongs_to_job_id.unwrap())); + let mut pb_table: PbTable = ObjectModel(table, obj.unwrap()).into(); + pb_table.stream_job_status = if is_created { + PbStreamJobStatus::Created.into() + } else { + PbStreamJobStatus::Creating.into() + }; + pb_table + }) .collect()) } diff --git a/src/meta/src/controller/mod.rs b/src/meta/src/controller/mod.rs index e22b0f20ee86e..3e903802b86ee 100644 --- a/src/meta/src/controller/mod.rs +++ b/src/meta/src/controller/mod.rs @@ -150,7 +150,7 @@ impl From> for PbTable { Epoch::from_unix_millis(value.1.created_at.and_utc().timestamp_millis() as _).0, ), cleaned_by_watermark: value.0.cleaned_by_watermark, - stream_job_status: PbStreamJobStatus::Created as _, // todo: deprecate it. + stream_job_status: PbStreamJobStatus::Created as _, create_type: PbCreateType::Foreground as _, version: value.0.version.map(|v| v.to_protobuf()), optional_associated_source_id: value @@ -236,7 +236,7 @@ impl From> for PbSink { ), db_name: value.0.db_name, sink_from_name: value.0.sink_from_name, - stream_job_status: PbStreamJobStatus::Created as _, // todo: deprecate it. 
+ stream_job_status: PbStreamJobStatus::Created as _, format_desc: value.0.sink_format_desc.map(|desc| desc.to_protobuf()), target_table: value.0.target_table.map(|id| id as _), initialized_at_cluster_version: value.1.initialized_at_cluster_version, @@ -299,7 +299,7 @@ impl From> for PbIndex { created_at_epoch: Some( Epoch::from_unix_millis(value.1.created_at.and_utc().timestamp_millis() as _).0, ), - stream_job_status: PbStreamJobStatus::Created as _, // todo: deprecate it. + stream_job_status: PbStreamJobStatus::Created as _, initialized_at_cluster_version: value.1.initialized_at_cluster_version, created_at_cluster_version: value.1.created_at_cluster_version, } diff --git a/src/meta/src/controller/streaming_job.rs b/src/meta/src/controller/streaming_job.rs index fd12630fd1649..2121f0a096383 100644 --- a/src/meta/src/controller/streaming_job.rs +++ b/src/meta/src/controller/streaming_job.rs @@ -26,6 +26,7 @@ use risingwave_meta_model_v2::prelude::{ Actor, ActorDispatcher, Fragment, Index, Object, ObjectDependency, Sink, Source, StreamingJob as StreamingJobModel, Table, }; +use risingwave_meta_model_v2::table::TableType; use risingwave_meta_model_v2::{ actor, actor_dispatcher, fragment, index, object, object_dependency, sink, source, streaming_job, table, ActorId, ActorUpstreamActors, ColumnCatalogArray, CreateType, DatabaseId, @@ -208,9 +209,6 @@ impl CatalogController { sink.id = job_id as _; let sink_model: sink::ActiveModel = sink.clone().into(); Sink::insert(sink_model).exec(&txn).await?; - relations.push(Relation { - relation_info: Some(RelationInfo::Sink(sink.to_owned())), - }); } StreamingJob::Table(src, table, _) => { let job_id = Self::create_streaming_job_obj( @@ -242,15 +240,9 @@ impl CatalogController { ); let source: source::ActiveModel = src.clone().into(); Source::insert(source).exec(&txn).await?; - relations.push(Relation { - relation_info: Some(RelationInfo::Source(src.to_owned())), - }); } let table_model: table::ActiveModel = table.clone().into(); Table::insert(table_model).exec(&txn).await?; - relations.push(Relation { - relation_info: Some(RelationInfo::Table(table.to_owned())), - }); } StreamingJob::Index(index, table) => { ensure_object_id(ObjectType::Table, index.primary_table_id as _, &txn).await?; @@ -282,12 +274,6 @@ impl CatalogController { Table::insert(table_model).exec(&txn).await?; let index_model: index::ActiveModel = index.clone().into(); Index::insert(index_model).exec(&txn).await?; - relations.push(Relation { - relation_info: Some(RelationInfo::Table(table.to_owned())), - }); - relations.push(Relation { - relation_info: Some(RelationInfo::Index(index.to_owned())), - }); } StreamingJob::Source(src) => { let job_id = Self::create_streaming_job_obj( @@ -304,9 +290,6 @@ impl CatalogController { src.id = job_id as _; let source_model: source::ActiveModel = src.clone().into(); Source::insert(source_model).exec(&txn).await?; - relations.push(Relation { - relation_info: Some(RelationInfo::Source(src.to_owned())), - }); } } @@ -331,21 +314,23 @@ impl CatalogController { txn.commit().await?; - let _version = self - .notify_frontend( + if !relations.is_empty() { + self.notify_frontend( Operation::Add, Info::RelationGroup(RelationGroup { relations }), ) .await; + } Ok(()) } pub async fn create_internal_table_catalog( &self, - job_id: ObjectId, + job: &StreamingJob, mut internal_tables: Vec, ) -> MetaResult> { + let job_id = job.id() as ObjectId; let inner = self.inner.write().await; let txn = inner.db.begin().await?; let mut table_id_map = HashMap::new(); @@ 
-363,13 +348,14 @@ impl CatalogController { table.id = table_id as _; let mut table_model: table::ActiveModel = table.clone().into(); table_model.table_id = Set(table_id as _); - table_model.belongs_to_job_id = Set(Some(job_id as _)); + table_model.belongs_to_job_id = Set(Some(job_id)); table_model.fragment_id = NotSet; Table::insert(table_model).exec(&txn).await?; } txn.commit().await?; - let _version = self - .notify_frontend( + + if job.is_materialized_view() { + self.notify_frontend( Operation::Add, Info::RelationGroup(RelationGroup { relations: internal_tables @@ -381,6 +367,8 @@ impl CatalogController { }), ) .await; + } + Ok(table_id_map) } @@ -497,64 +485,52 @@ impl CatalogController { .all(&txn) .await?; - let associated_source_id: Option = Table::find_by_id(job_id) - .select_only() - .column(table::Column::OptionalAssociatedSourceId) - .filter(table::Column::OptionalAssociatedSourceId.is_not_null()) - .into_tuple() - .one(&txn) - .await?; - - // Get notification info + // Get the notification info if the job is a materialized view. + let table_obj = Table::find_by_id(job_id).one(&txn).await?; let mut objs = vec![]; - let obj: Option = Object::find_by_id(job_id) - .select_only() - .columns([ - object::Column::Oid, - object::Column::ObjType, - object::Column::SchemaId, - object::Column::DatabaseId, - ]) - .into_partial_model() - .one(&txn) - .await?; - let obj = obj.ok_or_else(|| MetaError::catalog_id_not_found("streaming job", job_id))?; - objs.push(obj); - let internal_table_objs: Vec = Object::find() - .select_only() - .columns([ - object::Column::Oid, - object::Column::ObjType, - object::Column::SchemaId, - object::Column::DatabaseId, - ]) - .join(JoinType::InnerJoin, object::Relation::Table.def()) - .filter(table::Column::BelongsToJobId.eq(job_id)) - .into_partial_model() - .all(&txn) - .await?; - objs.extend(internal_table_objs); - if let Some(source_id) = associated_source_id { - let source_obj = Object::find_by_id(source_id) + if let Some(table) = &table_obj + && table.table_type == TableType::MaterializedView + { + let obj: Option = Object::find_by_id(job_id) .select_only() - .column(object::Column::ObjType) + .columns([ + object::Column::Oid, + object::Column::ObjType, + object::Column::SchemaId, + object::Column::DatabaseId, + ]) .into_partial_model() .one(&txn) - .await? 
- .ok_or_else(|| MetaError::catalog_id_not_found("source", source_id))?; - objs.push(source_obj); + .await?; + let obj = + obj.ok_or_else(|| MetaError::catalog_id_not_found("streaming job", job_id))?; + objs.push(obj); + let internal_table_objs: Vec = Object::find() + .select_only() + .columns([ + object::Column::Oid, + object::Column::ObjType, + object::Column::SchemaId, + object::Column::DatabaseId, + ]) + .join(JoinType::InnerJoin, object::Relation::Table.def()) + .filter(table::Column::BelongsToJobId.eq(job_id)) + .into_partial_model() + .all(&txn) + .await?; + objs.extend(internal_table_objs); } - let relation_group = build_relation_group(objs); - // Can delete objects after queried notification info Object::delete_by_id(job_id).exec(&txn).await?; if !internal_table_ids.is_empty() { Object::delete_many() - .filter(object::Column::Oid.is_in(internal_table_ids.iter().cloned())) + .filter(object::Column::Oid.is_in(internal_table_ids)) .exec(&txn) .await?; } - if let Some(source_id) = associated_source_id { + if let Some(t) = &table_obj + && let Some(source_id) = t.optional_associated_source_id + { Object::delete_by_id(source_id).exec(&txn).await?; } @@ -576,9 +552,10 @@ impl CatalogController { } txn.commit().await?; - let _version = self - .notify_frontend(Operation::Delete, relation_group) - .await; + if !objs.is_empty() { + self.notify_frontend(Operation::Delete, build_relation_group(objs)) + .await; + } Ok(true) } @@ -778,6 +755,7 @@ impl CatalogController { )), }) .collect_vec(); + let mut notification_op = NotificationOperation::Add; match job_type { ObjectType::Table => { @@ -786,6 +764,10 @@ impl CatalogController { .one(&txn) .await? .ok_or_else(|| MetaError::catalog_id_not_found("table", job_id))?; + if table.table_type == TableType::MaterializedView { + notification_op = NotificationOperation::Update; + } + if let Some(source_id) = table.optional_associated_source_id { let (src, obj) = Source::find_by_id(source_id) .find_also_related(Object) @@ -892,7 +874,7 @@ impl CatalogController { let mut version = self .notify_frontend( - NotificationOperation::Update, + notification_op, NotificationInfo::RelationGroup(PbRelationGroup { relations }), ) .await; @@ -1465,6 +1447,7 @@ impl CatalogController { .exec(&txn) .await?; + // add new actors for ( PbStreamActor { actor_id, @@ -1572,6 +1555,23 @@ impl CatalogController { actor.update(&txn).await?; } + // Update actor_splits for existing actors + for (actor_id, splits) in actor_splits { + if new_created_actors.contains(&(actor_id as ActorId)) { + continue; + } + + let actor = Actor::find_by_id(actor_id as ActorId) + .one(&txn) + .await? 
+ .ok_or_else(|| MetaError::catalog_id_not_found("actor", actor_id))?; + + let mut actor = actor.into_active_model(); + let splits = splits.iter().map(PbConnectorSplit::from).collect_vec(); + actor.splits = Set(Some((&PbConnectorSplits { splits }).into())); + actor.update(&txn).await?; + } + // fragment update let fragment = Fragment::find_by_id(fragment_id) .one(&txn) diff --git a/src/meta/src/controller/utils.rs b/src/meta/src/controller/utils.rs index 2d517272a3d00..43fed3380d6bc 100644 --- a/src/meta/src/controller/utils.rs +++ b/src/meta/src/controller/utils.rs @@ -230,7 +230,7 @@ pub fn construct_sink_cycle_check_query( .to_owned() } -#[derive(Clone, DerivePartialModel, FromQueryResult)] +#[derive(Clone, DerivePartialModel, FromQueryResult, Debug)] #[sea_orm(entity = "Object")] pub struct PartialObject { pub oid: ObjectId, diff --git a/src/meta/src/hummock/compactor_manager.rs b/src/meta/src/hummock/compactor_manager.rs index 252f92c404015..f8d8ae7f23c4e 100644 --- a/src/meta/src/hummock/compactor_manager.rs +++ b/src/meta/src/hummock/compactor_manager.rs @@ -476,16 +476,18 @@ impl CompactorManager { #[cfg(test)] mod tests { + use std::sync::Arc; use std::time::Duration; use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_pb::hummock::CompactTaskProgress; + use risingwave_rpc_client::HummockMetaClient; use crate::hummock::compaction::selector::default_compaction_selector; use crate::hummock::test_utils::{ add_ssts, register_table_ids_to_compaction_group, setup_compute_env, }; - use crate::hummock::CompactorManager; + use crate::hummock::{CompactorManager, MockHummockMetaClient}; #[tokio::test] async fn test_compactor_manager() { @@ -493,6 +495,9 @@ mod tests { let (env, context_id) = { let (env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new( + MockHummockMetaClient::new(hummock_manager.clone(), worker_node.id), + ); let compactor_manager = hummock_manager.compactor_manager_ref_for_test(); register_table_ids_to_compaction_group( hummock_manager.as_ref(), @@ -500,7 +505,8 @@ mod tests { StaticCompactionGroupId::StateDefault.into(), ) .await; - let _sst_infos = add_ssts(1, hummock_manager.as_ref(), context_id).await; + let _sst_infos = + add_ssts(1, hummock_manager.as_ref(), hummock_meta_client.clone()).await; let _receiver = compactor_manager.add_compactor(context_id); hummock_manager .get_compact_task( diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs index e92e91c8503d0..8c021509dcbb2 100644 --- a/src/meta/src/hummock/manager/commit_epoch.rs +++ b/src/meta/src/hummock/manager/commit_epoch.rs @@ -70,37 +70,6 @@ pub struct CommitEpochInfo { } impl HummockManager { - #[cfg(any(test, feature = "test"))] - pub async fn commit_epoch_for_test( - &self, - epoch: u64, - sstables: Vec>, - sst_to_context: HashMap, - ) -> Result<()> { - let tables = self - .versioning - .read() - .await - .current_version - .state_table_info - .info() - .keys() - .cloned() - .collect(); - let info = CommitEpochInfo { - sstables: sstables.into_iter().map(Into::into).collect(), - new_table_watermarks: HashMap::new(), - sst_to_context, - new_table_fragment_info: NewTableFragmentInfo::None, - change_log_delta: HashMap::new(), - committed_epoch: epoch, - tables_to_commit: tables, - is_visible_table_committed_epoch: true, - }; - self.commit_epoch(info).await?; - Ok(()) - } - /// Caller should ensure `epoch` > 
`max_committed_epoch` pub async fn commit_epoch( &self, diff --git a/src/meta/src/hummock/manager/gc.rs b/src/meta/src/hummock/manager/gc.rs index 97a99945bcf41..596c36857907f 100644 --- a/src/meta/src/hummock/manager/gc.rs +++ b/src/meta/src/hummock/manager/gc.rs @@ -331,16 +331,24 @@ mod tests { use std::time::Duration; use itertools::Itertools; + use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_hummock_sdk::HummockSstableObjectId; + use risingwave_rpc_client::HummockMetaClient; use super::ResponseEvent; use crate::hummock::test_utils::{add_test_tables, setup_compute_env}; + use crate::hummock::MockHummockMetaClient; use crate::MetaOpts; #[tokio::test] async fn test_full_gc() { let (mut env, hummock_manager, cluster_manager, worker_node) = setup_compute_env(80).await; let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); let compactor_manager = hummock_manager.compactor_manager_ref_for_test(); // Use smaller spin interval to accelerate test. env.opts = Arc::new(MetaOpts { @@ -426,7 +434,12 @@ mod tests { ); // All committed SST ids should be excluded from GC. - let sst_infos = add_test_tables(hummock_manager.as_ref(), context_id).await; + let sst_infos = add_test_tables( + hummock_manager.as_ref(), + hummock_meta_client.clone(), + compaction_group_id, + ) + .await; let committed_object_ids = sst_infos .into_iter() .flatten() diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index d0183d84d23c5..09d43bf5fc72c 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -13,38 +13,41 @@ // limitations under the License. 
#![cfg(test)] - -use std::borrow::Borrow; use std::cmp::Ordering; use std::collections::HashMap; +use std::sync::Arc; use itertools::Itertools; use prometheus::Registry; use risingwave_common::catalog::TableId; +use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::{test_epoch, EpochExt, INVALID_EPOCH}; use risingwave_hummock_sdk::compact::compact_task_to_string; use risingwave_hummock_sdk::compact_task::CompactTask; use risingwave_hummock_sdk::compaction_group::hummock_version_ext::get_compaction_group_ssts; use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; +use risingwave_hummock_sdk::key::{gen_key_from_str, FullKey}; use risingwave_hummock_sdk::key_range::KeyRange; use risingwave_hummock_sdk::sstable_info::SstableInfo; use risingwave_hummock_sdk::table_stats::{to_prost_table_stats_map, TableStats, TableStatsMap}; use risingwave_hummock_sdk::version::HummockVersion; use risingwave_hummock_sdk::{ CompactionGroupId, HummockContextId, HummockEpoch, HummockSstableObjectId, HummockVersionId, - LocalSstableInfo, FIRST_VERSION_ID, + LocalSstableInfo, SyncResult, FIRST_VERSION_ID, }; use risingwave_pb::common::{HostAddress, WorkerType}; use risingwave_pb::hummock::compact_task::TaskStatus; use risingwave_pb::hummock::{HummockPinnedSnapshot, HummockPinnedVersion, HummockSnapshot}; use risingwave_pb::meta::add_worker_node_request::Property; +use risingwave_rpc_client::HummockMetaClient; +use thiserror_ext::AsReport; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; use crate::hummock::compaction::selector::{default_compaction_selector, ManualCompactionOption}; use crate::hummock::error::Error; use crate::hummock::test_utils::*; -use crate::hummock::{HummockManager, HummockManagerRef}; -use crate::manager::{MetaSrvEnv, MetaStoreImpl, WorkerId}; +use crate::hummock::{HummockManagerRef, MockHummockMetaClient}; +use crate::manager::{MetaSrvEnv, MetaStoreImpl}; use crate::model::MetadataModel; use crate::rpc::metrics::MetaMetrics; @@ -59,12 +62,23 @@ fn pin_snapshots_epoch(pin_snapshots: &[HummockPinnedSnapshot]) -> Vec { .collect_vec() } -fn gen_sstable_info(sst_id: u64, idx: usize, table_ids: Vec) -> SstableInfo { +fn gen_sstable_info(sst_id: u64, table_ids: Vec, epoch: u64) -> SstableInfo { + let table_key_l = gen_key_from_str(VirtualNode::ZERO, "1"); + let table_key_r = gen_key_from_str(VirtualNode::MAX_FOR_TEST, "1"); + let full_key_l = FullKey::for_test( + TableId::new(*table_ids.first().unwrap()), + table_key_l, + epoch, + ) + .encode(); + let full_key_r = + FullKey::for_test(TableId::new(*table_ids.last().unwrap()), table_key_r, epoch).encode(); + SstableInfo { sst_id, key_range: KeyRange { - left: iterator_test_key_of_epoch(1, idx, 1).into(), - right: iterator_test_key_of_epoch(1, idx, 1).into(), + left: full_key_l.into(), + right: full_key_r.into(), right_exclusive: false, }, table_ids, @@ -77,9 +91,9 @@ fn gen_sstable_info(sst_id: u64, idx: usize, table_ids: Vec) -> SstableInfo } } -fn gen_local_sstable_info(sst_id: u64, idx: usize, table_ids: Vec) -> LocalSstableInfo { +fn gen_local_sstable_info(sst_id: u64, table_ids: Vec, epoch: u64) -> LocalSstableInfo { LocalSstableInfo { - sst_info: gen_sstable_info(sst_id, idx, table_ids), + sst_info: gen_sstable_info(sst_id, table_ids, epoch), table_stats: Default::default(), } } @@ -181,8 +195,12 @@ async fn test_unpin_snapshot_before() { #[tokio::test] async fn test_hummock_compaction_task() { - let (_, hummock_manager, _, _worker_node) = setup_compute_env(80).await; + let (_, 
hummock_manager, _, worker_node) = setup_compute_env(80).await; let sst_num = 2; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); // No compaction task available. assert!(hummock_manager @@ -196,9 +214,10 @@ async fn test_hummock_compaction_task() { // Add some sstables and commit. let epoch = test_epoch(1); + let table_id = 1; let original_tables = generate_test_sstables_with_table_id( epoch, - 1, + table_id, get_sst_ids(&hummock_manager, sst_num).await, ); register_sstable_infos_to_compaction_group( @@ -207,20 +226,23 @@ async fn test_hummock_compaction_task() { StaticCompactionGroupId::StateDefault.into(), ) .await; - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&original_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&original_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); // Get a compaction task. + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), table_id).await; let compact_task = hummock_manager - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap() .unwrap(); @@ -235,10 +257,7 @@ async fn test_hummock_compaction_task() { // Get a compaction task. let compact_task = hummock_manager - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap() .unwrap(); @@ -253,7 +272,11 @@ async fn test_hummock_compaction_task() { #[tokio::test] async fn test_hummock_table() { - let (_env, hummock_manager, _cluster_manager, _worker_node) = setup_compute_env(80).await; + let (_env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let epoch = test_epoch(1); let original_tables = generate_test_tables(epoch, get_sst_ids(&hummock_manager, 2).await); @@ -263,17 +286,21 @@ async fn test_hummock_table() { StaticCompactionGroupId::StateDefault.into(), ) .await; - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&original_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&original_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); let pinned_version = hummock_manager.get_current_version().await; - let levels = - pinned_version.get_compaction_group_levels(StaticCompactionGroupId::StateDefault.into()); + let levels = pinned_version.get_compaction_group_levels(compaction_group_id); assert_eq!( Ordering::Equal, levels @@ -290,14 +317,18 @@ async fn test_hummock_table() { // Confirm tables got are equal to original tables assert_eq!( get_sorted_object_ids(&original_tables), - get_sorted_committed_object_ids(&pinned_version) + get_sorted_committed_object_ids(&pinned_version, compaction_group_id) ); } #[tokio::test] async fn test_hummock_transaction() { - let (_env, hummock_manager, _cluster_manager, _worker_node) = setup_compute_env(80).await; + let (_env, hummock_manager, _cluster_manager, 
worker_node) = setup_compute_env(80).await; let mut committed_tables = vec![]; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); // Add and commit tables in epoch1. // BEFORE: committed_epochs = [] @@ -318,24 +349,30 @@ async fn test_hummock_transaction() { current_version.visible_table_committed_epoch(), INVALID_EPOCH ); - assert!(get_sorted_committed_object_ids(¤t_version).is_empty()); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + assert!(get_sorted_committed_object_ids(¤t_version, compaction_group_id).is_empty()); // Commit epoch1 - commit_from_meta_node( - hummock_manager.borrow(), - epoch1, - to_local_sstable_info(&tables_in_epoch1), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch1, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&tables_in_epoch1), + ..Default::default() + }, + false, + ) + .await + .unwrap(); committed_tables.extend(tables_in_epoch1.clone()); // Get tables after committing epoch1. All tables committed in epoch1 should be returned let current_version = hummock_manager.get_current_version().await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); assert_eq!(current_version.visible_table_committed_epoch(), epoch1); assert_eq!( get_sorted_object_ids(&committed_tables), - get_sorted_committed_object_ids(¤t_version) + get_sorted_committed_object_ids(¤t_version, compaction_group_id) ); } @@ -355,29 +392,35 @@ async fn test_hummock_transaction() { // Get tables before committing epoch2. tables_in_epoch1 should be returned and // tables_in_epoch2 should be invisible. let current_version = hummock_manager.get_current_version().await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); assert_eq!(current_version.visible_table_committed_epoch(), epoch1); assert_eq!( get_sorted_object_ids(&committed_tables), - get_sorted_committed_object_ids(¤t_version) + get_sorted_committed_object_ids(¤t_version, compaction_group_id) ); // Commit epoch2 - commit_from_meta_node( - hummock_manager.borrow(), - epoch2, - to_local_sstable_info(&tables_in_epoch2), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch2, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&tables_in_epoch2), + ..Default::default() + }, + false, + ) + .await + .unwrap(); committed_tables.extend(tables_in_epoch2); // Get tables after committing epoch2. 
tables_in_epoch1 and tables_in_epoch2 should be // returned let current_version = hummock_manager.get_current_version().await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); assert_eq!(current_version.visible_table_committed_epoch(), epoch2); assert_eq!( get_sorted_object_ids(&committed_tables), - get_sorted_committed_object_ids(¤t_version) + get_sorted_committed_object_ids(¤t_version, compaction_group_id) ); } } @@ -469,6 +512,10 @@ async fn test_context_id_validation() { async fn test_hummock_manager_basic() { let (_env, hummock_manager, cluster_manager, worker_node) = setup_compute_env(1).await; let context_id_1 = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let fake_host_address_2 = HostAddress { host: "127.0.0.1".to_string(), @@ -501,7 +548,9 @@ async fn test_hummock_manager_basic() { let mut epoch = test_epoch(1); let mut register_log_count = 0; let mut commit_log_count = 0; - let commit_one = |epoch: HummockEpoch, hummock_manager: HummockManagerRef| async move { + let commit_one = |epoch: HummockEpoch, + hummock_manager: HummockManagerRef, + hummock_meta_client: Arc| async move { let original_tables = generate_test_tables(test_epoch(epoch), get_sst_ids(&hummock_manager, 2).await); register_sstable_infos_to_compaction_group( @@ -510,16 +559,21 @@ async fn test_hummock_manager_basic() { StaticCompactionGroupId::StateDefault.into(), ) .await; - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&original_tables), - ) - .await - .unwrap(); + + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&original_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); }; - commit_one(epoch, hummock_manager.clone()).await; + commit_one(epoch, hummock_manager.clone(), hummock_meta_client.clone()).await; register_log_count += 1; commit_log_count += 1; epoch.inc_epoch(); @@ -559,7 +613,7 @@ async fn test_hummock_manager_basic() { ); } - commit_one(epoch, hummock_manager.clone()).await; + commit_one(epoch, hummock_manager.clone(), hummock_meta_client.clone()).await; commit_log_count += 1; register_log_count += 1; @@ -618,6 +672,10 @@ async fn test_hummock_manager_basic() { #[tokio::test] async fn test_pin_snapshot_response_lost() { let (_env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let context_id = worker_node.id; let mut epoch = test_epoch(1); @@ -629,13 +687,17 @@ async fn test_pin_snapshot_response_lost() { ) .await; // [ ] -> [ e0 ] - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&test_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&test_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); epoch.inc_epoch(); // Pin a snapshot with smallest last_pin @@ -652,13 +714,17 @@ async fn test_pin_snapshot_response_lost() { ) .await; // [ e0:pinned ] -> [ e0:pinned, e1 ] - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&test_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&test_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); 
epoch.inc_epoch(); // Assume the response of the previous rpc is lost. @@ -683,13 +749,17 @@ async fn test_pin_snapshot_response_lost() { ) .await; // [ e0, e1:pinned ] -> [ e0, e1:pinned, e2 ] - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&test_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&test_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); epoch.inc_epoch(); // Use correct snapshot id. @@ -708,13 +778,17 @@ async fn test_pin_snapshot_response_lost() { ) .await; // [ e0, e1:pinned, e2:pinned ] -> [ e0, e1:pinned, e2:pinned, e3 ] - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&test_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&test_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); epoch.inc_epoch(); // Use u64::MAX as epoch to pin greatest snapshot @@ -728,31 +802,37 @@ async fn test_pin_snapshot_response_lost() { #[tokio::test] async fn test_print_compact_task() { - let (_, hummock_manager, _cluster_manager, _) = setup_compute_env(80).await; + let (_, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); // Add some sstables and commit. let epoch = test_epoch(1); let original_tables = generate_test_sstables_with_table_id(epoch, 1, get_sst_ids(&hummock_manager, 2).await); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); register_sstable_infos_to_compaction_group( &hummock_manager, &original_tables, - StaticCompactionGroupId::StateDefault.into(), + compaction_group_id, ) .await; - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&original_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&original_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); // Get a compaction task. let compact_task = hummock_manager - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap() .unwrap(); @@ -765,33 +845,45 @@ async fn test_print_compact_task() { #[tokio::test] async fn test_invalid_sst_id() { let (_, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let epoch = test_epoch(1); let ssts = generate_test_tables(epoch, vec![1]); - register_sstable_infos_to_compaction_group( - &hummock_manager, - &ssts, - StaticCompactionGroupId::StateDefault.into(), - ) - .await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + register_sstable_infos_to_compaction_group(&hummock_manager, &ssts, compaction_group_id).await; let ssts = to_local_sstable_info(&ssts); // reject due to invalid context id - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. 
}| (sst_info.object_id, WorkerId::MAX)) - .collect(); - let error = hummock_manager - .commit_epoch_for_test(epoch, ssts.clone(), sst_to_worker) - .await - .unwrap_err(); - assert!(matches!(error, Error::InvalidSst(1))); + { + let hummock_meta_client: Arc = + Arc::new(MockHummockMetaClient::new(hummock_manager.clone(), 23333)); + let error = hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: ssts.clone(), + ..Default::default() + }, + false, + ) + .await + .unwrap_err(); + assert_eq!( + error.as_report().to_string(), + "mock error: SST 1 is invalid" + ); + } - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. }| (sst_info.object_id, context_id)) - .collect(); - hummock_manager - .commit_epoch_for_test(epoch, ssts, sst_to_worker) + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: ssts.clone(), + ..Default::default() + }, + false, + ) .await .unwrap(); } @@ -799,6 +891,10 @@ async fn test_invalid_sst_id() { #[tokio::test] async fn test_trigger_manual_compaction() { let (_, hummock_manager, _, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let context_id = worker_node.id; { @@ -831,7 +927,13 @@ async fn test_trigger_manual_compaction() { } // Generate data for compaction task - let _ = add_test_tables(&hummock_manager, context_id).await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + let _ = add_test_tables( + &hummock_manager, + hummock_meta_client.clone(), + compaction_group_id, + ) + .await; { // to check compactor send task fail drop(receiver); @@ -879,6 +981,10 @@ async fn test_hummock_compaction_task_heartbeat() { use crate::hummock::HummockManager; let (_env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let context_id = worker_node.id; let sst_num = 2; @@ -910,13 +1016,17 @@ async fn test_hummock_compaction_task_heartbeat() { StaticCompactionGroupId::StateDefault.into(), ) .await; - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&original_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&original_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); // Get a compaction task. let compact_task = hummock_manager @@ -992,6 +1102,10 @@ async fn test_hummock_compaction_task_heartbeat_removal_on_node_removal() { use crate::hummock::HummockManager; let (_env, hummock_manager, cluster_manager, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let context_id = worker_node.id; let sst_num = 2; @@ -1023,13 +1137,17 @@ async fn test_hummock_compaction_task_heartbeat_removal_on_node_removal() { StaticCompactionGroupId::StateDefault.into(), ) .await; - commit_from_meta_node( - hummock_manager.borrow(), - epoch, - to_local_sstable_info(&original_tables), - ) - .await - .unwrap(); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: to_local_sstable_info(&original_tables), + ..Default::default() + }, + false, + ) + .await + .unwrap(); // Get a compaction task. 
let compact_task = hummock_manager @@ -1071,8 +1189,18 @@ async fn test_hummock_compaction_task_heartbeat_removal_on_node_removal() { async fn test_extend_objects_to_delete() { let (_env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let _pinned_version1 = hummock_manager.pin_version(context_id).await.unwrap(); - let sst_infos = add_test_tables(hummock_manager.as_ref(), context_id).await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + let sst_infos = add_test_tables( + hummock_manager.as_ref(), + hummock_meta_client.clone(), + compaction_group_id, + ) + .await; let max_committed_object_id = sst_infos .iter() .map(|ssts| { @@ -1150,11 +1278,14 @@ async fn test_extend_objects_to_delete() { let objects_to_delete = hummock_manager.get_objects_to_delete(); assert_eq!(objects_to_delete.len(), orphan_sst_num as usize); let new_epoch = pinned_version2.visible_table_committed_epoch().next_epoch(); - hummock_manager - .commit_epoch_for_test( + hummock_meta_client + .commit_epoch( new_epoch, - Vec::::new(), - Default::default(), + SyncResult { + uncommitted_ssts: vec![], + ..Default::default() + }, + false, ) .await .unwrap(); @@ -1179,6 +1310,11 @@ async fn test_extend_objects_to_delete() { #[tokio::test] async fn test_version_stats() { let (_env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); + let init_stats = hummock_manager.get_version_stats().await; assert!(init_stats.table_stats.is_empty()); @@ -1222,12 +1358,15 @@ async fn test_version_stats() { .collect(), }) .collect_vec(); - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. 
}| (sst_info.object_id, worker_node.id)) - .collect(); - hummock_manager - .commit_epoch_for_test(epoch, ssts, sst_to_worker) + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: ssts, + ..Default::default() + }, + false, + ) .await .unwrap(); @@ -1246,11 +1385,6 @@ async fn test_version_stats() { assert_eq!(table3_stats.total_value_size, 100); assert_eq!(table3_stats.total_key_size, 1000); - // Report compaction - hummock_manager - .compactor_manager_ref_for_test() - .add_compactor(worker_node.id); - let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), @@ -1305,13 +1439,12 @@ async fn test_version_stats() { #[tokio::test] async fn test_split_compaction_group_on_commit() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 3)]) + .register_table_ids_for_test(&[(100, 2), (101, 3)]) .await .unwrap(); let sst_1 = LocalSstableInfo { @@ -1343,8 +1476,15 @@ async fn test_split_compaction_group_on_commit() { ), ]), }; - hummock_manager - .commit_epoch_for_test(30, vec![sst_1], HashMap::from([(10, context_id)])) + hummock_meta_client + .commit_epoch( + test_epoch(30), + SyncResult { + uncommitted_ssts: vec![sst_1], + ..Default::default() + }, + false, + ) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; @@ -1380,7 +1520,10 @@ async fn test_split_compaction_group_on_commit() { #[tokio::test] async fn test_split_compaction_group_on_demand_basic() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let original_groups = hummock_manager .get_current_version() .await @@ -1407,11 +1550,7 @@ async fn test_split_compaction_group_on_demand_basic() { ); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) + .register_table_ids_for_test(&[(100, 2), (101, 2)]) .await .unwrap(); let sst_1 = LocalSstableInfo { @@ -1450,17 +1589,21 @@ async fn test_split_compaction_group_on_demand_basic() { }, table_stats: Default::default(), }; - hummock_manager - .commit_epoch_for_test( - 30, - vec![sst_1, sst_2], - HashMap::from([(10, context_id), (11, context_id)]), + hummock_meta_client + .commit_epoch( + test_epoch(30), + SyncResult { + uncommitted_ssts: vec![sst_1, sst_2], + ..Default::default() + }, + false, ) .await .unwrap(); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); let err = hummock_manager - .split_compaction_group(2, &[100, 101], 0) + .split_compaction_group(compaction_group_id, &[100, 101], 0) .await .unwrap_err(); assert_eq!( @@ -1476,25 +1619,29 @@ async fn test_split_compaction_group_on_demand_basic() { .unwrap(); hummock_manager - .split_compaction_group(2, &[100, 101], 0) + .split_compaction_group(compaction_group_id, &[100, 101], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; assert_eq!(current_version.levels.len(), 3); - let new_group_id = current_version.levels.keys().max().cloned().unwrap(); - assert!(new_group_id > 
StaticCompactionGroupId::End as u64); + let new_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; + assert!(new_compaction_group_id > StaticCompactionGroupId::End as u64); + + let old_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 102).await; assert_eq!( - get_compaction_group_object_ids(¤t_version, 2), + get_compaction_group_object_ids(¤t_version, old_compaction_group_id), Vec::::new() ); assert_eq!( - get_compaction_group_object_ids(¤t_version, new_group_id), + get_compaction_group_object_ids(¤t_version, new_compaction_group_id), vec![10, 11] ); assert_eq!( current_version .state_table_info - .compaction_group_member_table_ids(2) + .compaction_group_member_table_ids(old_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .collect_vec(), @@ -1503,7 +1650,7 @@ async fn test_split_compaction_group_on_demand_basic() { assert_eq!( current_version .state_table_info - .compaction_group_member_table_ids(new_group_id) + .compaction_group_member_table_ids(new_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .sorted() @@ -1515,7 +1662,10 @@ async fn test_split_compaction_group_on_demand_basic() { #[tokio::test] async fn test_split_compaction_group_on_demand_non_trivial() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let sst_1 = LocalSstableInfo { sst_info: SstableInfo { object_id: 10, @@ -1531,39 +1681,46 @@ async fn test_split_compaction_group_on_demand_non_trivial() { table_stats: Default::default(), }; hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) + .register_table_ids_for_test(&[(100, 2), (101, 2)]) .await .unwrap(); - hummock_manager - .commit_epoch_for_test(30, vec![sst_1], HashMap::from([(10, context_id)])) + hummock_meta_client + .commit_epoch( + 30, + SyncResult { + uncommitted_ssts: vec![sst_1], + ..Default::default() + }, + false, + ) .await .unwrap(); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); hummock_manager - .split_compaction_group(2, &[100], 0) + .split_compaction_group(compaction_group_id, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; assert_eq!(current_version.levels.len(), 3); - let new_group_id = current_version.levels.keys().max().cloned().unwrap(); - assert!(new_group_id > StaticCompactionGroupId::End as u64); + let new_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; + assert!(new_compaction_group_id > StaticCompactionGroupId::End as u64); + let old_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 101).await; assert_eq!( - get_compaction_group_object_ids(¤t_version, 2), + get_compaction_group_object_ids(¤t_version, old_compaction_group_id), vec![10] ); assert_eq!( - get_compaction_group_object_ids(¤t_version, new_group_id), + get_compaction_group_object_ids(¤t_version, new_compaction_group_id), vec![10] ); assert_eq!( current_version .state_table_info - .compaction_group_member_table_ids(2) + .compaction_group_member_table_ids(old_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .collect_vec(), @@ -1572,7 +1729,7 @@ async fn test_split_compaction_group_on_demand_non_trivial() { assert_eq!( current_version 
.state_table_info - .compaction_group_member_table_ids(new_group_id) + .compaction_group_member_table_ids(new_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .collect_vec(), @@ -1583,7 +1740,10 @@ async fn test_split_compaction_group_on_demand_non_trivial() { #[tokio::test] async fn test_split_compaction_group_trivial_expired() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let original_groups = hummock_manager .get_current_version() .await @@ -1593,14 +1753,9 @@ async fn test_split_compaction_group_trivial_expired() { .sorted() .collect_vec(); assert_eq!(original_groups, vec![2, 3]); - hummock_manager.compactor_manager.add_compactor(context_id); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) + .register_table_ids_for_test(&[(100, 2), (101, 2)]) .await .unwrap(); let sst_1 = LocalSstableInfo { @@ -1645,16 +1800,14 @@ async fn test_split_compaction_group_trivial_expired() { sst_3.sst_info.object_id = 8; sst_4.sst_info.sst_id = 9; sst_4.sst_info.object_id = 9; - hummock_manager - .commit_epoch_for_test( + hummock_meta_client + .commit_epoch( 30, - vec![sst_1, sst_2, sst_3, sst_4], - HashMap::from([ - (10, context_id), - (11, context_id), - (9, context_id), - (8, context_id), - ]), + SyncResult { + uncommitted_ssts: vec![sst_1, sst_2, sst_3, sst_4], + ..Default::default() + }, + false, ) .await .unwrap(); @@ -1671,19 +1824,23 @@ async fn test_split_compaction_group_trivial_expired() { .unwrap() .unwrap(); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); hummock_manager - .split_compaction_group(2, &[100], 0) + .split_compaction_group(compaction_group_id, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; - let new_group_id = current_version.levels.keys().max().cloned().unwrap(); + let new_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; + let old_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 101).await; assert_eq!(current_version.levels.len(), 3); - assert!(new_group_id > StaticCompactionGroupId::End as u64); + assert!(new_compaction_group_id > StaticCompactionGroupId::End as u64); assert_eq!( current_version .state_table_info - .compaction_group_member_table_ids(2) + .compaction_group_member_table_ids(old_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .sorted() @@ -1693,7 +1850,7 @@ async fn test_split_compaction_group_trivial_expired() { assert_eq!( current_version .state_table_info - .compaction_group_member_table_ids(new_group_id) + .compaction_group_member_table_ids(new_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .collect_vec(), @@ -1701,7 +1858,7 @@ async fn test_split_compaction_group_trivial_expired() { ); let task2 = hummock_manager - .get_compact_task(new_group_id, &mut default_compaction_selector()) + .get_compact_task(new_compaction_group_id, &mut default_compaction_selector()) .await .unwrap() .unwrap(); @@ -1735,18 +1892,17 @@ async fn test_split_compaction_group_trivial_expired() { } async fn get_manual_compact_task( - hummock_manager: &HummockManager, - context_id: HummockContextId, + hummock_manager_ref: HummockManagerRef, + compaction_group_id: u64, + level: usize, ) -> CompactTask { - 
hummock_manager.compactor_manager.add_compactor(context_id); - hummock_manager - .manual_get_compact_task( - 2, - ManualCompactionOption { - level: 0, - ..Default::default() - }, - ) + let manual_compcation_option = ManualCompactionOption { + level, + ..Default::default() + }; + + hummock_manager_ref + .manual_get_compact_task(compaction_group_id, manual_compcation_option) .await .unwrap() .unwrap() @@ -1755,14 +1911,13 @@ async fn get_manual_compact_task( #[tokio::test] async fn test_split_compaction_group_on_demand_bottom_levels() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) + .register_table_ids_for_test(&[(100, 2), (101, 2)]) .await .unwrap(); @@ -1784,12 +1939,22 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { }, table_stats: Default::default(), }; - hummock_manager - .commit_epoch_for_test(30, vec![sst_1.clone()], HashMap::from([(10, context_id)])) + hummock_meta_client + .commit_epoch( + 30, + SyncResult { + uncommitted_ssts: vec![sst_1.clone()], + ..Default::default() + }, + false, + ) .await .unwrap(); + // Construct data via manual compaction - let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + let compaction_task = + get_manual_compact_task(hummock_manager.clone(), compaction_group_id, 0).await; let base_level: usize = 6; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 1); assert_eq!(compaction_task.target_level, base_level as u32); @@ -1832,43 +1997,56 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { .unwrap()); let current_version = hummock_manager.get_current_version().await; assert!(current_version - .get_compaction_group_levels(2) + .get_compaction_group_levels(compaction_group_id) .l0 .sub_levels .is_empty()); assert_eq!( - current_version.get_compaction_group_levels(2).levels[base_level - 1] + current_version + .get_compaction_group_levels(compaction_group_id) + .levels[base_level - 1] .table_infos .len(), 2 ); hummock_manager - .split_compaction_group(2, &[100], 0) + .split_compaction_group(compaction_group_id, &[100], 0) .await .unwrap(); let current_version = hummock_manager.get_current_version().await; - let new_group_id = current_version.levels.keys().max().cloned().unwrap(); + let new_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; + let old_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 101).await; assert_eq!( - current_version.get_compaction_group_levels(2).levels[base_level - 1] + current_version + .get_compaction_group_levels(old_compaction_group_id) + .levels[base_level - 1] .table_infos .len(), 1 ); assert_eq!( - current_version.get_compaction_group_levels(2).levels[base_level - 1].table_infos[0] + current_version + .get_compaction_group_levels(old_compaction_group_id) + .levels[base_level - 1] + .table_infos[0] .object_id, sst_1.sst_info.object_id + 1, ); assert_eq!( - current_version.get_compaction_group_levels(2).levels[base_level - 1].table_infos[0] + current_version + .get_compaction_group_levels(old_compaction_group_id) + .levels[base_level - 1] + .table_infos[0] .table_ids, vec![101] ); 
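Throughout these tests, the old `commit_epoch_for_test(epoch, ssts, sst_to_worker)` call is replaced by a commit that goes through `MockHummockMetaClient`. A minimal sketch of that recurring pattern, assuming it lives in the meta crate's test code; `commit_ssts_for_test` is a hypothetical name, not part of this change:

```rust
use std::sync::Arc;

use risingwave_common::util::epoch::test_epoch;
use risingwave_hummock_sdk::{LocalSstableInfo, SyncResult};
use risingwave_rpc_client::HummockMetaClient;

use crate::hummock::MockHummockMetaClient;

/// Hypothetical helper: commits `ssts` at `raw_epoch` through the mock meta client,
/// mirroring what the tests in this file now do inline.
async fn commit_ssts_for_test(
    hummock_meta_client: &Arc<MockHummockMetaClient>,
    raw_epoch: u64,
    ssts: Vec<LocalSstableInfo>,
) {
    hummock_meta_client
        .commit_epoch(
            test_epoch(raw_epoch),
            SyncResult {
                uncommitted_ssts: ssts,
                ..Default::default()
            },
            // `is_log_store = false`: the mock then builds no per-table change log.
            false,
        )
        .await
        .unwrap();
}
```

With this shape, no sst-to-worker map is needed at the call site; the mock client derives the sst-to-context mapping and the tables to commit from the `SyncResult` itself.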
assert_eq!( current_version - .get_compaction_group_levels(new_group_id) + .get_compaction_group_levels(new_compaction_group_id) .levels[base_level - 1] .table_infos .len(), @@ -1876,7 +2054,7 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { ); assert_eq!( current_version - .get_compaction_group_levels(new_group_id) + .get_compaction_group_levels(new_compaction_group_id) .levels[base_level - 1] .table_infos[0] .table_ids, @@ -1884,7 +2062,7 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { ); assert_eq!( current_version - .get_compaction_group_levels(new_group_id) + .get_compaction_group_levels(new_compaction_group_id) .levels[base_level - 1] .table_infos[1] .table_ids, @@ -1895,14 +2073,14 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { #[tokio::test] async fn test_compaction_task_expiration_due_to_split_group() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) + .register_table_ids_for_test(&[(100, compaction_group_id), (101, compaction_group_id)]) .await .unwrap(); let sst_1 = LocalSstableInfo { @@ -1941,24 +2119,29 @@ async fn test_compaction_task_expiration_due_to_split_group() { }, table_stats: Default::default(), }; - hummock_manager - .commit_epoch_for_test( + + hummock_meta_client + .commit_epoch( 30, - vec![sst_1, sst_2], - HashMap::from([(10, context_id), (11, context_id)]), + SyncResult { + uncommitted_ssts: vec![sst_1, sst_2], + ..Default::default() + }, + false, ) .await .unwrap(); - let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + let compaction_task = + get_manual_compact_task(hummock_manager.clone(), compaction_group_id, 0).await; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 2); hummock_manager - .split_compaction_group(2, &[100], 0) + .split_compaction_group(compaction_group_id, &[100], 0) .await .unwrap(); let version_1 = hummock_manager.get_current_version().await; - // compaction_task.task_status = TaskStatus::Success.into(); assert!(!hummock_manager .report_compact_task(compaction_task.task_id, TaskStatus::Success, vec![], None) .await @@ -1969,7 +2152,8 @@ async fn test_compaction_task_expiration_due_to_split_group() { "version should not change because compaction task has been cancelled" ); - let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_task = + get_manual_compact_task(hummock_manager.clone(), compaction_group_id, 0).await; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 2); hummock_manager .report_compact_task(compaction_task.task_id, TaskStatus::Success, vec![], None) @@ -1986,75 +2170,73 @@ async fn test_compaction_task_expiration_due_to_split_group() { #[tokio::test] async fn test_move_tables_between_compaction_group() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) + 
.register_table_ids_for_test(&[(100, 2), (101, 2), (102, 2)]) .await .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(102, 2)]) - .await - .unwrap(); - let sst_1 = gen_local_sstable_info(10, 1, vec![100, 101, 102]); - hummock_manager - .commit_epoch_for_test(30, vec![sst_1.clone()], HashMap::from([(10, context_id)])) + let sst_1 = gen_local_sstable_info(10, vec![100, 101, 102], test_epoch(1)); + + hummock_meta_client + .commit_epoch( + 30, + SyncResult { + uncommitted_ssts: vec![sst_1.clone()], + ..Default::default() + }, + false, + ) .await .unwrap(); - // Construct data via manual compaction - let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; - let base_level: usize = 6; - assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 1); - assert_eq!(compaction_task.target_level, base_level as u32); - assert!(hummock_manager - .report_compact_task( - compaction_task.task_id, - TaskStatus::Success, - vec![ - gen_sstable_info(11, 1, vec![100]), - gen_sstable_info(12, 2, vec![100, 101]), - gen_sstable_info(13, 3, vec![101, 102]), - ], - None, + + let sst_2 = gen_local_sstable_info(14, vec![101, 102], test_epoch(2)); + + hummock_meta_client + .commit_epoch( + 31, + SyncResult { + uncommitted_ssts: vec![sst_2.clone()], + ..Default::default() + }, + false, ) .await - .unwrap()); - let sst_2 = gen_local_sstable_info(14, 1, vec![101, 102]); - hummock_manager - .commit_epoch_for_test(31, vec![sst_2.clone()], HashMap::from([(14, context_id)])) - .await .unwrap(); + let current_version = hummock_manager.get_current_version().await; - assert_eq!( - current_version.get_compaction_group_levels(2).levels[base_level - 1] - .table_infos - .len(), - 3 - ); + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; + let sst_ids = current_version + .get_sst_ids_by_group_id(compaction_group_id) + .collect_vec(); + assert_eq!(2, sst_ids.len()); + assert!(sst_ids.contains(&10)); + assert!(sst_ids.contains(&14)); hummock_manager .split_compaction_group(2, &[100], 0) .await .unwrap(); + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 101).await; let current_version = hummock_manager.get_current_version().await; - let new_group_id = current_version.levels.keys().max().cloned().unwrap(); - assert_eq!( - current_version.get_compaction_group_levels(2).levels[base_level - 1] - .table_infos - .len(), - 2 - ); + let sst_ids = current_version + .get_sst_ids_by_group_id(compaction_group_id) + .collect_vec(); + assert_eq!(2, sst_ids.len()); + assert!(!sst_ids.contains(&10)); - let level = ¤t_version - .get_compaction_group_levels(new_group_id) - .levels[base_level - 1]; - assert_eq!(level.table_infos[0].table_ids, vec![100]); - assert_eq!(level.table_infos[1].table_ids, vec![100]); - assert_eq!(level.table_infos.len(), 2); + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; + let sst_ids = current_version + .get_sst_ids_by_group_id(compaction_group_id) + .collect_vec(); + assert_eq!(1, sst_ids.len()); + assert!(!sst_ids.contains(&10)); } #[tokio::test] @@ -2068,6 +2250,10 @@ async fn test_gc_stats() { let registry = Registry::new(); let (_env, hummock_manager, _, worker_node) = setup_compute_env_with_metric(80, config, Some(MetaMetrics::for_test(®istry))).await; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + 
worker_node.id, + )); let context_id = worker_node.id; let assert_eq_gc_stats = |stale_object_size, stale_object_count, @@ -2106,8 +2292,14 @@ async fn test_gc_stats() { 0 ); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); hummock_manager.pin_version(context_id).await.unwrap(); - let _ = add_test_tables(&hummock_manager, context_id).await; + let _ = add_test_tables( + &hummock_manager, + hummock_meta_client.clone(), + compaction_group_id, + ) + .await; assert_eq_gc_stats(0, 0, 0, 0, 0, 0); assert_ne!( hummock_manager.create_version_checkpoint(0).await.unwrap(), @@ -2125,7 +2317,6 @@ async fn test_gc_stats() { hummock_manager.create_version_checkpoint(0).await.unwrap(), 0 ); - assert_eq_gc_stats(6, 3, 0, 0, 2, 4); } #[tokio::test] @@ -2139,79 +2330,63 @@ async fn test_partition_level() { let (env, hummock_manager, _, worker_node) = setup_compute_env_with_metric(80, config.clone(), Some(MetaMetrics::for_test(®istry))) .await; - let context_id = worker_node.id; - - hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) - .await - .unwrap(); - let sst_1 = gen_local_sstable_info(10, 1, vec![100, 101]); + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); hummock_manager - .commit_epoch_for_test(30, vec![sst_1.clone()], HashMap::from([(10, context_id)])) + .register_table_ids_for_test(&[(100, 2), (101, 2)]) .await .unwrap(); - // Construct data via manual compaction - let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; - let base_level: usize = 6; - assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 1); - assert_eq!(compaction_task.target_level, base_level as u32); - assert!(hummock_manager - .report_compact_task( - compaction_task.task_id, - TaskStatus::Success, - vec![ - gen_sstable_info(11, 1, vec![100]), - gen_sstable_info(12, 2, vec![101]), - ], - None, + let sst_1 = gen_local_sstable_info(10, vec![100, 101], test_epoch(1)); + + hummock_meta_client + .commit_epoch( + 30, + SyncResult { + uncommitted_ssts: vec![sst_1], + ..Default::default() + }, + false, ) .await - .unwrap()); + .unwrap(); + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); hummock_manager - .split_compaction_group(2, &[100], env.opts.partition_vnode_count) + .split_compaction_group(compaction_group_id, &[100], env.opts.partition_vnode_count) .await .unwrap(); - let current_version = hummock_manager.get_current_version().await; - - let new_group_id = current_version.levels.keys().max().cloned().unwrap(); - assert_eq!( - current_version - .get_compaction_group_levels(new_group_id) - .levels[base_level - 1] - .table_infos - .len(), - 1 - ); - + let new_compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager.clone(), 100).await; let mut global_sst_id = 13; const MB: u64 = 1024 * 1024; let mut selector = default_compaction_selector(); for epoch in 31..100 { - let mut sst = gen_local_sstable_info(global_sst_id, 10, vec![100]); + let mut sst = gen_local_sstable_info(global_sst_id, vec![100], test_epoch(epoch)); sst.sst_info.file_size = 10 * MB; sst.sst_info.sst_size = 10 * MB; sst.sst_info.uncompressed_file_size = 10 * MB; - hummock_manager - .commit_epoch_for_test( + hummock_meta_client + .commit_epoch( epoch, - vec![sst], - HashMap::from([(global_sst_id, context_id)]), + SyncResult { + uncommitted_ssts: vec![sst], + ..Default::default() + }, + false, ) .await 
.unwrap(); + global_sst_id += 1; if let Some(task) = hummock_manager - .get_compact_task(new_group_id, &mut selector) + .get_compact_task(new_compaction_group_id, &mut selector) .await .unwrap() { - let mut sst = gen_sstable_info(global_sst_id, 10, vec![100]); + let mut sst = gen_sstable_info(global_sst_id, vec![100], test_epoch(epoch)); sst.file_size = task .input_ssts .iter() @@ -2233,7 +2408,7 @@ async fn test_partition_level() { } } let current_version = hummock_manager.get_current_version().await; - let group = current_version.get_compaction_group_levels(new_group_id); + let group = current_version.get_compaction_group_levels(new_compaction_group_id); for sub_level in &group.l0.sub_levels { if sub_level.total_file_size > config.sub_level_max_compaction_bytes { assert!(sub_level.vnode_partition_count > 0); @@ -2244,7 +2419,10 @@ async fn test_partition_level() { #[tokio::test] async fn test_unregister_moved_table() { let (_env, hummock_manager, _, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; + let hummock_meta_client = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let original_groups = hummock_manager .get_current_version() .await @@ -2262,11 +2440,7 @@ async fn test_unregister_moved_table() { ); hummock_manager - .register_table_ids_for_test(&[(100, 2)]) - .await - .unwrap(); - hummock_manager - .register_table_ids_for_test(&[(101, 2)]) + .register_table_ids_for_test(&[(100, 2), (101, 2)]) .await .unwrap(); let sst_1 = LocalSstableInfo { @@ -2305,25 +2479,30 @@ async fn test_unregister_moved_table() { }, table_stats: Default::default(), }; - hummock_manager - .commit_epoch_for_test( + + hummock_meta_client + .commit_epoch( 30, - vec![sst_1, sst_2], - HashMap::from([(10, context_id), (11, context_id)]), + SyncResult { + uncommitted_ssts: vec![sst_1, sst_2], + ..Default::default() + }, + false, ) .await .unwrap(); - let new_group_id = hummock_manager - .split_compaction_group(2, &[100], 0) + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + let new_compaction_group_id = hummock_manager + .split_compaction_group(compaction_group_id, &[100], 0) .await .unwrap(); - assert_ne!(new_group_id, 2); - assert!(new_group_id > StaticCompactionGroupId::End as u64); + assert_ne!(new_compaction_group_id, 2); + assert!(new_compaction_group_id > StaticCompactionGroupId::End as u64); let current_version = hummock_manager.get_current_version().await; assert_eq!( - new_group_id, + new_compaction_group_id, current_version.levels.keys().max().cloned().unwrap() ); assert_eq!(current_version.levels.len(), 3); @@ -2332,7 +2511,7 @@ async fn test_unregister_moved_table() { vec![11] ); assert_eq!( - get_compaction_group_object_ids(¤t_version, new_group_id), + get_compaction_group_object_ids(¤t_version, new_compaction_group_id), vec![10, 11] ); assert_eq!( @@ -2347,7 +2526,7 @@ async fn test_unregister_moved_table() { assert_eq!( current_version .state_table_info - .compaction_group_member_table_ids(new_group_id) + .compaction_group_member_table_ids(new_compaction_group_id) .iter() .map(|table_id| table_id.table_id) .collect_vec(), @@ -2360,7 +2539,9 @@ async fn test_unregister_moved_table() { .unwrap(); let current_version = hummock_manager.get_current_version().await; assert_eq!(current_version.levels.len(), 2); - assert!(!current_version.levels.contains_key(&new_group_id)); + assert!(!current_version + .levels + .contains_key(&new_compaction_group_id)); assert_eq!( get_compaction_group_object_ids(¤t_version, 2), 
vec![11] diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index 499d9df0958c4..c926e2145e886 100644 --- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeSet; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; use std::time::SystemTime; @@ -22,6 +23,7 @@ use fail::fail_point; use futures::stream::BoxStream; use futures::{Stream, StreamExt}; use itertools::Itertools; +use risingwave_common::catalog::TableId; use risingwave_hummock_sdk::change_log::build_table_change_log_delta; use risingwave_hummock_sdk::compact_task::CompactTask; use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; @@ -162,14 +164,63 @@ impl HummockMetaClient for MockHummockMetaClient { }) } - async fn commit_epoch(&self, epoch: HummockEpoch, sync_result: SyncResult) -> Result<()> { + async fn commit_epoch( + &self, + epoch: HummockEpoch, + sync_result: SyncResult, + is_log_store: bool, + ) -> Result<()> { let version: HummockVersion = self.hummock_manager.get_current_version().await; - let sst_to_worker = sync_result + let table_ids = version + .state_table_info + .info() + .keys() + .map(|table_id| table_id.table_id) + .collect::>(); + + let old_value_ssts_vec = if is_log_store { + sync_result.old_value_ssts.clone() + } else { + vec![] + }; + let commit_table_ids = sync_result + .uncommitted_ssts + .iter() + .flat_map(|sstable| sstable.sst_info.table_ids.clone()) + .chain({ + old_value_ssts_vec + .iter() + .flat_map(|sstable| sstable.sst_info.table_ids.clone()) + }) + .collect::>(); + + let new_table_fragment_info = if commit_table_ids + .iter() + .all(|table_id| table_ids.contains(table_id)) + { + NewTableFragmentInfo::None + } else { + NewTableFragmentInfo::Normal { + mv_table_id: None, + internal_table_ids: commit_table_ids + .iter() + .cloned() + .map(TableId::from) + .collect_vec(), + } + }; + + let sst_to_context = sync_result .uncommitted_ssts .iter() .map(|LocalSstableInfo { sst_info, .. 
}| (sst_info.object_id, self.context_id))
             .collect();
         let new_table_watermark = sync_result.table_watermarks;
+        let table_change_log_table_ids = if is_log_store {
+            commit_table_ids.clone()
+        } else {
+            BTreeSet::new()
+        };
         let table_change_log = build_table_change_log_delta(
             sync_result
                 .old_value_ssts
                 .into_iter()
                 .map(|sst| sst.sst_info),
             sync_result.uncommitted_ssts.iter().map(|sst| &sst.sst_info),
             &vec![epoch],
-            version
-                .state_table_info
-                .info()
-                .keys()
-                .map(|table_id| (table_id.table_id, 0)),
+            table_change_log_table_ids
+                .into_iter()
+                .map(|table_id| (table_id, 0)),
         );
         self.hummock_manager
             .commit_epoch(CommitEpochInfo {
                 sstables: sync_result.uncommitted_ssts,
                 new_table_watermarks: new_table_watermark,
-                sst_to_context: sst_to_worker,
-                new_table_fragment_info: NewTableFragmentInfo::None,
+                sst_to_context,
+                new_table_fragment_info,
                 change_log_delta: table_change_log,
                 committed_epoch: epoch,
-                tables_to_commit: version.state_table_info.info().keys().cloned().collect(),
+                tables_to_commit: commit_table_ids
+                    .iter()
+                    .cloned()
+                    .map(TableId::from)
+                    .collect(),
                 is_visible_table_committed_epoch: true,
             })
             .await
diff --git a/src/meta/src/hummock/test_utils.rs b/src/meta/src/hummock/test_utils.rs
index 2188d9b539325..00cb52b34a0a4 100644
--- a/src/meta/src/hummock/test_utils.rs
+++ b/src/meta/src/hummock/test_utils.rs
@@ -22,7 +22,6 @@ use bytes::Bytes;
 use itertools::Itertools;
 use risingwave_common::catalog::{TableId, TableOption};
 use risingwave_common::util::epoch::test_epoch;
-use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId;
 use risingwave_hummock_sdk::key::key_with_epoch;
 use risingwave_hummock_sdk::key_range::KeyRange;
 use risingwave_hummock_sdk::level::Levels;
@@ -30,12 +29,13 @@ use risingwave_hummock_sdk::sstable_info::SstableInfo;
 use risingwave_hummock_sdk::table_watermark::TableWatermarks;
 use risingwave_hummock_sdk::version::{HummockVersion, HummockVersionStateTableInfo};
 use risingwave_hummock_sdk::{
-    CompactionGroupId, HummockContextId, HummockEpoch, HummockSstableObjectId, LocalSstableInfo,
+    CompactionGroupId, HummockEpoch, HummockSstableObjectId, LocalSstableInfo, SyncResult,
 };
 use risingwave_pb::common::{HostAddress, WorkerNode, WorkerType};
 use risingwave_pb::hummock::compact_task::TaskStatus;
 use risingwave_pb::hummock::CompactionConfig;
 use risingwave_pb::meta::add_worker_node_request::Property;
+use risingwave_rpc_client::HummockMetaClient;

 use crate::hummock::compaction::compaction_config::CompactionConfigBuilder;
 use crate::hummock::compaction::selector::{default_compaction_selector, LocalSelectorStatistic};
@@ -44,9 +44,7 @@ use crate::hummock::level_handler::LevelHandler;
 pub use crate::hummock::manager::CommitEpochInfo;
 use crate::hummock::model::CompactionGroup;
 use crate::hummock::{CompactorManager, HummockManager, HummockManagerRef};
-use crate::manager::{
-    ClusterManager, ClusterManagerRef, FragmentManager, MetaSrvEnv, META_NODE_ID,
-};
+use crate::manager::{ClusterManager, ClusterManagerRef, FragmentManager, MetaSrvEnv};
 use crate::rpc::metrics::MetaMetrics;

 pub fn to_local_sstable_info(ssts: &[SstableInfo]) -> Vec<LocalSstableInfo> {
@@ -55,9 +53,15 @@ pub fn to_local_sstable_info(ssts: &[SstableInfo]) -> Vec<LocalSstableInfo> {
         .collect_vec()
 }

+// This function has 3 phases:
+// 1. add 3 ssts to the given compaction group
+// 2. trigger a compaction and replace the input from phase 1 with the 1 new sst
+// 3. add 1 new sst
+// Please make sure the function does what you want before using it.
pub async fn add_test_tables( hummock_manager: &HummockManager, - context_id: HummockContextId, + hummock_meta_client: Arc, + compaction_group_id: CompactionGroupId, ) -> Vec> { // Increase version by 2. @@ -66,43 +70,31 @@ pub async fn add_test_tables( let mut epoch = test_epoch(1); let sstable_ids = get_sst_ids(hummock_manager, 3).await; let test_tables = generate_test_sstables_with_table_id(epoch, 1, sstable_ids); - register_sstable_infos_to_compaction_group( - hummock_manager, - &test_tables, - StaticCompactionGroupId::StateDefault.into(), - ) - .await; - let ssts = to_local_sstable_info(&test_tables); - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. }| (sst_info.object_id, context_id)) - .collect(); - hummock_manager - .commit_epoch_for_test(epoch, ssts, sst_to_worker) + register_sstable_infos_to_compaction_group(hummock_manager, &test_tables, compaction_group_id) + .await; + let test_local_tables = to_local_sstable_info(&test_tables); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: test_local_tables, + ..Default::default() + }, + false, + ) .await .unwrap(); + // Simulate a compaction and increase version by 1. - let mut temp_compactor = false; - if hummock_manager - .compactor_manager_ref_for_test() - .compactor_num() - == 0 - { - hummock_manager - .compactor_manager_ref_for_test() - .add_compactor(context_id); - temp_compactor = true; - } let test_tables_2 = generate_test_tables(epoch, get_sst_ids(hummock_manager, 1).await); register_sstable_infos_to_compaction_group( hummock_manager, &test_tables_2, - StaticCompactionGroupId::StateDefault.into(), + compaction_group_id, ) .await; - let mut selector = default_compaction_selector(); let mut compact_task = hummock_manager - .get_compact_task(StaticCompactionGroupId::StateDefault.into(), &mut selector) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap() .unwrap(); @@ -114,15 +106,8 @@ pub async fn add_test_tables( .sum::(), 3 ); - compact_task.target_level = 6; - if temp_compactor { - let compactor = hummock_manager - .compactor_manager_ref_for_test() - .next_compactor() - .unwrap(); - assert_eq!(compactor.context_id(), context_id); - } + compact_task.target_level = 6; hummock_manager .report_compact_task_for_test( compact_task.task_id, @@ -133,27 +118,25 @@ pub async fn add_test_tables( ) .await .unwrap(); - if temp_compactor { - hummock_manager - .compactor_manager_ref_for_test() - .remove_compactor(context_id); - } // Increase version by 1. epoch.inc_epoch(); let test_tables_3 = generate_test_tables(epoch, get_sst_ids(hummock_manager, 1).await); register_sstable_infos_to_compaction_group( hummock_manager, &test_tables_3, - StaticCompactionGroupId::StateDefault.into(), + compaction_group_id, ) .await; - let ssts = to_local_sstable_info(&test_tables_3); - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. 
}| (sst_info.object_id, context_id)) - .collect(); - hummock_manager - .commit_epoch_for_test(epoch, ssts, sst_to_worker) + let test_local_tables_3 = to_local_sstable_info(&test_tables_3); + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: test_local_tables_3, + ..Default::default() + }, + false, + ) .await .unwrap(); vec![test_tables, test_tables_2, test_tables_3] @@ -290,11 +273,9 @@ pub fn get_sorted_object_ids(sstables: &[SstableInfo]) -> Vec Vec { - let levels = match hummock_version - .levels - .get(&StaticCompactionGroupId::StateDefault.into()) - { + let levels = match hummock_version.levels.get(&compaction_group_id) { Some(levels) => levels, None => return vec![], }; @@ -385,34 +366,23 @@ pub async fn get_sst_ids( (range.start_id..range.end_id).collect_vec() } -pub async fn commit_from_meta_node( - hummock_manager_ref: &HummockManager, - epoch: HummockEpoch, - ssts: Vec, -) -> crate::hummock::error::Result<()> { - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. }| (sst_info.object_id, META_NODE_ID)) - .collect(); - hummock_manager_ref - .commit_epoch_for_test(epoch, ssts, sst_to_worker) - .await -} - pub async fn add_ssts( epoch: HummockEpoch, hummock_manager: &HummockManager, - context_id: HummockContextId, + hummock_meta_client: Arc, ) -> Vec { let table_ids = get_sst_ids(hummock_manager, 3).await; let test_tables = generate_test_sstables_with_table_id(test_epoch(epoch), 1, table_ids); let ssts = to_local_sstable_info(&test_tables); - let sst_to_worker = ssts - .iter() - .map(|LocalSstableInfo { sst_info, .. }| (sst_info.object_id, context_id)) - .collect(); - hummock_manager - .commit_epoch_for_test(epoch, ssts, sst_to_worker) + hummock_meta_client + .commit_epoch( + epoch, + SyncResult { + uncommitted_ssts: ssts, + ..Default::default() + }, + false, + ) .await .unwrap(); test_tables @@ -441,3 +411,12 @@ pub fn compaction_selector_context<'a>( state_table_info, } } + +pub async fn get_compaction_group_id_by_table_id( + hummock_manager_ref: HummockManagerRef, + table_id: u32, +) -> u64 { + let version = hummock_manager_ref.get_current_version().await; + let mapping = version.state_table_info.build_table_compaction_group_id(); + *mapping.get(&(table_id.into())).unwrap() +} diff --git a/src/meta/src/hummock/vacuum.rs b/src/meta/src/hummock/vacuum.rs index d747651b86d43..10e2c08abd6e5 100644 --- a/src/meta/src/hummock/vacuum.rs +++ b/src/meta/src/hummock/vacuum.rs @@ -228,17 +228,23 @@ mod tests { use std::sync::Arc; use itertools::Itertools; + use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_hummock_sdk::HummockVersionId; use risingwave_pb::hummock::VacuumTask; + use risingwave_rpc_client::HummockMetaClient; use crate::backup_restore::BackupManager; use crate::hummock::test_utils::{add_test_tables, setup_compute_env}; - use crate::hummock::VacuumManager; + use crate::hummock::{MockHummockMetaClient, VacuumManager}; #[tokio::test] async fn test_vacuum() { let (env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; let context_id = worker_node.id; + let hummock_meta_client: Arc = Arc::new(MockHummockMetaClient::new( + hummock_manager.clone(), + worker_node.id, + )); let compactor_manager = hummock_manager.compactor_manager_ref_for_test(); let backup_manager = Arc::new(BackupManager::for_test(env.clone(), hummock_manager.clone()).await); @@ -251,7 +257,13 @@ mod tests { assert_eq!(vacuum.vacuum_metadata().await.unwrap(), 0); 
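The reworked test utilities are meant to be used together: `add_test_tables` now commits through the mock meta client against an explicit compaction group, and the new `get_compaction_group_id_by_table_id` resolves which group a table currently belongs to. A sketch under those signatures; the test name and assertions are illustrative, not part of this change:

```rust
use std::sync::Arc;

use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId;

use crate::hummock::test_utils::{
    add_test_tables, get_compaction_group_id_by_table_id, setup_compute_env,
};
use crate::hummock::MockHummockMetaClient;

#[tokio::test]
async fn sketch_add_test_tables_usage() {
    let (_env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await;
    let hummock_meta_client = Arc::new(MockHummockMetaClient::new(
        hummock_manager.clone(),
        worker_node.id,
    ));

    // Phase 1 commits 3 SSTs, phase 2 compacts them, phase 3 commits 1 more.
    let compaction_group_id = StaticCompactionGroupId::StateDefault.into();
    let sst_infos = add_test_tables(
        hummock_manager.as_ref(),
        hummock_meta_client.clone(),
        compaction_group_id,
    )
    .await;
    // One Vec<SstableInfo> per phase.
    assert_eq!(sst_infos.len(), 3);

    // The helper reads the current version's state table info; table 1 was
    // registered to the default group by `add_test_tables` above.
    let group_id = get_compaction_group_id_by_table_id(hummock_manager.clone(), 1).await;
    assert_eq!(group_id, compaction_group_id);
}
```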
assert_eq!(vacuum.vacuum_object().await.unwrap().len(), 0); hummock_manager.pin_version(context_id).await.unwrap(); - let sst_infos = add_test_tables(hummock_manager.as_ref(), context_id).await; + let compaction_group_id = StaticCompactionGroupId::StateDefault.into(); + let sst_infos = add_test_tables( + hummock_manager.as_ref(), + hummock_meta_client.clone(), + compaction_group_id, + ) + .await; assert_eq!(vacuum.vacuum_metadata().await.unwrap(), 0); hummock_manager.create_version_checkpoint(1).await.unwrap(); assert_eq!(vacuum.vacuum_metadata().await.unwrap(), 6); diff --git a/src/meta/src/manager/catalog/fragment.rs b/src/meta/src/manager/catalog/fragment.rs index b734cdb54602a..6ec70a4b8d286 100644 --- a/src/meta/src/manager/catalog/fragment.rs +++ b/src/meta/src/manager/catalog/fragment.rs @@ -38,7 +38,6 @@ use risingwave_pb::stream_plan::update_mutation::MergeUpdate; use risingwave_pb::stream_plan::{ DispatchStrategy, Dispatcher, DispatcherType, FragmentTypeFlag, StreamActor, StreamNode, }; -use risingwave_pb::stream_service::BuildActorInfo; use tokio::sync::{RwLock, RwLockReadGuard}; use crate::barrier::Reschedule; @@ -49,7 +48,7 @@ use crate::model::{ TableParallelism, }; use crate::storage::Transaction; -use crate::stream::{to_build_actor_info, SplitAssignment, TableRevision}; +use crate::stream::{SplitAssignment, TableRevision}; use crate::{MetaError, MetaResult}; pub struct FragmentManagerCore { @@ -965,20 +964,14 @@ impl FragmentManager { pub async fn all_node_actors( &self, include_inactive: bool, - subscriptions: &HashMap>, - ) -> HashMap> { + ) -> HashMap> { let mut actor_maps = HashMap::new(); let map = &self.core.read().await.table_fragments; for fragments in map.values() { - let table_id = fragments.table_id(); for (node_id, actors) in fragments.worker_actors(include_inactive) { let node_actors = actor_maps.entry(node_id).or_insert_with(Vec::new); - node_actors.extend( - actors - .into_iter() - .map(|actor| to_build_actor_info(actor, subscriptions, table_id)), - ); + node_actors.extend(actors); } } diff --git a/src/meta/src/manager/metadata.rs b/src/meta/src/manager/metadata.rs index 935d4773865ed..4e0ca9e77f4b8 100644 --- a/src/meta/src/manager/metadata.rs +++ b/src/meta/src/manager/metadata.rs @@ -26,7 +26,6 @@ use risingwave_pb::common::{HostAddress, PbWorkerNode, PbWorkerType, WorkerNode, use risingwave_pb::meta::add_worker_node_request::Property as AddNodeProperty; use risingwave_pb::meta::table_fragments::{ActorStatus, Fragment, PbFragment}; use risingwave_pb::stream_plan::{PbDispatchStrategy, StreamActor}; -use risingwave_pb::stream_service::BuildActorInfo; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver}; use tokio::sync::oneshot; use tokio::time::{sleep, Instant}; @@ -42,7 +41,7 @@ use crate::manager::{ use crate::model::{ ActorId, ClusterId, FragmentId, MetadataModel, TableFragments, TableParallelism, }; -use crate::stream::{to_build_actor_info, SplitAssignment}; +use crate::stream::SplitAssignment; use crate::telemetry::MetaTelemetryJobDesc; use crate::{MetaError, MetaResult}; @@ -760,26 +759,19 @@ impl MetadataManager { pub async fn all_node_actors( &self, include_inactive: bool, - subscriptions: &HashMap>, - ) -> MetaResult>> { + ) -> MetaResult>> { match &self { - MetadataManager::V1(mgr) => Ok(mgr - .fragment_manager - .all_node_actors(include_inactive, subscriptions) - .await), + MetadataManager::V1(mgr) => { + Ok(mgr.fragment_manager.all_node_actors(include_inactive).await) + } MetadataManager::V2(mgr) => { let table_fragments = 
mgr.catalog_controller.table_fragments().await?; let mut actor_maps = HashMap::new(); for (_, fragments) in table_fragments { let tf = TableFragments::from_protobuf(fragments); - let table_id = tf.table_id(); for (node_id, actors) in tf.worker_actors(include_inactive) { let node_actors = actor_maps.entry(node_id).or_insert_with(Vec::new); - node_actors.extend( - actors - .into_iter() - .map(|actor| to_build_actor_info(actor, subscriptions, table_id)), - ) + node_actors.extend(actors) } } Ok(actor_maps) diff --git a/src/meta/src/rpc/ddl_controller_v2.rs b/src/meta/src/rpc/ddl_controller_v2.rs index c097fa5acb5c6..5e83e49b767a7 100644 --- a/src/meta/src/rpc/ddl_controller_v2.rs +++ b/src/meta/src/rpc/ddl_controller_v2.rs @@ -146,7 +146,7 @@ impl DdlController { let internal_tables = fragment_graph.internal_tables().into_values().collect_vec(); let table_id_map = mgr .catalog_controller - .create_internal_table_catalog(streaming_job.id() as _, internal_tables) + .create_internal_table_catalog(&streaming_job, internal_tables) .await?; fragment_graph.refill_internal_table_ids(table_id_map); diff --git a/src/meta/src/stream/mod.rs b/src/meta/src/stream/mod.rs index ec542ed8c008b..8f97f1a4634de 100644 --- a/src/meta/src/stream/mod.rs +++ b/src/meta/src/stream/mod.rs @@ -21,36 +21,8 @@ mod stream_manager; mod test_fragmenter; mod test_scale; -use std::collections::HashMap; - -use risingwave_common::catalog::TableId; -use risingwave_pb::stream_plan::StreamActor; -use risingwave_pb::stream_service::build_actor_info::SubscriptionIds; -use risingwave_pb::stream_service::BuildActorInfo; pub use scale::*; pub use sink::*; pub use source_manager::*; pub use stream_graph::*; pub use stream_manager::*; - -pub(crate) fn to_build_actor_info( - actor: StreamActor, - subscriptions: &HashMap>, - subscription_depend_table_id: TableId, -) -> BuildActorInfo { - BuildActorInfo { - actor: Some(actor), - related_subscriptions: subscriptions - .get(&subscription_depend_table_id) - .into_iter() - .map(|subscriptions| { - ( - subscription_depend_table_id.table_id, - SubscriptionIds { - subscription_ids: subscriptions.keys().cloned().collect(), - }, - ) - }) - .collect(), - } -} diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index ae5ca2a610b9c..8fcceac82c0cf 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -588,32 +588,6 @@ where ) } -pub fn validate_assignment(assignment: &mut HashMap>) { - // check if one split is assign to multiple actors - let mut split_to_actor = HashMap::new(); - for (actor_id, splits) in &mut *assignment { - let _ = splits.iter().map(|split| { - split_to_actor - .entry(split.id()) - .or_insert_with(Vec::new) - .push(*actor_id) - }); - } - - for (split_id, actor_ids) in &mut split_to_actor { - if actor_ids.len() > 1 { - tracing::warn!(split_id = ?split_id, actor_ids = ?actor_ids, "split is assigned to multiple actors"); - } - // keep the first actor and remove the rest from the assignment - for actor_id in actor_ids.iter().skip(1) { - assignment - .get_mut(actor_id) - .unwrap() - .retain(|split| split.id() != *split_id); - } - } -} - fn align_backfill_splits( backfill_actors: impl IntoIterator)>, upstream_assignment: &HashMap>, @@ -1205,14 +1179,11 @@ mod tests { use risingwave_common::types::JsonbVal; use risingwave_connector::error::ConnectorResult; - use risingwave_connector::source::test_source::TestSourceSplit; - use risingwave_connector::source::{SplitId, SplitImpl, SplitMetaData}; + use 
risingwave_connector::source::{SplitId, SplitMetaData}; use serde::{Deserialize, Serialize}; - use super::validate_assignment; use crate::model::{ActorId, FragmentId}; use crate::stream::source_manager::{reassign_splits, SplitDiffOptions}; - use crate::stream::SplitAssignment; #[derive(Debug, Copy, Clone, Serialize, Deserialize)] struct TestSplit { @@ -1333,49 +1304,6 @@ mod tests { assert!(!diff.is_empty()) } - #[test] - fn test_validate_assignment() { - let mut fragment_assignment: SplitAssignment; - let test_assignment: HashMap> = maplit::hashmap! { - 0 => vec![SplitImpl::Test( - TestSourceSplit {id: "1".into(), properties: Default::default(), offset: Default::default()} - ), SplitImpl::Test( - TestSourceSplit {id: "2".into(), properties: Default::default(), offset: Default::default()} - )], - 1 => vec![SplitImpl::Test( - TestSourceSplit {id: "3".into(), properties: Default::default(), offset: Default::default()} - )], - 2 => vec![SplitImpl::Test( - TestSourceSplit {id: "1".into(), properties: Default::default(), offset: Default::default()} - )], - }; - fragment_assignment = maplit::hashmap! { - 1 => test_assignment, - }; - - fragment_assignment.iter_mut().for_each(|(_, assignment)| { - validate_assignment(assignment); - }); - - { - let mut split_to_actor = HashMap::new(); - for actor_to_splits in fragment_assignment.values() { - for (actor_id, splits) in actor_to_splits { - let _ = splits.iter().map(|split| { - split_to_actor - .entry(split.id()) - .or_insert_with(Vec::new) - .push(*actor_id) - }); - } - } - - for actor_ids in split_to_actor.values() { - assert_eq!(actor_ids.len(), 1); - } - } - } - #[test] fn test_reassign_splits() { let actor_splits = HashMap::new(); diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index 5dc174106197c..118252038dbbd 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -846,7 +846,6 @@ mod tests { let mut guard = inner.actor_streams.lock().unwrap(); let mut actor_ids = inner.actor_ids.lock().unwrap(); for actor in req.actors_to_build { - let actor = actor.actor.as_ref().unwrap(); assert!(actor_ids.insert(actor.actor_id)); guard.insert(actor.get_actor_id(), actor.clone()); } diff --git a/src/prost/build.rs b/src/prost/build.rs index da595949b4427..0afbaef2ea730 100644 --- a/src/prost/build.rs +++ b/src/prost/build.rs @@ -147,6 +147,7 @@ fn main() -> Result<(), Box> { "plan_common.AdditionalColumnPartition", "#[derive(Eq, Hash)]", ) + .type_attribute("plan_common.AdditionalColumnPayload", "#[derive(Eq, Hash)]") .type_attribute( "plan_common.AdditionalColumnTimestamp", "#[derive(Eq, Hash)]", diff --git a/src/rpc_client/Cargo.toml b/src/rpc_client/Cargo.toml index 49729c6d9e8ac..6a25be3c21738 100644 --- a/src/rpc_client/Cargo.toml +++ b/src/rpc_client/Cargo.toml @@ -23,7 +23,7 @@ http = "1" hyper = "1" itertools = { workspace = true } lru = { workspace = true } -moka = { version = "0.12", features = ["future"] } +moka = { version = "0.12.0", features = ["future"] } paste = "1" rand = { workspace = true } risingwave_common = { workspace = true } diff --git a/src/rpc_client/src/hummock_meta_client.rs b/src/rpc_client/src/hummock_meta_client.rs index bb62875b3fae1..db99036a34754 100644 --- a/src/rpc_client/src/hummock_meta_client.rs +++ b/src/rpc_client/src/hummock_meta_client.rs @@ -38,7 +38,12 @@ pub trait HummockMetaClient: Send + Sync + 'static { async fn get_snapshot(&self) -> Result; async fn get_new_sst_ids(&self, number: u32) -> Result; // We keep `commit_epoch` 
only for test/benchmark. - async fn commit_epoch(&self, epoch: HummockEpoch, sync_result: SyncResult) -> Result<()>; + async fn commit_epoch( + &self, + epoch: HummockEpoch, + sync_result: SyncResult, + is_log_store: bool, + ) -> Result<()>; async fn report_vacuum_task(&self, vacuum_task: VacuumTask) -> Result<()>; async fn trigger_manual_compaction( &self, diff --git a/src/rpc_client/src/meta_client.rs b/src/rpc_client/src/meta_client.rs index c7b7204bff7c8..67ea55269b2bd 100644 --- a/src/rpc_client/src/meta_client.rs +++ b/src/rpc_client/src/meta_client.rs @@ -1531,7 +1531,12 @@ impl HummockMetaClient for MetaClient { Ok(SstObjectIdRange::new(resp.start_id, resp.end_id)) } - async fn commit_epoch(&self, _epoch: HummockEpoch, _sync_result: SyncResult) -> Result<()> { + async fn commit_epoch( + &self, + _epoch: HummockEpoch, + _sync_result: SyncResult, + _is_log_store: bool, + ) -> Result<()> { panic!("Only meta service can commit_epoch in production.") } diff --git a/src/rpc_client/src/stream_client.rs b/src/rpc_client/src/stream_client.rs index 40a6d48dacb37..92f0b25f762a0 100644 --- a/src/rpc_client/src/stream_client.rs +++ b/src/rpc_client/src/stream_client.rs @@ -12,16 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; use anyhow::anyhow; use async_trait::async_trait; use futures::TryStreamExt; +use risingwave_common::catalog::TableId; use risingwave_common::config::MAX_CONNECTION_WINDOW_SIZE; use risingwave_common::monitor::{EndpointExt, TcpConfig}; use risingwave_common::util::addr::HostAddr; use risingwave_hummock_sdk::HummockVersionId; +use risingwave_pb::stream_plan::SubscriptionUpstreamInfo; use risingwave_pb::stream_service::stream_service_client::StreamServiceClient; use risingwave_pb::stream_service::streaming_control_stream_request::InitRequest; use risingwave_pb::stream_service::streaming_control_stream_response::InitResponse; @@ -86,11 +89,23 @@ impl StreamClient { pub async fn start_streaming_control( &self, version_id: HummockVersionId, + mv_depended_subscriptions: &HashMap>, ) -> Result { let first_request = StreamingControlStreamRequest { request: Some(streaming_control_stream_request::Request::Init( InitRequest { version_id: version_id.to_u64(), + subscriptions: mv_depended_subscriptions + .iter() + .flat_map(|(table_id, subscriptions)| { + subscriptions + .keys() + .map(|subscriber_id| SubscriptionUpstreamInfo { + subscriber_id: *subscriber_id, + upstream_mv_table_id: table_id.table_id, + }) + }) + .collect(), }, )), }; diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 2886c4e4e23f7..b321c43b99e43 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -36,7 +36,7 @@ libc = "0.2" lz4 = "1.25.0" memcomparable = "0.2" metrics-prometheus = "0.7" -moka = { version = "0.12", features = ["future", "sync"] } +moka = { version = "0.12.0", features = ["future", "sync"] } more-asserts = "0.3" num-integer = "0.1" parking_lot = { workspace = true } @@ -96,7 +96,7 @@ workspace-hack = { path = "../workspace-hack" } bincode = "1" criterion = { workspace = true, features = ["async_futures", "async_tokio"] } expect-test = "1" -risingwave_hummock_sdk = { workspace = true } +risingwave_hummock_sdk = { workspace = true, features = ["test"] } risingwave_test_runner = { workspace = true } uuid = { version = "1", features = ["v4"] } diff --git a/src/storage/benches/bench_table_watermarks.rs 
b/src/storage/benches/bench_table_watermarks.rs index 4a9e1c5edda0b..5153dd0f9fe38 100644 --- a/src/storage/benches/bench_table_watermarks.rs +++ b/src/storage/benches/bench_table_watermarks.rs @@ -166,7 +166,7 @@ fn bench_table_watermarks(c: &mut Criterion) { let mut pinned_version = PinnedVersion::new(versions.pop_front().unwrap(), unbounded_channel().0); while let Some(version) = versions.pop_front() { - pinned_version = pinned_version.new_pin_version(version); + pinned_version = pinned_version.new_pin_version(version).unwrap(); } }, BatchSize::SmallInput, diff --git a/src/storage/hummock_sdk/src/change_log.rs b/src/storage/hummock_sdk/src/change_log.rs index 433309acab930..c231b0eb6b7b5 100644 --- a/src/storage/hummock_sdk/src/change_log.rs +++ b/src/storage/hummock_sdk/src/change_log.rs @@ -16,32 +16,42 @@ use std::collections::HashMap; use risingwave_common::catalog::TableId; use risingwave_pb::hummock::hummock_version_delta::PbChangeLogDelta; -use risingwave_pb::hummock::{PbEpochNewChangeLog, PbTableChangeLog}; +use risingwave_pb::hummock::{PbEpochNewChangeLog, PbSstableInfo, PbTableChangeLog}; use tracing::warn; use crate::sstable_info::SstableInfo; #[derive(Debug, Clone, PartialEq)] -pub struct TableChangeLog(pub Vec); +pub struct TableChangeLogCommon(pub Vec>); + +pub type TableChangeLog = TableChangeLogCommon; #[derive(Debug, Clone, PartialEq)] -pub struct EpochNewChangeLog { - pub new_value: Vec, - pub old_value: Vec, +pub struct EpochNewChangeLogCommon { + pub new_value: Vec, + pub old_value: Vec, pub epochs: Vec, } -impl From<&EpochNewChangeLog> for PbEpochNewChangeLog { - fn from(val: &EpochNewChangeLog) -> Self { +pub type EpochNewChangeLog = EpochNewChangeLogCommon; + +impl From<&EpochNewChangeLogCommon> for PbEpochNewChangeLog +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(val: &EpochNewChangeLogCommon) -> Self { Self { - new_value: val.new_value.iter().map(|a| a.clone().into()).collect(), - old_value: val.old_value.iter().map(|a| a.clone().into()).collect(), + new_value: val.new_value.iter().map(|a| a.into()).collect(), + old_value: val.old_value.iter().map(|a| a.into()).collect(), epochs: val.epochs.clone(), } } } -impl From<&PbEpochNewChangeLog> for EpochNewChangeLog { +impl From<&PbEpochNewChangeLog> for EpochNewChangeLogCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(value: &PbEpochNewChangeLog) -> Self { Self { new_value: value.new_value.iter().map(|a| a.into()).collect(), @@ -51,30 +61,28 @@ impl From<&PbEpochNewChangeLog> for EpochNewChangeLog { } } -impl From for PbEpochNewChangeLog { - fn from(val: EpochNewChangeLog) -> Self { +impl From> for PbEpochNewChangeLog +where + PbSstableInfo: From, +{ + fn from(val: EpochNewChangeLogCommon) -> Self { Self { - new_value: val - .new_value - .into_iter() - .map(|a| a.clone().into()) - .collect(), - old_value: val - .old_value - .into_iter() - .map(|a| a.clone().into()) - .collect(), - epochs: val.epochs.clone(), + new_value: val.new_value.into_iter().map(|a| a.into()).collect(), + old_value: val.old_value.into_iter().map(|a| a.into()).collect(), + epochs: val.epochs, } } } -impl From for EpochNewChangeLog { +impl From for EpochNewChangeLogCommon +where + T: From, +{ fn from(value: PbEpochNewChangeLog) -> Self { Self { new_value: value.new_value.into_iter().map(|a| a.into()).collect(), old_value: value.old_value.into_iter().map(|a| a.into()).collect(), - epochs: value.epochs.clone(), + epochs: value.epochs, } } } @@ -117,15 +125,23 @@ impl TableChangeLog { } } -impl TableChangeLog { +impl 
TableChangeLogCommon +where + PbSstableInfo: for<'a> From<&'a T>, +{ pub fn to_protobuf(&self) -> PbTableChangeLog { PbTableChangeLog { change_logs: self.0.iter().map(|a| a.into()).collect(), } } +} +impl TableChangeLogCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ pub fn from_protobuf(val: &PbTableChangeLog) -> Self { - Self(val.change_logs.clone().iter().map(|a| a.into()).collect()) + Self(val.change_logs.iter().map(|a| a.into()).collect()) } } @@ -173,13 +189,18 @@ pub fn build_table_change_log_delta<'a>( } #[derive(Debug, PartialEq, Clone)] -pub struct ChangeLogDelta { +pub struct ChangeLogDeltaCommon { pub truncate_epoch: u64, - pub new_log: Option, + pub new_log: Option>, } -impl From<&ChangeLogDelta> for PbChangeLogDelta { - fn from(val: &ChangeLogDelta) -> Self { +pub type ChangeLogDelta = ChangeLogDeltaCommon; + +impl From<&ChangeLogDeltaCommon> for PbChangeLogDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(val: &ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, new_log: val.new_log.as_ref().map(|a| a.into()), @@ -187,7 +208,10 @@ impl From<&ChangeLogDelta> for PbChangeLogDelta { } } -impl From<&PbChangeLogDelta> for ChangeLogDelta { +impl From<&PbChangeLogDelta> for ChangeLogDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(val: &PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, @@ -196,8 +220,11 @@ impl From<&PbChangeLogDelta> for ChangeLogDelta { } } -impl From for PbChangeLogDelta { - fn from(val: ChangeLogDelta) -> Self { +impl From> for PbChangeLogDelta +where + PbSstableInfo: From, +{ + fn from(val: ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, new_log: val.new_log.map(|a| a.into()), @@ -205,7 +232,10 @@ impl From for PbChangeLogDelta { } } -impl From for ChangeLogDelta { +impl From for ChangeLogDeltaCommon +where + T: From, +{ fn from(val: PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, @@ -218,11 +248,12 @@ impl From for ChangeLogDelta { mod tests { use itertools::Itertools; - use crate::change_log::{EpochNewChangeLog, TableChangeLog}; + use crate::change_log::{EpochNewChangeLog, TableChangeLogCommon}; + use crate::sstable_info::SstableInfo; #[test] fn test_filter_epoch() { - let table_change_log = TableChangeLog(vec![ + let table_change_log = TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], @@ -262,7 +293,7 @@ mod tests { #[test] fn test_truncate() { - let mut table_change_log = TableChangeLog(vec![ + let mut table_change_log = TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], @@ -288,7 +319,7 @@ mod tests { table_change_log.truncate(1); assert_eq!( table_change_log, - TableChangeLog(vec![ + TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], @@ -310,7 +341,7 @@ mod tests { table_change_log.truncate(3); assert_eq!( table_change_log, - TableChangeLog(vec![ + TableChangeLogCommon::(vec![ EpochNewChangeLog { new_value: vec![], old_value: vec![], diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index 682cb107f3395..376626e844242 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -29,7 +29,7 @@ use tracing::warn; use super::group_split::get_sub_level_insert_hint; use super::{group_split, StateTableId}; -use 
crate::change_log::TableChangeLog; +use crate::change_log::TableChangeLogCommon; use crate::compaction_group::StaticCompactionGroupId; use crate::key_range::KeyRangeCommon; use crate::level::{Level, Levels, OverlappingLevel}; @@ -795,7 +795,7 @@ impl HummockVersion { change_log.0.push(new_change_log.clone()); } Entry::Vacant(entry) => { - entry.insert(TableChangeLog(vec![new_change_log.clone()])); + entry.insert(TableChangeLogCommon(vec![new_change_log.clone()])); } }; } @@ -1458,20 +1458,22 @@ mod tests { #[test] fn test_get_sst_object_ids() { - let mut version = HummockVersion::default(); - version.id = HummockVersionId::new(0); - version.levels = HashMap::from_iter([( - 0, - Levels { - levels: vec![], - l0: OverlappingLevel { - sub_levels: vec![], - total_file_size: 0, - uncompressed_file_size: 0, + let mut version = HummockVersion { + id: HummockVersionId::new(0), + levels: HashMap::from_iter([( + 0, + Levels { + levels: vec![], + l0: OverlappingLevel { + sub_levels: vec![], + total_file_size: 0, + uncompressed_file_size: 0, + }, + ..Default::default() }, - ..Default::default() - }, - )]); + )]), + ..Default::default() + }; assert_eq!(version.get_object_ids().len(), 0); // Add to sub level @@ -1505,68 +1507,72 @@ mod tests { #[test] fn test_apply_version_delta() { - let mut version = HummockVersion::default(); - version.id = HummockVersionId::new(0); - version.levels = HashMap::from_iter([ - ( - 0, - build_initial_compaction_group_levels( + let mut version = HummockVersion { + id: HummockVersionId::new(0), + levels: HashMap::from_iter([ + ( 0, - &CompactionConfig { - max_level: 6, - ..Default::default() - }, + build_initial_compaction_group_levels( + 0, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ), ), - ), - ( - 1, - build_initial_compaction_group_levels( + ( 1, - &CompactionConfig { - max_level: 6, - ..Default::default() - }, - ), - ), - ]); - let mut version_delta = HummockVersionDelta::default(); - version_delta.id = HummockVersionId::new(1); - version_delta.group_deltas = HashMap::from_iter([ - ( - 2, - GroupDeltas { - group_deltas: vec![GroupDelta::GroupConstruct(GroupConstruct { - group_config: Some(CompactionConfig { + build_initial_compaction_group_levels( + 1, + &CompactionConfig { max_level: 6, ..Default::default() - }), - ..Default::default() - })], - }, - ), - ( - 0, - GroupDeltas { - group_deltas: vec![GroupDelta::GroupDestroy(GroupDestroy {})], - }, - ), - ( - 1, - GroupDeltas { - group_deltas: vec![GroupDelta::IntraLevel(IntraLevelDelta::new( - 1, - 0, - vec![], - vec![SstableInfo { - object_id: 1, - sst_id: 1, + }, + ), + ), + ]), + ..Default::default() + }; + let version_delta = HummockVersionDelta { + id: HummockVersionId::new(1), + group_deltas: HashMap::from_iter([ + ( + 2, + GroupDeltas { + group_deltas: vec![GroupDelta::GroupConstruct(GroupConstruct { + group_config: Some(CompactionConfig { + max_level: 6, + ..Default::default() + }), ..Default::default() - }], - 0, - ))], - }, - ), - ]); + })], + }, + ), + ( + 0, + GroupDeltas { + group_deltas: vec![GroupDelta::GroupDestroy(GroupDestroy {})], + }, + ), + ( + 1, + GroupDeltas { + group_deltas: vec![GroupDelta::IntraLevel(IntraLevelDelta::new( + 1, + 0, + vec![], + vec![SstableInfo { + object_id: 1, + sst_id: 1, + ..Default::default() + }], + 0, + ))], + }, + ), + ]), + ..Default::default() + }; let version_delta = version_delta; version.apply_version_delta(&version_delta); @@ -1587,24 +1593,26 @@ mod tests { }], ..Default::default() }; - assert_eq!(version, { - let mut version = 
HummockVersion::default(); - version.id = HummockVersionId::new(1); - version.levels = HashMap::from_iter([ - ( - 2, - build_initial_compaction_group_levels( + assert_eq!( + version, + HummockVersion { + id: HummockVersionId::new(1), + levels: HashMap::from_iter([ + ( 2, - &CompactionConfig { - max_level: 6, - ..Default::default() - }, + build_initial_compaction_group_levels( + 2, + &CompactionConfig { + max_level: 6, + ..Default::default() + }, + ), ), - ), - (1, cg1), - ]); - version - }); + (1, cg1), + ]), + ..Default::default() + } + ); } fn gen_sst_info(object_id: u64, table_ids: Vec, left: Bytes, right: Bytes) -> SstableInfo { diff --git a/src/storage/hummock_sdk/src/key.rs b/src/storage/hummock_sdk/src/key.rs index 6a33d1ff1a09b..0f04440ec5489 100644 --- a/src/storage/hummock_sdk/src/key.rs +++ b/src/storage/hummock_sdk/src/key.rs @@ -24,7 +24,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_common_estimate_size::EstimateSize; -use serde::{Deserialize, Serialize}; use crate::{EpochWithGap, HummockEpoch}; @@ -441,14 +440,8 @@ impl CopyFromSlice for Bytes { /// /// Its name come from the assumption that Hummock is always accessed by a table-like structure /// identified by a [`TableId`]. -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Serialize, Deserialize)] -pub struct TableKey>( - #[serde(bound( - serialize = "T: serde::Serialize + serde_bytes::Serialize", - deserialize = "T: serde::Deserialize<'de> + serde_bytes::Deserialize<'de>" - ))] - pub T, -); +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] +pub struct TableKey>(pub T); impl> Debug for TableKey { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -542,15 +535,11 @@ pub fn gen_key_from_str(vnode: VirtualNode, payload: &str) -> TableKey { /// will group these two values into one struct for convenient filtering. /// /// The encoded format is | `table_id` | `table_key` |. -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Serialize, Deserialize)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] pub struct UserKey> { // When comparing `UserKey`, we first compare `table_id`, then `table_key`. So the order of // declaration matters. pub table_id: TableId, - #[serde(bound( - serialize = "T: serde::Serialize + serde_bytes::Serialize", - deserialize = "T: serde::Deserialize<'de> + serde_bytes::Deserialize<'de>" - ))] pub table_key: TableKey, } @@ -590,15 +579,6 @@ impl> UserKey { buf.put_slice(self.table_key.as_ref()); } - /// Encode in to a buffer. - /// - /// length prefixed requires 4B more than its `encoded_len()` - pub fn encode_length_prefixed(&self, mut buf: impl BufMut) { - buf.put_u32(self.table_id.table_id()); - buf.put_u32(self.table_key.as_ref().len() as u32); - buf.put_slice(self.table_key.as_ref()); - } - pub fn encode(&self) -> Vec { let mut ret = Vec::with_capacity(TABLE_PREFIX_LEN + self.table_key.as_ref().len()); self.encode_into(&mut ret); @@ -658,16 +638,6 @@ impl> UserKey { } } -impl UserKey> { - pub fn decode_length_prefixed(buf: &mut &[u8]) -> Self { - let table_id = buf.get_u32(); - let len = buf.get_u32() as usize; - let data = buf[..len].to_vec(); - buf.advance(len); - UserKey::new(TableId::new(table_id), TableKey(data)) - } -} - impl> UserKey { /// Use this method to override an old `UserKey>` with a `UserKey<&[u8]>` to own the /// table key without reallocating a new `UserKey` object. 
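// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a round-trip of the
// length-prefixed user-key layout that the `encode_length_prefixed` /
// `decode_length_prefixed` helpers removed above implement, and that the
// `range_delete_backward_compatibility_serde_struct` module in the next hunk
// keeps so previously persisted data can still be decoded:
//     | table_id: u32 (BE) | key_len: u32 (BE) | key bytes |
// Plain `u32` / `Vec<u8>` stand in here for the crate's `TableId` / `TableKey`
// types; this is a standalone example, not the crate API.

use bytes::{Buf, BufMut};

fn encode_length_prefixed(table_id: u32, table_key: &[u8], mut buf: impl BufMut) {
    // 4-byte table id, 4-byte key length, then the raw key bytes.
    buf.put_u32(table_id);
    buf.put_u32(table_key.len() as u32);
    buf.put_slice(table_key);
}

fn decode_length_prefixed(buf: &mut &[u8]) -> (u32, Vec<u8>) {
    // Mirror of the encoder: read the id, then the declared number of key bytes.
    let table_id = buf.get_u32();
    let len = buf.get_u32() as usize;
    let table_key = buf[..len].to_vec();
    buf.advance(len);
    (table_id, table_key)
}

fn main() {
    let mut encoded = Vec::new();
    encode_length_prefixed(42, b"user_key", &mut encoded);

    let mut reader = encoded.as_slice();
    assert_eq!(decode_length_prefixed(&mut reader), (42, b"user_key".to_vec()));
    assert!(reader.is_empty());
}
// ---------------------------------------------------------------------------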
@@ -882,48 +852,50 @@ impl + Ord + Eq> PartialOrd for FullKey { } } -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] -pub struct PointRange> { - // When comparing `PointRange`, we first compare `left_user_key`, then - // `is_exclude_left_key`. Therefore the order of declaration matters. - #[serde(bound( - serialize = "T: serde::Serialize + serde_bytes::Serialize", - deserialize = "T: serde::Deserialize<'de> + serde_bytes::Deserialize<'de>" - ))] - pub left_user_key: UserKey, - /// `PointRange` represents the left user key itself if `is_exclude_left_key==false` - /// while represents the right δ Neighborhood of the left user key if - /// `is_exclude_left_key==true`. - pub is_exclude_left_key: bool, -} +pub mod range_delete_backward_compatibility_serde_struct { + use bytes::{Buf, BufMut}; + use risingwave_common::catalog::TableId; + use serde::{Deserialize, Serialize}; -impl> PointRange { - pub fn from_user_key(left_user_key: UserKey, is_exclude_left_key: bool) -> Self { - Self { - left_user_key, - is_exclude_left_key, - } - } + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] + pub struct TableKey(Vec); - pub fn as_ref(&self) -> PointRange<&[u8]> { - PointRange::from_user_key(self.left_user_key.as_ref(), self.is_exclude_left_key) + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] + pub struct UserKey { + // When comparing `UserKey`, we first compare `table_id`, then `table_key`. So the order of + // declaration matters. + pub table_id: TableId, + pub table_key: TableKey, } - pub fn is_empty(&self) -> bool { - self.left_user_key.is_empty() - } -} + impl UserKey { + pub fn decode_length_prefixed(buf: &mut &[u8]) -> Self { + let table_id = buf.get_u32(); + let len = buf.get_u32() as usize; + let data = buf[..len].to_vec(); + buf.advance(len); + UserKey { + table_id: TableId::new(table_id), + table_key: TableKey(data), + } + } -impl<'a> PointRange<&'a [u8]> { - pub fn to_vec(&self) -> PointRange> { - self.copy_into() + pub fn encode_length_prefixed(&self, mut buf: impl BufMut) { + buf.put_u32(self.table_id.table_id()); + buf.put_u32(self.table_key.0.as_slice().len() as u32); + buf.put_slice(self.table_key.0.as_slice()); + } } - pub fn copy_into>(&self) -> PointRange { - PointRange { - left_user_key: self.left_user_key.copy_into(), - is_exclude_left_key: self.is_exclude_left_key, - } + #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] + pub struct PointRange { + // When comparing `PointRange`, we first compare `left_user_key`, then + // `is_exclude_left_key`. Therefore the order of declaration matters. + pub left_user_key: UserKey, + /// `PointRange` represents the left user key itself if `is_exclude_left_key==false` + /// while represents the right δ Neighborhood of the left user key if + /// `is_exclude_left_key==true`. 
+ pub is_exclude_left_key: bool, } } diff --git a/src/storage/hummock_sdk/src/level.rs b/src/storage/hummock_sdk/src/level.rs index c7db09e69e76d..762b5abd25ac9 100644 --- a/src/storage/hummock_sdk/src/level.rs +++ b/src/storage/hummock_sdk/src/level.rs @@ -23,19 +23,24 @@ use risingwave_pb::hummock::{ use crate::sstable_info::SstableInfo; #[derive(Debug, Clone, PartialEq, Default)] -pub struct OverlappingLevel { - pub sub_levels: Vec, +pub struct OverlappingLevelCommon { + pub sub_levels: Vec>, pub total_file_size: u64, pub uncompressed_file_size: u64, } -impl From<&PbOverlappingLevel> for OverlappingLevel { +pub type OverlappingLevel = OverlappingLevelCommon; + +impl From<&PbOverlappingLevel> for OverlappingLevelCommon +where + for<'a> LevelCommon: From<&'a PbLevel>, +{ fn from(pb_overlapping_level: &PbOverlappingLevel) -> Self { Self { sub_levels: pb_overlapping_level .sub_levels .iter() - .map(Level::from) + .map(LevelCommon::from) .collect_vec(), total_file_size: pb_overlapping_level.total_file_size, uncompressed_file_size: pb_overlapping_level.uncompressed_file_size, @@ -43,13 +48,16 @@ impl From<&PbOverlappingLevel> for OverlappingLevel { } } -impl From<&OverlappingLevel> for PbOverlappingLevel { - fn from(overlapping_level: &OverlappingLevel) -> Self { +impl From<&OverlappingLevelCommon> for PbOverlappingLevel +where + for<'a> &'a LevelCommon: Into, +{ + fn from(overlapping_level: &OverlappingLevelCommon) -> Self { Self { sub_levels: overlapping_level .sub_levels .iter() - .map(|pb_level| pb_level.into()) + .map(|level| level.into()) .collect_vec(), total_file_size: overlapping_level.total_file_size, uncompressed_file_size: overlapping_level.uncompressed_file_size, @@ -57,8 +65,11 @@ impl From<&OverlappingLevel> for PbOverlappingLevel { } } -impl From for PbOverlappingLevel { - fn from(overlapping_level: OverlappingLevel) -> Self { +impl From> for PbOverlappingLevel +where + LevelCommon: Into, +{ + fn from(overlapping_level: OverlappingLevelCommon) -> Self { Self { sub_levels: overlapping_level .sub_levels @@ -71,13 +82,16 @@ impl From for PbOverlappingLevel { } } -impl From for OverlappingLevel { +impl From for OverlappingLevelCommon +where + LevelCommon: From, +{ fn from(pb_overlapping_level: PbOverlappingLevel) -> Self { Self { sub_levels: pb_overlapping_level .sub_levels .into_iter() - .map(Level::from) + .map(LevelCommon::from) .collect_vec(), total_file_size: pb_overlapping_level.total_file_size, uncompressed_file_size: pb_overlapping_level.uncompressed_file_size, @@ -97,26 +111,27 @@ impl OverlappingLevel { } #[derive(Debug, Clone, PartialEq, Default)] -pub struct Level { +pub struct LevelCommon { pub level_idx: u32, pub level_type: PbLevelType, - pub table_infos: Vec, + pub table_infos: Vec, pub total_file_size: u64, pub sub_level_id: u64, pub uncompressed_file_size: u64, pub vnode_partition_count: u32, } -impl From<&PbLevel> for Level { +pub type Level = LevelCommon; + +impl From<&PbLevel> for LevelCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_level: &PbLevel) -> Self { Self { level_idx: pb_level.level_idx, level_type: PbLevelType::try_from(pb_level.level_type).unwrap(), - table_infos: pb_level - .table_infos - .iter() - .map(SstableInfo::from) - .collect_vec(), + table_infos: pb_level.table_infos.iter().map(Into::into).collect_vec(), total_file_size: pb_level.total_file_size, sub_level_id: pb_level.sub_level_id, uncompressed_file_size: pb_level.uncompressed_file_size, @@ -125,16 +140,15 @@ impl From<&PbLevel> for Level { } } -impl From<&Level> for 
PbLevel { - fn from(level: &Level) -> Self { +impl From<&LevelCommon> for PbLevel +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(level: &LevelCommon) -> Self { Self { level_idx: level.level_idx, level_type: level.level_type.into(), - table_infos: level - .table_infos - .iter() - .map(PbSstableInfo::from) - .collect_vec(), + table_infos: level.table_infos.iter().map(Into::into).collect_vec(), total_file_size: level.total_file_size, sub_level_id: level.sub_level_id, uncompressed_file_size: level.uncompressed_file_size, @@ -143,16 +157,15 @@ impl From<&Level> for PbLevel { } } -impl From for PbLevel { - fn from(level: Level) -> Self { +impl From> for PbLevel +where + PbSstableInfo: From, +{ + fn from(level: LevelCommon) -> Self { Self { level_idx: level.level_idx, level_type: level.level_type.into(), - table_infos: level - .table_infos - .into_iter() - .map(PbSstableInfo::from) - .collect_vec(), + table_infos: level.table_infos.into_iter().map(Into::into).collect_vec(), total_file_size: level.total_file_size, sub_level_id: level.sub_level_id, uncompressed_file_size: level.uncompressed_file_size, @@ -161,7 +174,10 @@ impl From for PbLevel { } } -impl From for Level { +impl From for LevelCommon +where + T: From, +{ fn from(pb_level: PbLevel) -> Self { Self { level_idx: pb_level.level_idx, @@ -169,7 +185,7 @@ impl From for Level { table_infos: pb_level .table_infos .into_iter() - .map(SstableInfo::from) + .map(Into::into) .collect_vec(), total_file_size: pb_level.total_file_size, sub_level_id: pb_level.sub_level_id, @@ -196,9 +212,9 @@ impl Level { } #[derive(Debug, Clone, PartialEq, Default)] -pub struct Levels { - pub levels: Vec, - pub l0: OverlappingLevel, +pub struct LevelsCommon { + pub levels: Vec>, + pub l0: OverlappingLevelCommon, pub group_id: u64, pub parent_group_id: u64, @@ -206,6 +222,8 @@ pub struct Levels { pub member_table_ids: Vec, } +pub type Levels = LevelsCommon; + impl Levels { pub fn level0(&self) -> &OverlappingLevel { &self.l0 @@ -236,15 +254,25 @@ impl Levels { } } -impl Levels { - pub fn from_protobuf(pb_levels: &PbLevels) -> Self { - Self::from(pb_levels) - } - +impl LevelsCommon +where + PbLevels: for<'a> From<&'a LevelsCommon>, +{ pub fn to_protobuf(&self) -> PbLevels { self.into() } +} + +impl LevelsCommon +where + LevelsCommon: for<'a> From<&'a PbLevels>, +{ + pub fn from_protobuf(pb_levels: &PbLevels) -> LevelsCommon { + LevelsCommon::::from(pb_levels) + } +} +impl Levels { pub fn estimated_encode_len(&self) -> usize { let mut basic = self .levels @@ -260,12 +288,15 @@ impl Levels { } } -impl From<&PbLevels> for Levels { +impl From<&PbLevels> for LevelsCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ #[expect(deprecated)] fn from(pb_levels: &PbLevels) -> Self { Self { - l0: OverlappingLevel::from(pb_levels.l0.as_ref().unwrap()), - levels: pb_levels.levels.iter().map(Level::from).collect_vec(), + l0: OverlappingLevelCommon::from(pb_levels.l0.as_ref().unwrap()), + levels: pb_levels.levels.iter().map(Into::into).collect_vec(), group_id: pb_levels.group_id, parent_group_id: pb_levels.parent_group_id, member_table_ids: pb_levels.member_table_ids.clone(), @@ -273,9 +304,12 @@ impl From<&PbLevels> for Levels { } } -impl From<&Levels> for PbLevels { +impl From<&LevelsCommon> for PbLevels +where + PbSstableInfo: for<'a> From<&'a T>, +{ #[expect(deprecated)] - fn from(levels: &Levels) -> Self { + fn from(levels: &LevelsCommon) -> Self { Self { l0: Some((&levels.l0).into()), levels: levels.levels.iter().map(PbLevel::from).collect_vec(), @@ -286,28 +320,38 
@@ impl From<&Levels> for PbLevels { } } -impl From for Levels { +impl From for LevelsCommon +where + T: From, +{ #[expect(deprecated)] fn from(pb_levels: PbLevels) -> Self { Self { - l0: OverlappingLevel::from(pb_levels.l0.as_ref().unwrap()), - levels: pb_levels.levels.into_iter().map(Level::from).collect_vec(), + l0: OverlappingLevelCommon::from(pb_levels.l0.unwrap()), + levels: pb_levels + .levels + .into_iter() + .map(LevelCommon::from) + .collect_vec(), group_id: pb_levels.group_id, parent_group_id: pb_levels.parent_group_id, - member_table_ids: pb_levels.member_table_ids.clone(), + member_table_ids: pb_levels.member_table_ids, } } } -impl From for PbLevels { - fn from(levels: Levels) -> Self { +impl From> for PbLevels +where + PbSstableInfo: From, +{ + fn from(levels: LevelsCommon) -> Self { #[expect(deprecated)] Self { l0: Some(levels.l0.into()), levels: levels.levels.into_iter().map(PbLevel::from).collect_vec(), group_id: levels.group_id, parent_group_id: levels.parent_group_id, - member_table_ids: levels.member_table_ids.clone(), + member_table_ids: levels.member_table_ids, } } } diff --git a/src/storage/hummock_sdk/src/time_travel.rs b/src/storage/hummock_sdk/src/time_travel.rs index 380d75340df27..e828c94a4d781 100644 --- a/src/storage/hummock_sdk/src/time_travel.rs +++ b/src/storage/hummock_sdk/src/time_travel.rs @@ -13,87 +13,20 @@ // limitations under the License. use std::collections::{HashMap, HashSet}; -use std::sync::Arc; -use risingwave_common::catalog::TableId; -use risingwave_pb::hummock::hummock_version_delta::PbGroupDeltas; -use risingwave_pb::hummock::{PbHummockVersion, PbHummockVersionDelta, PbStateTableInfoDelta}; +use risingwave_pb::hummock::hummock_version::PbLevels; +use risingwave_pb::hummock::hummock_version_delta::{PbChangeLogDelta, PbGroupDeltas}; +use risingwave_pb::hummock::{PbEpochNewChangeLog, PbSstableInfo}; -use crate::change_log::{ChangeLogDelta, EpochNewChangeLog, TableChangeLog}; -use crate::level::{Level, Levels, OverlappingLevel}; +use crate::change_log::{TableChangeLog, TableChangeLogCommon}; +use crate::level::Level; use crate::sstable_info::SstableInfo; -use crate::table_watermark::TableWatermarks; use crate::version::{ - GroupDelta, GroupDeltas, HummockVersion, HummockVersionDelta, HummockVersionStateTableInfo, - IntraLevelDelta, + HummockVersion, HummockVersionCommon, HummockVersionDelta, HummockVersionDeltaCommon, }; -use crate::{CompactionGroupId, HummockSstableId, HummockVersionId}; +use crate::{CompactionGroupId, HummockSstableId}; -/// [`IncompleteHummockVersion`] is incomplete because `SSTableInfo` only has the `sst_id` set in the following fields: -/// - `PbLevels` -/// - `TableChangeLog` -#[derive(Debug, Clone, PartialEq)] -pub struct IncompleteHummockVersion { - pub id: HummockVersionId, - pub levels: HashMap, - max_committed_epoch: u64, - safe_epoch: u64, - pub table_watermarks: HashMap>, - pub table_change_log: HashMap, - pub state_table_info: HummockVersionStateTableInfo, -} - -/// Clone from an `SstableInfo`, but only set the `sst_id` for the target, leaving other fields as default. -/// The goal is to reduce the size of pb object generated afterward. 
-fn stripped_sstable_info(origin: &SstableInfo) -> SstableInfo { - SstableInfo { - object_id: Default::default(), - sst_id: origin.sst_id, - key_range: Default::default(), - file_size: Default::default(), - table_ids: Default::default(), - meta_offset: Default::default(), - stale_key_count: Default::default(), - total_key_count: Default::default(), - min_epoch: Default::default(), - max_epoch: Default::default(), - uncompressed_file_size: Default::default(), - range_tombstone_count: Default::default(), - bloom_filter_kind: Default::default(), - sst_size: Default::default(), - } -} - -fn stripped_epoch_new_change_log(origin: &EpochNewChangeLog) -> EpochNewChangeLog { - EpochNewChangeLog { - old_value: origin.old_value.iter().map(stripped_sstable_info).collect(), - new_value: origin.new_value.iter().map(stripped_sstable_info).collect(), - epochs: origin.epochs.clone(), - } -} - -fn stripped_change_log_delta(origin: &ChangeLogDelta) -> ChangeLogDelta { - ChangeLogDelta { - new_log: origin.new_log.as_ref().map(stripped_epoch_new_change_log), - truncate_epoch: origin.truncate_epoch, - } -} - -fn stripped_level(origin: &Level) -> Level { - Level { - level_idx: origin.level_idx, - level_type: origin.level_type, - table_infos: origin - .table_infos - .iter() - .map(stripped_sstable_info) - .collect(), - total_file_size: origin.total_file_size, - sub_level_id: origin.sub_level_id, - uncompressed_file_size: origin.uncompressed_file_size, - vnode_partition_count: origin.vnode_partition_count, - } -} +pub type IncompleteHummockVersion = HummockVersionCommon; pub fn refill_version( version: &mut HummockVersion, @@ -146,55 +79,6 @@ fn refill_sstable_info( .clone(); } -fn stripped_l0(origin: &OverlappingLevel) -> OverlappingLevel { - OverlappingLevel { - sub_levels: origin.sub_levels.iter().map(stripped_level).collect(), - total_file_size: origin.total_file_size, - uncompressed_file_size: origin.uncompressed_file_size, - } -} - -#[allow(deprecated)] -fn stripped_levels(origin: &Levels) -> Levels { - Levels { - levels: origin.levels.iter().map(stripped_level).collect(), - l0: stripped_l0(&origin.l0), - group_id: origin.group_id, - parent_group_id: origin.parent_group_id, - member_table_ids: Default::default(), - } -} - -fn stripped_intra_level_delta(origin: &IntraLevelDelta) -> IntraLevelDelta { - IntraLevelDelta { - level_idx: origin.level_idx, - l0_sub_level_id: origin.l0_sub_level_id, - removed_table_ids: origin.removed_table_ids.clone(), - inserted_table_infos: origin - .inserted_table_infos - .iter() - .map(stripped_sstable_info) - .collect(), - vnode_partition_count: origin.vnode_partition_count, - } -} - -fn stripped_group_delta(origin: &GroupDelta) -> GroupDelta { - match origin { - GroupDelta::IntraLevel(l) => GroupDelta::IntraLevel(stripped_intra_level_delta(l)), - _ => panic!("time travel expects DeltaType::IntraLevel only"), - } -} - -fn stripped_group_deltas(origin: &GroupDeltas) -> GroupDeltas { - let group_deltas = origin - .group_deltas - .iter() - .map(stripped_group_delta) - .collect(); - GroupDeltas { group_deltas } -} - /// `SStableInfo` will be stripped. 
impl From<(&HummockVersion, &HashSet)> for IncompleteHummockVersion { fn from(p: (&HummockVersion, &HashSet)) -> Self { @@ -206,7 +90,10 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV .iter() .filter_map(|(group_id, levels)| { if select_group.contains(group_id) { - Some((*group_id as CompactionGroupId, stripped_levels(levels))) + Some(( + *group_id as CompactionGroupId, + PbLevels::from(levels).into(), + )) } else { None } @@ -215,7 +102,7 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV max_committed_epoch: version.visible_table_committed_epoch(), safe_epoch: version.visible_table_safe_epoch(), table_watermarks: version.table_watermarks.clone(), - // TODO: optimization: strip table change log + // TODO: optimization: strip table change log based on select_group table_change_log: version .table_change_log .iter() @@ -223,9 +110,9 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV let incomplete_table_change_log = change_log .0 .iter() - .map(stripped_epoch_new_change_log) + .map(|e| PbEpochNewChangeLog::from(e).into()) .collect(); - (*table_id, TableChangeLog(incomplete_table_change_log)) + (*table_id, TableChangeLogCommon(incomplete_table_change_log)) }) .collect(), state_table_info: version.state_table_info.clone(), @@ -233,49 +120,10 @@ impl From<(&HummockVersion, &HashSet)> for IncompleteHummockV } } -impl IncompleteHummockVersion { - /// Resulted `SStableInfo` is incompelte. - pub fn to_protobuf(&self) -> PbHummockVersion { - PbHummockVersion { - id: self.id.0, - levels: self - .levels - .iter() - .map(|(group_id, levels)| (*group_id as _, levels.to_protobuf())) - .collect(), - max_committed_epoch: self.max_committed_epoch, - safe_epoch: self.safe_epoch, - table_watermarks: self - .table_watermarks - .iter() - .map(|(table_id, watermark)| (table_id.table_id, watermark.to_protobuf())) - .collect(), - table_change_logs: self - .table_change_log - .iter() - .map(|(table_id, change_log)| (table_id.table_id, change_log.to_protobuf())) - .collect(), - state_table_info: self.state_table_info.to_protobuf(), - } - } -} - /// [`IncompleteHummockVersionDelta`] is incomplete because `SSTableInfo` only has the `sst_id` set in the following fields: /// - `PbGroupDeltas` /// - `ChangeLogDelta` -#[derive(Debug, PartialEq, Clone)] -pub struct IncompleteHummockVersionDelta { - pub id: HummockVersionId, - pub prev_id: HummockVersionId, - pub group_deltas: HashMap, - pub max_committed_epoch: u64, - pub safe_epoch: u64, - pub trivial_move: bool, - pub new_table_watermarks: HashMap, - pub removed_table_ids: HashSet, - pub change_log_delta: HashMap, - pub state_table_info_delta: HashMap, -} +pub type IncompleteHummockVersionDelta = HummockVersionDeltaCommon; /// `SStableInfo` will be stripped. 
impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHummockVersionDelta { @@ -289,7 +137,7 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHum .iter() .filter_map(|(cg_id, deltas)| { if select_group.contains(cg_id) { - Some((*cg_id, stripped_group_deltas(deltas).to_protobuf())) + Some((*cg_id, PbGroupDeltas::from(deltas).into())) } else { None } @@ -300,47 +148,42 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHum trivial_move: delta.trivial_move, new_table_watermarks: delta.new_table_watermarks.clone(), removed_table_ids: delta.removed_table_ids.clone(), - // TODO: optimization: strip table change log + // TODO: optimization: strip table change log based on select_group change_log_delta: delta .change_log_delta .iter() - .map(|(table_id, log_delta)| (*table_id, stripped_change_log_delta(log_delta))) + .map(|(table_id, log_delta)| (*table_id, PbChangeLogDelta::from(log_delta).into())) .collect(), state_table_info_delta: delta.state_table_info_delta.clone(), } } } -impl IncompleteHummockVersionDelta { - /// Resulted `SStableInfo` is incompelte. - pub fn to_protobuf(&self) -> PbHummockVersionDelta { - PbHummockVersionDelta { - id: self.id.0, - prev_id: self.prev_id.0, - group_deltas: self.group_deltas.clone(), - max_committed_epoch: self.max_committed_epoch, - safe_epoch: self.safe_epoch, - trivial_move: self.trivial_move, - new_table_watermarks: self - .new_table_watermarks - .iter() - .map(|(table_id, watermarks)| (table_id.table_id, watermarks.to_protobuf())) - .collect(), - removed_table_ids: self - .removed_table_ids - .iter() - .map(|table_id| table_id.table_id) - .collect(), - change_log_delta: self - .change_log_delta - .iter() - .map(|(table_id, log_delta)| (table_id.table_id, log_delta.into())) - .collect(), - state_table_info_delta: self - .state_table_info_delta - .iter() - .map(|(table_id, delta)| (table_id.table_id, *delta)) - .collect(), +pub struct SstableIdInVersion(HummockSstableId); + +impl From<&SstableIdInVersion> for PbSstableInfo { + fn from(sst_id: &SstableIdInVersion) -> Self { + Self { + sst_id: sst_id.0, + ..Default::default() } } } + +impl From for PbSstableInfo { + fn from(sst_id: SstableIdInVersion) -> Self { + (&sst_id).into() + } +} + +impl From<&PbSstableInfo> for SstableIdInVersion { + fn from(value: &PbSstableInfo) -> Self { + SstableIdInVersion(value.sst_id) + } +} + +impl From for SstableIdInVersion { + fn from(value: PbSstableInfo) -> Self { + (&value).into() + } +} diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index 1c8cfd1e310b4..4aecfcde0cf48 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -26,14 +26,14 @@ use risingwave_pb::hummock::hummock_version_delta::PbGroupDeltas; use risingwave_pb::hummock::{ CompactionConfig, PbGroupConstruct, PbGroupDelta, PbGroupDestroy, PbGroupMerge, PbGroupMetaChange, PbGroupTableChange, PbHummockVersion, PbHummockVersionDelta, - PbIntraLevelDelta, PbStateTableInfo, StateTableInfo, StateTableInfoDelta, + PbIntraLevelDelta, PbSstableInfo, PbStateTableInfo, StateTableInfo, StateTableInfoDelta, }; use tracing::warn; -use crate::change_log::{ChangeLogDelta, TableChangeLog}; +use crate::change_log::{ChangeLogDeltaCommon, TableChangeLogCommon}; use crate::compaction_group::hummock_version_ext::build_initial_compaction_group_levels; use crate::compaction_group::StaticCompactionGroupId; -use crate::level::Levels; +use crate::level::LevelsCommon; use crate::sstable_info::SstableInfo; use 
crate::table_watermark::TableWatermarks; use crate::{CompactionGroupId, HummockSstableObjectId, HummockVersionId, FIRST_VERSION_ID}; @@ -209,33 +209,39 @@ impl HummockVersionStateTableInfo { } #[derive(Debug, Clone, PartialEq)] -pub struct HummockVersion { +pub struct HummockVersionCommon { pub id: HummockVersionId, - pub levels: HashMap, - max_committed_epoch: u64, - safe_epoch: u64, + pub levels: HashMap>, + pub(crate) max_committed_epoch: u64, + pub(crate) safe_epoch: u64, pub table_watermarks: HashMap>, - pub table_change_log: HashMap, + pub table_change_log: HashMap>, pub state_table_info: HummockVersionStateTableInfo, } +pub type HummockVersion = HummockVersionCommon; + impl Default for HummockVersion { fn default() -> Self { HummockVersion::from(&PbHummockVersion::default()) } } -impl HummockVersion { +impl HummockVersionCommon +where + T: for<'a> From<&'a PbSstableInfo>, + PbSstableInfo: for<'a> From<&'a T>, +{ /// Convert the `PbHummockVersion` received from rpc to `HummockVersion`. No need to /// maintain backward compatibility. pub fn from_rpc_protobuf(pb_version: &PbHummockVersion) -> Self { - HummockVersion::from(pb_version) + pb_version.into() } /// Convert the `PbHummockVersion` deserialized from persisted state to `HummockVersion`. /// We should maintain backward compatibility. pub fn from_persisted_protobuf(pb_version: &PbHummockVersion) -> Self { - HummockVersion::from(pb_version) + pb_version.into() } pub fn to_protobuf(&self) -> PbHummockVersion { @@ -260,14 +266,19 @@ impl HummockVersion { } } -impl From<&PbHummockVersion> for HummockVersion { +impl From<&PbHummockVersion> for HummockVersionCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_version: &PbHummockVersion) -> Self { Self { id: HummockVersionId(pb_version.id), levels: pb_version .levels .iter() - .map(|(group_id, levels)| (*group_id as CompactionGroupId, Levels::from(levels))) + .map(|(group_id, levels)| { + (*group_id as CompactionGroupId, LevelsCommon::from(levels)) + }) .collect(), max_committed_epoch: pb_version.max_committed_epoch, safe_epoch: pb_version.safe_epoch, @@ -287,7 +298,7 @@ impl From<&PbHummockVersion> for HummockVersion { .map(|(table_id, change_log)| { ( TableId::new(*table_id), - TableChangeLog::from_protobuf(change_log), + TableChangeLogCommon::from_protobuf(change_log), ) }) .collect(), @@ -298,8 +309,11 @@ impl From<&PbHummockVersion> for HummockVersion { } } -impl From<&HummockVersion> for PbHummockVersion { - fn from(version: &HummockVersion) -> Self { +impl From<&HummockVersionCommon> for PbHummockVersion +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(version: &HummockVersionCommon) -> Self { Self { id: version.id.0, levels: version @@ -324,8 +338,12 @@ impl From<&HummockVersion> for PbHummockVersion { } } -impl From for PbHummockVersion { - fn from(version: HummockVersion) -> Self { +impl From> for PbHummockVersion +where + PbSstableInfo: From, + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(version: HummockVersionCommon) -> Self { Self { id: version.id.0, levels: version @@ -453,36 +471,42 @@ impl HummockVersion { } #[derive(Debug, PartialEq, Clone)] -pub struct HummockVersionDelta { +pub struct HummockVersionDeltaCommon { pub id: HummockVersionId, pub prev_id: HummockVersionId, - pub group_deltas: HashMap, - max_committed_epoch: u64, - safe_epoch: u64, + pub group_deltas: HashMap>, + pub(crate) max_committed_epoch: u64, + pub(crate) safe_epoch: u64, pub trivial_move: bool, pub new_table_watermarks: HashMap, pub removed_table_ids: HashSet, - pub 
change_log_delta: HashMap, + pub change_log_delta: HashMap>, pub state_table_info_delta: HashMap, } +pub type HummockVersionDelta = HummockVersionDeltaCommon; + impl Default for HummockVersionDelta { fn default() -> Self { HummockVersionDelta::from(&PbHummockVersionDelta::default()) } } -impl HummockVersionDelta { +impl HummockVersionDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, + PbSstableInfo: for<'a> From<&'a T>, +{ /// Convert the `PbHummockVersionDelta` deserialized from persisted state to `HummockVersionDelta`. /// We should maintain backward compatibility. pub fn from_persisted_protobuf(delta: &PbHummockVersionDelta) -> Self { - Self::from(delta) + delta.into() } /// Convert the `PbHummockVersionDelta` received from rpc to `HummockVersionDelta`. No need to /// maintain backward compatibility. pub fn from_rpc_protobuf(delta: &PbHummockVersionDelta) -> Self { - Self::from(delta) + delta.into() } pub fn to_protobuf(&self) -> PbHummockVersionDelta { @@ -592,7 +616,10 @@ impl HummockVersionDelta { } } -impl From<&PbHummockVersionDelta> for HummockVersionDelta { +impl From<&PbHummockVersionDelta> for HummockVersionDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_version_delta: &PbHummockVersionDelta) -> Self { Self { id: HummockVersionId(pb_version_delta.id), @@ -601,7 +628,10 @@ impl From<&PbHummockVersionDelta> for HummockVersionDelta { .group_deltas .iter() .map(|(group_id, deltas)| { - (*group_id as CompactionGroupId, GroupDeltas::from(deltas)) + ( + *group_id as CompactionGroupId, + GroupDeltasCommon::from(deltas), + ) }) .collect(), max_committed_epoch: pb_version_delta.max_committed_epoch, @@ -625,8 +655,8 @@ impl From<&PbHummockVersionDelta> for HummockVersionDelta { .map(|(table_id, log_delta)| { ( TableId::new(*table_id), - ChangeLogDelta { - new_log: log_delta.new_log.clone().map(Into::into), + ChangeLogDeltaCommon { + new_log: log_delta.new_log.as_ref().map(Into::into), truncate_epoch: log_delta.truncate_epoch, }, ) @@ -642,8 +672,11 @@ impl From<&PbHummockVersionDelta> for HummockVersionDelta { } } -impl From<&HummockVersionDelta> for PbHummockVersionDelta { - fn from(version_delta: &HummockVersionDelta) -> Self { +impl From<&HummockVersionDeltaCommon> for PbHummockVersionDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(version_delta: &HummockVersionDeltaCommon) -> Self { Self { id: version_delta.id.0, prev_id: version_delta.prev_id.0, @@ -679,8 +712,11 @@ impl From<&HummockVersionDelta> for PbHummockVersionDelta { } } -impl From for PbHummockVersionDelta { - fn from(version_delta: HummockVersionDelta) -> Self { +impl From> for PbHummockVersionDelta +where + PbSstableInfo: From, +{ + fn from(version_delta: HummockVersionDeltaCommon) -> Self { Self { id: version_delta.id.0, prev_id: version_delta.prev_id.0, @@ -716,7 +752,10 @@ impl From for PbHummockVersionDelta { } } -impl From for HummockVersionDelta { +impl From for HummockVersionDeltaCommon +where + T: From, +{ fn from(pb_version_delta: PbHummockVersionDelta) -> Self { Self { id: HummockVersionId(pb_version_delta.id), @@ -745,7 +784,7 @@ impl From for HummockVersionDelta { .map(|(table_id, log_delta)| { ( TableId::new(*table_id), - ChangeLogDelta { + ChangeLogDeltaCommon { new_log: log_delta.new_log.clone().map(Into::into), truncate_epoch: log_delta.truncate_epoch, }, @@ -762,14 +801,16 @@ impl From for HummockVersionDelta { } #[derive(Debug, PartialEq, Clone)] -pub struct IntraLevelDelta { +pub struct IntraLevelDeltaCommon { pub level_idx: u32, pub l0_sub_level_id: u64, pub 
removed_table_ids: Vec, - pub inserted_table_infos: Vec, + pub inserted_table_infos: Vec, pub vnode_partition_count: u32, } +pub type IntraLevelDelta = IntraLevelDeltaCommon; + impl IntraLevelDelta { pub fn estimated_encode_len(&self) -> usize { size_of::() @@ -784,40 +825,49 @@ impl IntraLevelDelta { } } -impl From for IntraLevelDelta { +impl From for IntraLevelDeltaCommon +where + T: From, +{ fn from(pb_intra_level_delta: PbIntraLevelDelta) -> Self { Self { level_idx: pb_intra_level_delta.level_idx, l0_sub_level_id: pb_intra_level_delta.l0_sub_level_id, - removed_table_ids: pb_intra_level_delta.removed_table_ids.clone(), + removed_table_ids: pb_intra_level_delta.removed_table_ids, inserted_table_infos: pb_intra_level_delta .inserted_table_infos .into_iter() - .map(SstableInfo::from) + .map(Into::into) .collect_vec(), vnode_partition_count: pb_intra_level_delta.vnode_partition_count, } } } -impl From for PbIntraLevelDelta { - fn from(intra_level_delta: IntraLevelDelta) -> Self { +impl From> for PbIntraLevelDelta +where + PbSstableInfo: From, +{ + fn from(intra_level_delta: IntraLevelDeltaCommon) -> Self { Self { level_idx: intra_level_delta.level_idx, l0_sub_level_id: intra_level_delta.l0_sub_level_id, - removed_table_ids: intra_level_delta.removed_table_ids.clone(), + removed_table_ids: intra_level_delta.removed_table_ids, inserted_table_infos: intra_level_delta .inserted_table_infos .into_iter() - .map(|sst| sst.into()) + .map(Into::into) .collect_vec(), vnode_partition_count: intra_level_delta.vnode_partition_count, } } } -impl From<&IntraLevelDelta> for PbIntraLevelDelta { - fn from(intra_level_delta: &IntraLevelDelta) -> Self { +impl From<&IntraLevelDeltaCommon> for PbIntraLevelDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(intra_level_delta: &IntraLevelDeltaCommon) -> Self { Self { level_idx: intra_level_delta.level_idx, l0_sub_level_id: intra_level_delta.l0_sub_level_id, @@ -825,14 +875,17 @@ impl From<&IntraLevelDelta> for PbIntraLevelDelta { inserted_table_infos: intra_level_delta .inserted_table_infos .iter() - .map(|sst| sst.into()) + .map(Into::into) .collect_vec(), vnode_partition_count: intra_level_delta.vnode_partition_count, } } } -impl From<&PbIntraLevelDelta> for IntraLevelDelta { +impl From<&PbIntraLevelDelta> for IntraLevelDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_intra_level_delta: &PbIntraLevelDelta) -> Self { Self { level_idx: pb_intra_level_delta.level_idx, @@ -841,7 +894,7 @@ impl From<&PbIntraLevelDelta> for IntraLevelDelta { inserted_table_infos: pb_intra_level_delta .inserted_table_infos .iter() - .map(SstableInfo::from) + .map(Into::into) .collect_vec(), vnode_partition_count: pb_intra_level_delta.vnode_partition_count, } @@ -867,8 +920,8 @@ impl IntraLevelDelta { } #[derive(Debug, PartialEq, Clone)] -pub enum GroupDelta { - IntraLevel(IntraLevelDelta), +pub enum GroupDeltaCommon { + IntraLevel(IntraLevelDeltaCommon), GroupConstruct(PbGroupConstruct), GroupDestroy(PbGroupDestroy), GroupMetaChange(PbGroupMetaChange), @@ -879,100 +932,116 @@ pub enum GroupDelta { GroupMerge(PbGroupMerge), } -impl From for GroupDelta { +pub type GroupDelta = GroupDeltaCommon; + +impl From for GroupDeltaCommon +where + T: From, +{ fn from(pb_group_delta: PbGroupDelta) -> Self { match pb_group_delta.delta_type { Some(PbDeltaType::IntraLevel(pb_intra_level_delta)) => { - GroupDelta::IntraLevel(IntraLevelDelta::from(pb_intra_level_delta)) + GroupDeltaCommon::IntraLevel(IntraLevelDeltaCommon::from(pb_intra_level_delta)) } 
Some(PbDeltaType::GroupConstruct(pb_group_construct)) => { - GroupDelta::GroupConstruct(pb_group_construct) + GroupDeltaCommon::GroupConstruct(pb_group_construct) } Some(PbDeltaType::GroupDestroy(pb_group_destroy)) => { - GroupDelta::GroupDestroy(pb_group_destroy) + GroupDeltaCommon::GroupDestroy(pb_group_destroy) } Some(PbDeltaType::GroupMetaChange(pb_group_meta_change)) => { - GroupDelta::GroupMetaChange(pb_group_meta_change) + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change) } Some(PbDeltaType::GroupTableChange(pb_group_table_change)) => { - GroupDelta::GroupTableChange(pb_group_table_change) + GroupDeltaCommon::GroupTableChange(pb_group_table_change) + } + Some(PbDeltaType::GroupMerge(pb_group_merge)) => { + GroupDeltaCommon::GroupMerge(pb_group_merge) } - Some(PbDeltaType::GroupMerge(pb_group_merge)) => GroupDelta::GroupMerge(pb_group_merge), None => panic!("delta_type is not set"), } } } -impl From for PbGroupDelta { - fn from(group_delta: GroupDelta) -> Self { +impl From> for PbGroupDelta +where + PbSstableInfo: From, +{ + fn from(group_delta: GroupDeltaCommon) -> Self { match group_delta { - GroupDelta::IntraLevel(intra_level_delta) => PbGroupDelta { + GroupDeltaCommon::IntraLevel(intra_level_delta) => PbGroupDelta { delta_type: Some(PbDeltaType::IntraLevel(intra_level_delta.into())), }, - GroupDelta::GroupConstruct(pb_group_construct) => PbGroupDelta { + GroupDeltaCommon::GroupConstruct(pb_group_construct) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupConstruct(pb_group_construct)), }, - GroupDelta::GroupDestroy(pb_group_destroy) => PbGroupDelta { + GroupDeltaCommon::GroupDestroy(pb_group_destroy) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupDestroy(pb_group_destroy)), }, - GroupDelta::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupMetaChange(pb_group_meta_change)), }, - GroupDelta::GroupTableChange(pb_group_table_change) => PbGroupDelta { + GroupDeltaCommon::GroupTableChange(pb_group_table_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupTableChange(pb_group_table_change)), }, - GroupDelta::GroupMerge(pb_group_merge) => PbGroupDelta { + GroupDeltaCommon::GroupMerge(pb_group_merge) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupMerge(pb_group_merge)), }, } } } -impl From<&GroupDelta> for PbGroupDelta { - fn from(group_delta: &GroupDelta) -> Self { +impl From<&GroupDeltaCommon> for PbGroupDelta +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(group_delta: &GroupDeltaCommon) -> Self { match group_delta { - GroupDelta::IntraLevel(intra_level_delta) => PbGroupDelta { + GroupDeltaCommon::IntraLevel(intra_level_delta) => PbGroupDelta { delta_type: Some(PbDeltaType::IntraLevel(intra_level_delta.into())), }, - GroupDelta::GroupConstruct(pb_group_construct) => PbGroupDelta { + GroupDeltaCommon::GroupConstruct(pb_group_construct) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupConstruct(pb_group_construct.clone())), }, - GroupDelta::GroupDestroy(pb_group_destroy) => PbGroupDelta { + GroupDeltaCommon::GroupDestroy(pb_group_destroy) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupDestroy(*pb_group_destroy)), }, - GroupDelta::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupMetaChange(pb_group_meta_change.clone())), }, - GroupDelta::GroupTableChange(pb_group_table_change) => PbGroupDelta { + 
GroupDeltaCommon::GroupTableChange(pb_group_table_change) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupTableChange(pb_group_table_change.clone())), }, - GroupDelta::GroupMerge(pb_group_merge) => PbGroupDelta { + GroupDeltaCommon::GroupMerge(pb_group_merge) => PbGroupDelta { delta_type: Some(PbDeltaType::GroupMerge(*pb_group_merge)), }, } } } -impl From<&PbGroupDelta> for GroupDelta { +impl From<&PbGroupDelta> for GroupDeltaCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_group_delta: &PbGroupDelta) -> Self { match &pb_group_delta.delta_type { Some(PbDeltaType::IntraLevel(pb_intra_level_delta)) => { - GroupDelta::IntraLevel(IntraLevelDelta::from(pb_intra_level_delta)) + GroupDeltaCommon::IntraLevel(IntraLevelDeltaCommon::from(pb_intra_level_delta)) } Some(PbDeltaType::GroupConstruct(pb_group_construct)) => { - GroupDelta::GroupConstruct(pb_group_construct.clone()) + GroupDeltaCommon::GroupConstruct(pb_group_construct.clone()) } Some(PbDeltaType::GroupDestroy(pb_group_destroy)) => { - GroupDelta::GroupDestroy(*pb_group_destroy) + GroupDeltaCommon::GroupDestroy(*pb_group_destroy) } Some(PbDeltaType::GroupMetaChange(pb_group_meta_change)) => { - GroupDelta::GroupMetaChange(pb_group_meta_change.clone()) + GroupDeltaCommon::GroupMetaChange(pb_group_meta_change.clone()) } Some(PbDeltaType::GroupTableChange(pb_group_table_change)) => { - GroupDelta::GroupTableChange(pb_group_table_change.clone()) + GroupDeltaCommon::GroupTableChange(pb_group_table_change.clone()) } Some(PbDeltaType::GroupMerge(pb_group_merge)) => { - GroupDelta::GroupMerge(*pb_group_merge) + GroupDeltaCommon::GroupMerge(*pb_group_merge) } None => panic!("delta_type is not set"), } @@ -980,24 +1049,32 @@ impl From<&PbGroupDelta> for GroupDelta { } #[derive(Debug, PartialEq, Clone, Default)] -pub struct GroupDeltas { - pub group_deltas: Vec, +pub struct GroupDeltasCommon { + pub group_deltas: Vec>, } -impl From for GroupDeltas { +pub type GroupDeltas = GroupDeltasCommon; + +impl From for GroupDeltasCommon +where + T: From, +{ fn from(pb_group_deltas: PbGroupDeltas) -> Self { Self { group_deltas: pb_group_deltas .group_deltas .into_iter() - .map(GroupDelta::from) + .map(GroupDeltaCommon::from) .collect_vec(), } } } -impl From for PbGroupDeltas { - fn from(group_deltas: GroupDeltas) -> Self { +impl From> for PbGroupDeltas +where + PbSstableInfo: From, +{ + fn from(group_deltas: GroupDeltasCommon) -> Self { Self { group_deltas: group_deltas .group_deltas @@ -1008,8 +1085,11 @@ impl From for PbGroupDeltas { } } -impl From<&GroupDeltas> for PbGroupDeltas { - fn from(group_deltas: &GroupDeltas) -> Self { +impl From<&GroupDeltasCommon> for PbGroupDeltas +where + PbSstableInfo: for<'a> From<&'a T>, +{ + fn from(group_deltas: &GroupDeltasCommon) -> Self { Self { group_deltas: group_deltas .group_deltas @@ -1020,19 +1100,25 @@ impl From<&GroupDeltas> for PbGroupDeltas { } } -impl From<&PbGroupDeltas> for GroupDeltas { +impl From<&PbGroupDeltas> for GroupDeltasCommon +where + T: for<'a> From<&'a PbSstableInfo>, +{ fn from(pb_group_deltas: &PbGroupDeltas) -> Self { Self { group_deltas: pb_group_deltas .group_deltas .iter() - .map(GroupDelta::from) + .map(GroupDeltaCommon::from) .collect_vec(), } } } -impl GroupDeltas { +impl GroupDeltasCommon +where + PbSstableInfo: for<'a> From<&'a T>, +{ pub fn to_protobuf(&self) -> PbGroupDeltas { self.into() } diff --git a/src/storage/hummock_test/benches/bench_hummock_iter.rs b/src/storage/hummock_test/benches/bench_hummock_iter.rs index 9f0cef22e4e43..f85367e408f10 100644 --- 
a/src/storage/hummock_test/benches/bench_hummock_iter.rs +++ b/src/storage/hummock_test/benches/bench_hummock_iter.rs @@ -107,7 +107,6 @@ fn criterion_benchmark(c: &mut Criterion) { (Unbounded, Unbounded), epoch, ReadOptions { - ignore_range_tombstone: true, prefetch_options: PrefetchOptions::default(), cache_policy: CachePolicy::Fill(CacheContext::Default), ..Default::default() diff --git a/src/storage/hummock_test/src/compactor_tests.rs b/src/storage/hummock_test/src/compactor_tests.rs index 92856fb5022c6..9be54ec045840 100644 --- a/src/storage/hummock_test/src/compactor_tests.rs +++ b/src/storage/hummock_test/src/compactor_tests.rs @@ -49,7 +49,8 @@ pub(crate) mod tests { default_compaction_selector, ManualCompactionOption, }; use risingwave_meta::hummock::test_utils::{ - register_table_ids_to_compaction_group, setup_compute_env, setup_compute_env_with_config, + get_compaction_group_id_by_table_id, register_table_ids_to_compaction_group, + setup_compute_env, setup_compute_env_with_config, unregister_table_ids_from_compaction_group, }; use risingwave_meta::hummock::{HummockManagerRef, MockHummockMetaClient}; @@ -162,6 +163,8 @@ pub(crate) mod tests { let mut local = storage .new_local(NewLocalOptions::for_test(TableId::default())) .await; + let table_id = local.table_id(); + let table_id_set = HashSet::from_iter([table_id]); // 1. add sstables let val = b"0"[..].repeat(value_size); local.init_for_test(epochs[0]).await.unwrap(); @@ -188,8 +191,14 @@ pub(crate) mod tests { } else { local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); } - let res = storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); + let res = storage + .seal_and_sync_epoch(epoch, table_id_set.clone()) + .await + .unwrap(); + hummock_meta_client + .commit_epoch(epoch, res, false) + .await + .unwrap(); } } @@ -228,6 +237,7 @@ pub(crate) mod tests { )); // 1. add sstables + let table_id = 0; let mut key = BytesMut::default(); key.put_u16(0); key.put_slice(b"same_key"); @@ -235,7 +245,7 @@ pub(crate) mod tests { hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - &[0], + &[table_id], ) .await; let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() @@ -283,12 +293,12 @@ pub(crate) mod tests { .collect_vec(), ) .await; + + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), table_id).await; // 2. get compact task while let Some(mut compact_task) = hummock_manager_ref - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap() { @@ -333,8 +343,7 @@ pub(crate) mod tests { // 4. get the latest version and check let version = hummock_manager_ref.get_current_version().await; - let group = - version.get_compaction_group_levels(StaticCompactionGroupId::StateDefault.into()); + let group = version.get_compaction_group_levels(compaction_group_id); // base level let output_tables = group @@ -401,11 +410,12 @@ pub(crate) mod tests { worker_node.id, )); + let table_id = 0; let storage = get_hummock_storage( hummock_meta_client.clone(), get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node.clone()), &hummock_manager_ref, - &[0], + &[table_id], ) .await; @@ -447,12 +457,10 @@ pub(crate) mod tests { // 2. get compact task - // 3. 
compact + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), table_id).await; while let Some(compact_task) = hummock_manager_ref - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap() { @@ -481,7 +489,7 @@ pub(crate) mod tests { // 4. get the latest version and check let version = hummock_manager_ref.get_current_version().await; let output_tables = version - .get_compaction_group_levels(StaticCompactionGroupId::StateDefault.into()) + .get_compaction_group_levels(compaction_group_id) .levels .iter() .flat_map(|level| level.table_infos.clone()) @@ -523,9 +531,16 @@ pub(crate) mod tests { hummock_meta_client: &Arc, storage: &HummockStorage, epoch: u64, + table_id: TableId, ) { - let res = storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); + let res = storage + .seal_and_sync_epoch(epoch, HashSet::from([table_id])) + .await + .unwrap(); + hummock_meta_client + .commit_epoch(epoch, res, false) + .await + .unwrap(); } async fn prepare_data( @@ -538,8 +553,9 @@ pub(crate) mod tests { let kv_count: u16 = 128; let mut epoch = test_epoch(1); let mut local = storage.new_local(NewLocalOptions::for_test(table_id)).await; + let table_id_set = HashSet::from_iter([table_id]); - storage.start_epoch(epoch, HashSet::from_iter([table_id])); + storage.start_epoch(epoch, table_id_set); // 1. add sstables let val = Bytes::from(b"0"[..].repeat(1 << 10)); // 1024 Byte value @@ -561,7 +577,7 @@ pub(crate) mod tests { storage.start_epoch(next_epoch, HashSet::from_iter([table_id])); local.seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); - flush_and_commit(&hummock_meta_client, storage, epoch).await; + flush_and_commit(&hummock_meta_client, storage, epoch, table_id).await; epoch.inc_epoch(); } } @@ -615,6 +631,10 @@ pub(crate) mod tests { ) .await; + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), existing_table_id) + .await; + // Mimic dropping table unregister_table_ids_from_compaction_group(&hummock_manager_ref, &[existing_table_id]) .await; @@ -625,34 +645,19 @@ pub(crate) mod tests { }; // 2. get compact task and there should be none let compact_task = hummock_manager_ref - .manual_get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - manual_compcation_option, - ) + .manual_get_compact_task(compaction_group_id, manual_compcation_option) .await .unwrap(); assert!(compact_task.is_none()); - // 3. get the latest version and check - let version = hummock_manager_ref.get_current_version().await; - let output_level_info = version - .get_compaction_group_levels(StaticCompactionGroupId::StateDefault.into()) - .levels - .last() - .unwrap(); - assert_eq!(0, output_level_info.total_file_size); - - // 5. 
get compact task - let compact_task = hummock_manager_ref - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) - .await - .unwrap(); + let current_version = hummock_manager_ref.get_current_version().await; + assert!(current_version + .get_sst_ids_by_group_id(compaction_group_id) + .collect_vec() + .is_empty()); - assert!(compact_task.is_none()); + // assert_eq!(0, current_version.num_levels(compaction_group_id)); } #[tokio::test] @@ -678,6 +683,10 @@ pub(crate) mod tests { .new_local(NewLocalOptions::for_test(TableId::from(2))) .await; + let table_id_1 = storage_1.table_id(); + let table_id_2 = storage_2.table_id(); + let table_id_set = HashSet::from_iter([table_id_1, table_id_2]); + let rpc_filter_key_extractor_manager = match global_storage.filter_key_extractor_manager().clone() { FilterKeyExtractorManager::RpcFilterKeyExtractorManager( @@ -687,12 +696,12 @@ pub(crate) mod tests { }; rpc_filter_key_extractor_manager.update( - 1, + table_id_1.table_id(), Arc::new(FilterKeyExtractorImpl::FullKey(FullKeyFilterKeyExtractor)), ); rpc_filter_key_extractor_manager.update( - 2, + table_id_2.table_id(), Arc::new(FilterKeyExtractorImpl::FullKey(FullKeyFilterKeyExtractor)), ); let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( @@ -712,13 +721,13 @@ pub(crate) mod tests { // 1. add sstables let val = Bytes::from(b"0"[..].repeat(1 << 10)); // 1024 Byte value - let drop_table_id = 1; - let existing_table_ids = 2; + let drop_table_id = table_id_1.table_id(); + let existing_table_id = table_id_2.table_id(); let kv_count: usize = 128; let mut epoch = test_epoch(1); register_table_ids_to_compaction_group( &hummock_manager_ref, - &[drop_table_id, existing_table_ids], + &[drop_table_id, existing_table_id], StaticCompactionGroupId::StateDefault.into(), ) .await; @@ -728,10 +737,10 @@ pub(crate) mod tests { .await; let vnode = VirtualNode::from_index(1); - global_storage.start_epoch(epoch, HashSet::from_iter([1.into(), 2.into()])); + global_storage.start_epoch(epoch, table_id_set.clone()); for index in 0..kv_count { let next_epoch = epoch.next_epoch(); - global_storage.start_epoch(next_epoch, HashSet::from_iter([1.into(), 2.into()])); + global_storage.start_epoch(next_epoch, table_id_set.clone()); if index == 0 { storage_1.init_for_test(epoch).await.unwrap(); storage_2.init_for_test(epoch).await.unwrap(); @@ -755,8 +764,14 @@ pub(crate) mod tests { storage.seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); other.seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); - let res = global_storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); + let res = global_storage + .seal_and_sync_epoch(epoch, table_id_set.clone()) + .await + .unwrap(); + hummock_meta_client + .commit_epoch(epoch, res, false) + .await + .unwrap(); epoch.inc_epoch(); } @@ -767,12 +782,12 @@ pub(crate) mod tests { level: 0, ..Default::default() }; + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), existing_table_id) + .await; // 2. get compact task let mut compact_task = hummock_manager_ref - .manual_get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - manual_compcation_option, - ) + .manual_get_compact_task(compaction_group_id, manual_compcation_option) .await .unwrap() .unwrap(); @@ -813,7 +828,7 @@ pub(crate) mod tests { // 5. 
get the latest version and check let version: HummockVersion = hummock_manager_ref.get_current_version().await; let mut tables_from_version = vec![]; - version.level_iter(StaticCompactionGroupId::StateDefault.into(), |level| { + version.level_iter(compaction_group_id, |level| { tables_from_version.extend(level.table_infos.iter().cloned()); true }); @@ -832,10 +847,7 @@ pub(crate) mod tests { // 6. get compact task and there should be none let compact_task = hummock_manager_ref - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap(); assert!(compact_task.is_none()); @@ -853,7 +865,7 @@ pub(crate) mod tests { epoch, None, ReadOptions { - table_id: TableId::from(existing_table_ids), + table_id: TableId::from(existing_table_id), prefetch_options: PrefetchOptions::default(), cache_policy: CachePolicy::Fill(CacheContext::Default), ..Default::default() @@ -864,7 +876,7 @@ pub(crate) mod tests { let mut scan_count = 0; for (k, _) in scan_result { let table_id = k.user_key.table_id.table_id(); - assert_eq!(table_id, existing_table_ids); + assert_eq!(table_id, existing_table_id); scan_count += 1; } assert_eq!(key_count, scan_count); @@ -905,7 +917,7 @@ pub(crate) mod tests { .sstable_id_remote_fetch_number, )); rpc_filter_key_extractor_manager.update( - 2, + existing_table_id, Arc::new(FilterKeyExtractorImpl::FullKey(FullKeyFilterKeyExtractor)), ); let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( @@ -922,14 +934,15 @@ pub(crate) mod tests { let vnode = VirtualNode::from_index(1); let mut epoch_set = BTreeSet::new(); - storage.start_epoch(epoch, HashSet::from_iter([existing_table_id.into()])); + let table_id_set = HashSet::from_iter([existing_table_id.into()]); + storage.start_epoch(epoch, table_id_set.clone()); let mut local = storage .new_local(NewLocalOptions::for_test(existing_table_id.into())) .await; for i in 0..kv_count { let next_epoch = epoch + millisec_interval_epoch; - storage.start_epoch(next_epoch, HashSet::from_iter([existing_table_id.into()])); + storage.start_epoch(next_epoch, table_id_set.clone()); if i == 0 { local.init_for_test(epoch).await.unwrap(); } @@ -945,8 +958,14 @@ pub(crate) mod tests { local.flush().await.unwrap(); local.seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); - let res = storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); + let res = storage + .seal_and_sync_epoch(epoch, table_id_set.clone()) + .await + .unwrap(); + hummock_meta_client + .commit_epoch(epoch, res, false) + .await + .unwrap(); epoch += millisec_interval_epoch; } @@ -954,12 +973,12 @@ pub(crate) mod tests { level: 0, ..Default::default() }; + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), existing_table_id) + .await; // 2. get compact task let mut compact_task = hummock_manager_ref - .manual_get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - manual_compcation_option, - ) + .manual_get_compact_task(compaction_group_id, manual_compcation_option) .await .unwrap() .unwrap(); @@ -1012,7 +1031,7 @@ pub(crate) mod tests { // 4. 
get the latest version and check let version: HummockVersion = hummock_manager_ref.get_current_version().await; let mut tables_from_version = vec![]; - version.level_iter(StaticCompactionGroupId::StateDefault.into(), |level| { + version.level_iter(compaction_group_id, |level| { tables_from_version.extend(level.table_infos.iter().cloned()); true }); @@ -1032,10 +1051,7 @@ pub(crate) mod tests { // 5. get compact task and there should be none let compact_task = hummock_manager_ref - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap(); assert!(compact_task.is_none()); @@ -1130,13 +1146,14 @@ pub(crate) mod tests { let mut local = storage .new_local(NewLocalOptions::for_test(existing_table_id.into())) .await; - storage.start_epoch(epoch, HashSet::from_iter([existing_table_id.into()])); + let table_id_set = HashSet::from_iter([existing_table_id.into()]); + storage.start_epoch(epoch, table_id_set.clone()); for i in 0..kv_count { if i == 0 { local.init_for_test(epoch).await.unwrap(); } let next_epoch = epoch + millisec_interval_epoch; - storage.start_epoch(next_epoch, HashSet::from_iter([existing_table_id.into()])); + storage.start_epoch(next_epoch, table_id_set.clone()); epoch_set.insert(epoch); let ramdom_key = [key_prefix.as_ref(), &rand::thread_rng().gen::<[u8; 32]>()].concat(); @@ -1145,8 +1162,14 @@ pub(crate) mod tests { .unwrap(); local.flush().await.unwrap(); local.seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); - let res = storage.seal_and_sync_epoch(epoch).await.unwrap(); - hummock_meta_client.commit_epoch(epoch, res).await.unwrap(); + let res = storage + .seal_and_sync_epoch(epoch, table_id_set.clone()) + .await + .unwrap(); + hummock_meta_client + .commit_epoch(epoch, res, false) + .await + .unwrap(); epoch += millisec_interval_epoch; } @@ -1154,12 +1177,12 @@ pub(crate) mod tests { level: 0, ..Default::default() }; + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), existing_table_id) + .await; // 2. get compact task let mut compact_task = hummock_manager_ref - .manual_get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - manual_compcation_option, - ) + .manual_get_compact_task(compaction_group_id, manual_compcation_option) .await .unwrap() .unwrap(); @@ -1204,7 +1227,7 @@ pub(crate) mod tests { // 4. get the latest version and check let version: HummockVersion = hummock_manager_ref.get_current_version().await; let tables_from_version: Vec<_> = version - .get_compaction_group_levels(StaticCompactionGroupId::StateDefault.into()) + .get_compaction_group_levels(compaction_group_id) .levels .iter() .flat_map(|level| level.table_infos.iter()) @@ -1225,10 +1248,7 @@ pub(crate) mod tests { // 5. 
get compact task and there should be none let compact_task = hummock_manager_ref - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_compaction_selector(), - ) + .get_compact_task(compaction_group_id, &mut default_compaction_selector()) .await .unwrap(); assert!(compact_task.is_none()); @@ -1323,18 +1343,24 @@ pub(crate) mod tests { // .unwrap(); local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); - flush_and_commit(&hummock_meta_client, &storage, epoch).await; + flush_and_commit( + &hummock_meta_client, + &storage, + epoch, + existing_table_id.into(), + ) + .await; let manual_compcation_option = ManualCompactionOption { level: 0, ..Default::default() }; // 2. get compact task + let compaction_group_id = + get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), existing_table_id) + .await; let mut compact_task = hummock_manager_ref - .manual_get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - manual_compcation_option, - ) + .manual_get_compact_task(compaction_group_id, manual_compcation_option) .await .unwrap() .unwrap(); @@ -1375,7 +1401,7 @@ pub(crate) mod tests { // 4. get the latest version and check let version = hummock_manager_ref.get_current_version().await; let output_level_info = version - .get_compaction_group_levels(StaticCompactionGroupId::StateDefault.into()) + .get_compaction_group_levels(compaction_group_id) .levels .last() .unwrap(); @@ -2103,8 +2129,14 @@ pub(crate) mod tests { .0 .seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); - let res = storage.seal_and_sync_epoch(*epoch).await.unwrap(); - hummock_meta_client.commit_epoch(*epoch, res).await.unwrap(); + let res = storage + .seal_and_sync_epoch(*epoch, table_id_set.clone()) + .await + .unwrap(); + hummock_meta_client + .commit_epoch(*epoch, res, false) + .await + .unwrap(); *epoch += millisec_interval_epoch; } } diff --git a/src/storage/hummock_test/src/failpoint_tests.rs b/src/storage/hummock_test/src/failpoint_tests.rs index 240c07cd82c4b..27072abba08f2 100644 --- a/src/storage/hummock_test/src/failpoint_tests.rs +++ b/src/storage/hummock_test/src/failpoint_tests.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
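Across these test files the sync path now carries explicit table ids and a log-store flag: `seal_and_sync_epoch` takes the `HashSet<TableId>` written in that epoch, and `commit_epoch` takes a third `is_log_store` argument. A minimal sketch of the resulting pattern, assuming the `HummockStorage` type and `HummockMetaClient` trait from the surrounding crates (the helper name and exact import paths here are illustrative, not part of the patch):

```rust
// Illustrative sketch of the pattern used throughout these tests after the
// signature change; not part of the patch itself.
use std::collections::HashSet;
use std::sync::Arc;

use risingwave_common::catalog::TableId;
use risingwave_rpc_client::HummockMetaClient;
use risingwave_storage::hummock::HummockStorage;

async fn sync_and_commit_one_table(
    storage: &HummockStorage,
    meta_client: &Arc<impl HummockMetaClient>,
    epoch: u64,
    table_id: TableId,
) {
    // `seal_and_sync_epoch` now takes the set of table ids involved in this epoch.
    let sync_result = storage
        .seal_and_sync_epoch(epoch, HashSet::from([table_id]))
        .await
        .unwrap();
    // `commit_epoch` gained an `is_log_store` flag; plain state tables pass `false`,
    // while the log-store test further below passes `true`.
    meta_client
        .commit_epoch(epoch, sync_result, false)
        .await
        .unwrap();
}
```

The compaction tests pair this with `get_compaction_group_id_by_table_id(hummock_manager_ref.clone(), table_id).await`, so compact tasks are requested for the table's actual compaction group instead of `StaticCompactionGroupId::StateDefault`.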
+use std::collections::HashSet; use std::ops::Bound; use std::sync::Arc; @@ -140,8 +141,12 @@ async fn test_failpoints_state_store_read_upload() { ); // sync epoch1 test the read_error - let res = hummock_storage.seal_and_sync_epoch(1).await.unwrap(); - meta_client.commit_epoch(1, res).await.unwrap(); + let table_id_set = HashSet::from_iter([local.table_id()]); + let res = hummock_storage + .seal_and_sync_epoch(1, table_id_set.clone()) + .await + .unwrap(); + meta_client.commit_epoch(1, res, false).await.unwrap(); hummock_storage .try_wait_epoch(HummockReadEpoch::Committed(1)) .await @@ -208,12 +213,17 @@ async fn test_failpoints_state_store_read_upload() { // test the upload_error fail::cfg(mem_upload_err, "return").unwrap(); - let result = hummock_storage.seal_and_sync_epoch(3).await; + let result = hummock_storage + .seal_and_sync_epoch(3, table_id_set.clone()) + .await; assert!(result.is_err()); fail::remove(mem_upload_err); - let res = hummock_storage.seal_and_sync_epoch(3).await.unwrap(); - meta_client.commit_epoch(3, res).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(3, table_id_set) + .await + .unwrap(); + meta_client.commit_epoch(3, res, false).await.unwrap(); hummock_storage .try_wait_epoch(HummockReadEpoch::Committed(3)) .await diff --git a/src/storage/hummock_test/src/hummock_storage_tests.rs b/src/storage/hummock_test/src/hummock_storage_tests.rs index fc0fd6ae97b4f..18bad67a62570 100644 --- a/src/storage/hummock_test/src/hummock_storage_tests.rs +++ b/src/storage/hummock_test/src/hummock_storage_tests.rs @@ -461,6 +461,7 @@ async fn test_storage_basic() { #[tokio::test] async fn test_state_store_sync() { const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; + let table_id_set = HashSet::from_iter([TEST_TABLE_ID]); let test_env = prepare_hummock_test_env().await; test_env.register_table_id(TEST_TABLE_ID).await; let mut hummock_storage = test_env @@ -557,10 +558,14 @@ async fn test_state_store_sync() { .start_epoch(epoch3, HashSet::from_iter([TEST_TABLE_ID])); hummock_storage.seal_current_epoch(epoch3, SealCurrentEpochOptions::for_test()); - let res = test_env.storage.seal_and_sync_epoch(epoch1).await.unwrap(); + let res = test_env + .storage + .seal_and_sync_epoch(epoch1, table_id_set.clone()) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch1, res) + .commit_epoch(epoch1, res, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch1).await; @@ -599,10 +604,14 @@ async fn test_state_store_sync() { } } - let res = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let res = test_env + .storage + .seal_and_sync_epoch(epoch2, table_id_set.clone()) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch2, res) + .commit_epoch(epoch2, res, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch2).await; @@ -819,6 +828,7 @@ async fn test_state_store_sync() { #[tokio::test] async fn test_delete_get() { const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; + let table_id_set = HashSet::from_iter([TEST_TABLE_ID]); let test_env = prepare_hummock_test_env().await; test_env.register_table_id(TEST_TABLE_ID).await; let mut hummock_storage = test_env @@ -864,10 +874,14 @@ async fn test_delete_get() { .storage .start_epoch(epoch2, HashSet::from_iter([TEST_TABLE_ID])); hummock_storage.seal_current_epoch(epoch2, SealCurrentEpochOptions::for_test()); - let res = test_env.storage.seal_and_sync_epoch(epoch1).await.unwrap(); + let res = test_env + .storage + .seal_and_sync_epoch(epoch1, 
table_id_set.clone()) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch1, res) + .commit_epoch(epoch1, res, false) .await .unwrap(); @@ -886,10 +900,14 @@ async fn test_delete_get() { .await .unwrap(); hummock_storage.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); - let res = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let res = test_env + .storage + .seal_and_sync_epoch(epoch2, table_id_set) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch2, res) + .commit_epoch(epoch2, res, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch2).await; @@ -912,6 +930,7 @@ async fn test_delete_get() { #[tokio::test] async fn test_multiple_epoch_sync() { const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; + let table_id_set = HashSet::from_iter([TEST_TABLE_ID]); let test_env = prepare_hummock_test_env().await; test_env.register_table_id(TEST_TABLE_ID).await; let mut hummock_storage = test_env @@ -1054,19 +1073,27 @@ async fn test_multiple_epoch_sync() { .storage .start_epoch(epoch4, HashSet::from_iter([TEST_TABLE_ID])); hummock_storage.seal_current_epoch(epoch4, SealCurrentEpochOptions::for_test()); - let sync_result2 = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); - let sync_result3 = test_env.storage.seal_and_sync_epoch(epoch3).await.unwrap(); + let sync_result2 = test_env + .storage + .seal_and_sync_epoch(epoch2, table_id_set.clone()) + .await + .unwrap(); + let sync_result3 = test_env + .storage + .seal_and_sync_epoch(epoch3, table_id_set) + .await + .unwrap(); test_get().await; test_env .meta_client - .commit_epoch(epoch2, sync_result2) + .commit_epoch(epoch2, sync_result2, false) .await .unwrap(); test_env .meta_client - .commit_epoch(epoch3, sync_result3) + .commit_epoch(epoch3, sync_result3, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch3).await; @@ -1076,6 +1103,7 @@ async fn test_multiple_epoch_sync() { #[tokio::test] async fn test_iter_with_min_epoch() { const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; + let table_id_set = HashSet::from_iter([TEST_TABLE_ID]); let test_env = prepare_hummock_test_env().await; test_env.register_table_id(TEST_TABLE_ID).await; let mut hummock_storage = test_env @@ -1222,16 +1250,24 @@ async fn test_iter_with_min_epoch() { { // test after sync - let sync_result1 = test_env.storage.seal_and_sync_epoch(epoch1).await.unwrap(); - let sync_result2 = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let sync_result1 = test_env + .storage + .seal_and_sync_epoch(epoch1, table_id_set.clone()) + .await + .unwrap(); + let sync_result2 = test_env + .storage + .seal_and_sync_epoch(epoch2, table_id_set) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch1, sync_result1) + .commit_epoch(epoch1, sync_result1, false) .await .unwrap(); test_env .meta_client - .commit_epoch(epoch2, sync_result2) + .commit_epoch(epoch2, sync_result2, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch2).await; @@ -1320,6 +1356,7 @@ async fn test_iter_with_min_epoch() { #[tokio::test] async fn test_hummock_version_reader() { const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; + let table_id_set = HashSet::from_iter([TEST_TABLE_ID]); let test_env = prepare_hummock_test_env().await; test_env.register_table_id(TEST_TABLE_ID).await; let mut hummock_storage = test_env @@ -1514,26 +1551,38 @@ async fn test_hummock_version_reader() { } { - let sync_result1 = test_env.storage.seal_and_sync_epoch(epoch1).await.unwrap(); 
+ let sync_result1 = test_env + .storage + .seal_and_sync_epoch(epoch1, table_id_set.clone()) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch1, sync_result1) + .commit_epoch(epoch1, sync_result1, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch1).await; - let sync_result2 = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let sync_result2 = test_env + .storage + .seal_and_sync_epoch(epoch2, table_id_set.clone()) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch2, sync_result2) + .commit_epoch(epoch2, sync_result2, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch2).await; - let sync_result3 = test_env.storage.seal_and_sync_epoch(epoch3).await.unwrap(); + let sync_result3 = test_env + .storage + .seal_and_sync_epoch(epoch3, table_id_set) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch3, sync_result3) + .commit_epoch(epoch3, sync_result3, false) .await .unwrap(); test_env.storage.try_wait_epoch_for_test(epoch3).await; @@ -1764,6 +1813,7 @@ async fn test_hummock_version_reader() { #[tokio::test] async fn test_get_with_min_epoch() { const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; + let table_id_set = HashSet::from_iter([TEST_TABLE_ID]); let test_env = prepare_hummock_test_env().await; test_env.register_table_id(TEST_TABLE_ID).await; let mut hummock_storage = test_env @@ -1908,16 +1958,24 @@ async fn test_get_with_min_epoch() { // test after sync - let sync_result1 = test_env.storage.seal_and_sync_epoch(epoch1).await.unwrap(); - let sync_result2 = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let sync_result1 = test_env + .storage + .seal_and_sync_epoch(epoch1, table_id_set.clone()) + .await + .unwrap(); + let sync_result2 = test_env + .storage + .seal_and_sync_epoch(epoch2, table_id_set) + .await + .unwrap(); test_env .meta_client - .commit_epoch(epoch1, sync_result1) + .commit_epoch(epoch1, sync_result1, false) .await .unwrap(); test_env .meta_client - .commit_epoch(epoch2, sync_result2) + .commit_epoch(epoch2, sync_result2, false) .await .unwrap(); diff --git a/src/storage/hummock_test/src/snapshot_tests.rs b/src/storage/hummock_test/src/snapshot_tests.rs index bde3c046ed6ca..b15e8a3fa372c 100644 --- a/src/storage/hummock_test/src/snapshot_tests.rs +++ b/src/storage/hummock_test/src/snapshot_tests.rs @@ -139,10 +139,13 @@ async fn test_snapshot_inner( hummock_storage.start_epoch(epoch2, HashSet::from_iter([Default::default()])); local.seal_current_epoch(epoch2, SealCurrentEpochOptions::for_test()); if enable_sync { - let res = hummock_storage.seal_and_sync_epoch(epoch1).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(epoch1, HashSet::from_iter([local.table_id()])) + .await + .unwrap(); if enable_commit { mock_hummock_meta_client - .commit_epoch(epoch1, res) + .commit_epoch(epoch1, res, false) .await .unwrap(); hummock_storage @@ -180,10 +183,13 @@ async fn test_snapshot_inner( hummock_storage.start_epoch(epoch3, HashSet::from_iter([Default::default()])); local.seal_current_epoch(epoch3, SealCurrentEpochOptions::for_test()); if enable_sync { - let res = hummock_storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(epoch2, HashSet::from_iter([local.table_id()])) + .await + .unwrap(); if enable_commit { mock_hummock_meta_client - .commit_epoch(epoch2, res) + .commit_epoch(epoch2, res, false) .await .unwrap(); hummock_storage @@ -220,10 +226,13 @@ async fn test_snapshot_inner( .unwrap(); 
local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); if enable_sync { - let res = hummock_storage.seal_and_sync_epoch(epoch3).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(epoch3, HashSet::from_iter([local.table_id()])) + .await + .unwrap(); if enable_commit { mock_hummock_meta_client - .commit_epoch(epoch3, res) + .commit_epoch(epoch3, res, false) .await .unwrap(); hummock_storage @@ -279,10 +288,13 @@ async fn test_snapshot_range_scan_inner( .unwrap(); local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); if enable_sync { - let res = hummock_storage.seal_and_sync_epoch(epoch).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(epoch, HashSet::from_iter([local.table_id()])) + .await + .unwrap(); if enable_commit { mock_hummock_meta_client - .commit_epoch(epoch, res) + .commit_epoch(epoch, res, false) .await .unwrap(); hummock_storage diff --git a/src/storage/hummock_test/src/state_store_tests.rs b/src/storage/hummock_test/src/state_store_tests.rs index 67da2150735af..ab1e84aca2a66 100644 --- a/src/storage/hummock_test/src/state_store_tests.rs +++ b/src/storage/hummock_test/src/state_store_tests.rs @@ -375,8 +375,15 @@ async fn test_basic_v2() { .unwrap(); let len = count_stream(iter).await; assert_eq!(len, 4); - let res = hummock_storage.seal_and_sync_epoch(epoch1).await.unwrap(); - meta_client.commit_epoch(epoch1, res).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(epoch1, HashSet::from_iter([local.table_id()])) + .await + .unwrap(); + let is_log_store = false; + meta_client + .commit_epoch(epoch1, res, is_log_store) + .await + .unwrap(); hummock_storage .try_wait_epoch(HummockReadEpoch::Committed(epoch1)) .await @@ -516,11 +523,15 @@ async fn test_state_store_sync_v2() { local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); // trigger a sync + let table_id_set = HashSet::from_iter([local.table_id()]); + hummock_storage + .seal_and_sync_epoch(epoch.prev_epoch(), table_id_set.clone()) + .await + .unwrap(); hummock_storage - .seal_and_sync_epoch(epoch.prev_epoch()) + .seal_and_sync_epoch(epoch, table_id_set) .await .unwrap(); - hummock_storage.seal_and_sync_epoch(epoch).await.unwrap(); // TODO: Uncomment the following lines after flushed sstable can be accessed. 
// FYI: https://github.com/risingwavelabs/risingwave/pull/1928#discussion_r852698719 @@ -1056,8 +1067,16 @@ async fn test_delete_get_v2() { hummock_storage.start_epoch(epoch2, HashSet::from_iter([Default::default()])); local.seal_current_epoch(epoch2, SealCurrentEpochOptions::for_test()); - let res = hummock_storage.seal_and_sync_epoch(epoch1).await.unwrap(); - meta_client.commit_epoch(epoch1, res).await.unwrap(); + let table_id_set = HashSet::from_iter([local.table_id()]); + let res = hummock_storage + .seal_and_sync_epoch(epoch1, table_id_set.clone()) + .await + .unwrap(); + let is_log_store = false; + meta_client + .commit_epoch(epoch1, res, is_log_store) + .await + .unwrap(); let batch2 = vec![( gen_key_from_str(VirtualNode::ZERO, "bb"), @@ -1074,8 +1093,14 @@ async fn test_delete_get_v2() { .await .unwrap(); local.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); - let res = hummock_storage.seal_and_sync_epoch(epoch2).await.unwrap(); - meta_client.commit_epoch(epoch2, res).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(epoch2, table_id_set) + .await + .unwrap(); + meta_client + .commit_epoch(epoch2, res, is_log_store) + .await + .unwrap(); hummock_storage .try_wait_epoch(HummockReadEpoch::Committed(epoch2)) .await @@ -1114,6 +1139,7 @@ async fn test_multiple_epoch_sync_v2() { let mut local = hummock_storage .new_local(NewLocalOptions::for_test(TableId::default())) .await; + let table_id_set = HashSet::from_iter([local.table_id()]); hummock_storage.start_epoch(epoch1, HashSet::from_iter([Default::default()])); local.init_for_test(epoch1).await.unwrap(); local @@ -1217,17 +1243,23 @@ async fn test_multiple_epoch_sync_v2() { } }; test_get().await; - let sync_result2 = hummock_storage.seal_and_sync_epoch(epoch2).await.unwrap(); - let sync_result3 = hummock_storage.seal_and_sync_epoch(epoch3).await.unwrap(); + let sync_result2 = hummock_storage + .seal_and_sync_epoch(epoch2, table_id_set.clone()) + .await + .unwrap(); + let sync_result3 = hummock_storage + .seal_and_sync_epoch(epoch3, table_id_set) + .await + .unwrap(); test_get().await; meta_client - .commit_epoch(epoch2, sync_result2) + .commit_epoch(epoch2, sync_result2, false) .await .unwrap(); meta_client - .commit_epoch(epoch3, sync_result3) + .commit_epoch(epoch3, sync_result3, false) .await .unwrap(); hummock_storage @@ -1251,6 +1283,7 @@ async fn test_gc_watermark_and_clear_shared_buffer() { let mut local_hummock_storage = hummock_storage .new_local(NewLocalOptions::for_test(Default::default())) .await; + let table_id_set = HashSet::from_iter([local_hummock_storage.table_id()]); let initial_epoch = hummock_storage.get_pinned_version().max_committed_epoch(); let epoch1 = initial_epoch.next_epoch(); @@ -1305,7 +1338,10 @@ async fn test_gc_watermark_and_clear_shared_buffer() { .unwrap() }; local_hummock_storage.seal_current_epoch(u64::MAX, SealCurrentEpochOptions::for_test()); - let sync_result1 = hummock_storage.seal_and_sync_epoch(epoch1).await.unwrap(); + let sync_result1 = hummock_storage + .seal_and_sync_epoch(epoch1, table_id_set.clone()) + .await + .unwrap(); let min_object_id_epoch1 = min_object_id(&sync_result1); assert_eq!( hummock_storage @@ -1313,7 +1349,10 @@ async fn test_gc_watermark_and_clear_shared_buffer() { .global_watermark_object_id(), min_object_id_epoch1, ); - let sync_result2 = hummock_storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let sync_result2 = hummock_storage + .seal_and_sync_epoch(epoch2, table_id_set) + .await + .unwrap(); let min_object_id_epoch2 = 
min_object_id(&sync_result2); assert_eq!( hummock_storage @@ -1322,7 +1361,7 @@ async fn test_gc_watermark_and_clear_shared_buffer() { min_object_id_epoch1, ); meta_client - .commit_epoch(epoch1, sync_result1) + .commit_epoch(epoch1, sync_result1, false) .await .unwrap(); hummock_storage @@ -1555,12 +1594,13 @@ async fn test_iter_log() { hummock_storage.start_epoch(MAX_EPOCH, HashSet::from_iter([table_id])); let in_memory_state_store = MemoryStateStore::new(); + let is_log_store = true; let mut in_memory_local = in_memory_state_store .new_local(NewLocalOptions { table_id, op_consistency_level: OpConsistencyLevel::ConsistentOldValue { check_old_value: CHECK_BYTES_EQUAL.clone(), - is_log_store: true, + is_log_store, }, table_option: Default::default(), is_replicated: false, @@ -1575,7 +1615,7 @@ async fn test_iter_log() { table_id, op_consistency_level: OpConsistencyLevel::ConsistentOldValue { check_old_value: CHECK_BYTES_EQUAL.clone(), - is_log_store: true, + is_log_store, }, table_option: Default::default(), is_replicated: false, @@ -1585,13 +1625,17 @@ async fn test_iter_log() { // flush for about 10 times per epoch apply_test_log_data(test_log_data.clone(), &mut hummock_local, 0.001).await; + let table_id_set = HashSet::from_iter([table_id]); for (epoch, _) in &test_log_data { - let res = hummock_storage.seal_and_sync_epoch(*epoch).await.unwrap(); + let res = hummock_storage + .seal_and_sync_epoch(*epoch, table_id_set.clone()) + .await + .unwrap(); if *epoch != test_log_data[0].0 { assert!(!res.old_value_ssts.is_empty()); } assert!(!res.uncommitted_ssts.is_empty()); - meta_client.commit_epoch(*epoch, res).await.unwrap(); + meta_client.commit_epoch(*epoch, res, true).await.unwrap(); } hummock_storage diff --git a/src/storage/hummock_test/src/sync_point_tests.rs b/src/storage/hummock_test/src/sync_point_tests.rs index 008c667ccedf5..84cdf5513cdeb 100644 --- a/src/storage/hummock_test/src/sync_point_tests.rs +++ b/src/storage/hummock_test/src/sync_point_tests.rs @@ -302,7 +302,13 @@ async fn test_syncpoints_get_in_delete_range_boundary() { test_epoch(101), risingwave_storage::store::SealCurrentEpochOptions::for_test(), ); - flush_and_commit(&hummock_meta_client, &storage, test_epoch(100)).await; + flush_and_commit( + &hummock_meta_client, + &storage, + test_epoch(100), + local.table_id(), + ) + .await; compact_once( hummock_manager_ref.clone(), compact_ctx.clone(), @@ -337,7 +343,13 @@ async fn test_syncpoints_get_in_delete_range_boundary() { test_epoch(102), risingwave_storage::store::SealCurrentEpochOptions::for_test(), ); - flush_and_commit(&hummock_meta_client, &storage, test_epoch(101)).await; + flush_and_commit( + &hummock_meta_client, + &storage, + test_epoch(101), + local.table_id(), + ) + .await; compact_once( hummock_manager_ref.clone(), compact_ctx.clone(), @@ -372,7 +384,13 @@ async fn test_syncpoints_get_in_delete_range_boundary() { test_epoch(103), risingwave_storage::store::SealCurrentEpochOptions::for_test(), ); - flush_and_commit(&hummock_meta_client, &storage, test_epoch(102)).await; + flush_and_commit( + &hummock_meta_client, + &storage, + test_epoch(102), + local.table_id(), + ) + .await; // move this two file to the same level. 
compact_once( hummock_manager_ref.clone(), @@ -401,7 +419,13 @@ async fn test_syncpoints_get_in_delete_range_boundary() { u64::MAX, risingwave_storage::store::SealCurrentEpochOptions::for_test(), ); - flush_and_commit(&hummock_meta_client, &storage, test_epoch(103)).await; + flush_and_commit( + &hummock_meta_client, + &storage, + test_epoch(103), + local.table_id(), + ) + .await; // move this two file to the same level. compact_once( hummock_manager_ref.clone(), diff --git a/src/storage/hummock_test/src/test_utils.rs b/src/storage/hummock_test/src/test_utils.rs index bf5c4a8dd8d8c..da861ff92810c 100644 --- a/src/storage/hummock_test/src/test_utils.rs +++ b/src/storage/hummock_test/src/test_utils.rs @@ -243,8 +243,24 @@ impl HummockTestEnv { // Seal, sync and commit a epoch. // On completion of this function call, the provided epoch should be committed and visible. pub async fn commit_epoch(&self, epoch: u64) { - let res = self.storage.seal_and_sync_epoch(epoch).await.unwrap(); - self.meta_client.commit_epoch(epoch, res).await.unwrap(); + let table_ids = self + .manager + .get_current_version() + .await + .state_table_info + .info() + .keys() + .cloned() + .collect(); + let res = self + .storage + .seal_and_sync_epoch(epoch, table_ids) + .await + .unwrap(); + self.meta_client + .commit_epoch(epoch, res, false) + .await + .unwrap(); self.storage.try_wait_epoch_for_test(epoch).await; } diff --git a/src/storage/hummock_trace/src/opts.rs b/src/storage/hummock_trace/src/opts.rs index 5d480cca96b58..562e989051395 100644 --- a/src/storage/hummock_trace/src/opts.rs +++ b/src/storage/hummock_trace/src/opts.rs @@ -102,21 +102,19 @@ impl From for TableId { #[derive(Encode, Decode, PartialEq, Eq, Debug, Clone)] pub struct TracedReadOptions { pub prefix_hint: Option, - pub ignore_range_tombstone: bool, pub prefetch_options: TracedPrefetchOptions, pub cache_policy: TracedCachePolicy, pub retention_seconds: Option, pub table_id: TracedTableId, pub read_version_from_backup: bool, - pub read_version_from_time_travel: bool, + pub read_committed: bool, } impl TracedReadOptions { pub fn for_test(table_id: u32) -> Self { Self { prefix_hint: Some(TracedBytes::from(vec![0])), - ignore_range_tombstone: true, prefetch_options: TracedPrefetchOptions { prefetch: true, for_large_query: true, @@ -125,7 +123,7 @@ impl TracedReadOptions { retention_seconds: None, table_id: TracedTableId { table_id }, read_version_from_backup: false, - read_version_from_time_travel: false, + read_committed: false, } } } diff --git a/src/storage/src/hummock/compactor/compaction_utils.rs b/src/storage/src/hummock/compactor/compaction_utils.rs index f61991c0fa274..f91cf5eedb563 100644 --- a/src/storage/src/hummock/compactor/compaction_utils.rs +++ b/src/storage/src/hummock/compactor/compaction_utils.rs @@ -39,14 +39,13 @@ use crate::hummock::compactor::{ TtlCompactionFilter, }; use crate::hummock::iterator::{ - Forward, ForwardMergeRangeIterator, HummockIterator, MergeIterator, SkipWatermarkIterator, - UserIterator, + Forward, HummockIterator, MergeIterator, SkipWatermarkIterator, UserIterator, }; use crate::hummock::multi_builder::TableBuilderFactory; use crate::hummock::sstable::DEFAULT_ENTRY_SIZE; use crate::hummock::{ CachePolicy, FilterBuilder, GetObjectId, HummockResult, MemoryLimiter, SstableBuilder, - SstableBuilderOptions, SstableDeleteRangeIterator, SstableWriterFactory, SstableWriterOptions, + SstableBuilderOptions, SstableWriterFactory, SstableWriterOptions, }; use crate::monitor::StoreLocalStatistic; @@ -349,7 +348,6 @@ pub 
async fn check_compaction_result( } let mut table_iters = Vec::new(); - let mut del_iter = ForwardMergeRangeIterator::default(); for level in &compact_task.input_ssts { if level.table_infos.is_empty() { continue; @@ -358,7 +356,6 @@ pub async fn check_compaction_result( // Do not need to filter the table because manager has done it. if level.level_type == PbLevelType::Nonoverlapping { debug_assert!(can_concat(&level.table_infos)); - del_iter.add_concat_iter(level.table_infos.clone(), context.sstable_store.clone()); table_iters.push(ConcatSstableIterator::new( compact_task.existing_table_ids.clone(), @@ -369,13 +366,7 @@ pub async fn check_compaction_result( context.storage_opts.compactor_iter_max_io_retry_times, )); } else { - let mut stats = StoreLocalStatistic::default(); for table_info in &level.table_infos { - let table = context - .sstable_store - .sstable(table_info, &mut stats) - .await?; - del_iter.add_sst_iter(SstableDeleteRangeIterator::new(table)); table_iters.push(ConcatSstableIterator::new( compact_task.existing_table_ids.clone(), vec![table_info.clone()], diff --git a/src/storage/src/hummock/event_handler/hummock_event_handler.rs b/src/storage/src/hummock/event_handler/hummock_event_handler.rs index f2aa2ea7fd88d..1c8abc78ddffc 100644 --- a/src/storage/src/hummock/event_handler/hummock_event_handler.rs +++ b/src/storage/src/hummock/event_handler/hummock_event_handler.rs @@ -50,6 +50,7 @@ use crate::hummock::event_handler::{ ReadOnlyRwLockRef, }; use crate::hummock::local_version::pinned_version::PinnedVersion; +use crate::hummock::local_version::recent_versions::RecentVersions; use crate::hummock::store::version::{ HummockReadVersion, StagingData, StagingSstableInfo, VersionUpdate, }; @@ -197,7 +198,7 @@ pub struct HummockEventHandler { local_read_version_mapping: HashMap, version_update_notifier_tx: Arc>, - pinned_version: Arc>, + recent_versions: Arc>, write_conflict_detector: Option>, uploader: HummockUploader, @@ -355,7 +356,10 @@ impl HummockEventHandler { hummock_event_rx, version_update_rx, version_update_notifier_tx, - pinned_version: Arc::new(ArcSwap::from_pointee(pinned_version)), + recent_versions: Arc::new(ArcSwap::from_pointee(RecentVersions::new( + pinned_version, + storage_opts.max_cached_recent_versions_number, + ))), write_conflict_detector, read_version_mapping, local_read_version_mapping: Default::default(), @@ -371,8 +375,8 @@ impl HummockEventHandler { self.version_update_notifier_tx.clone() } - pub fn pinned_version(&self) -> Arc> { - self.pinned_version.clone() + pub fn recent_versions(&self) -> Arc> { + self.recent_versions.clone() } pub fn read_version_mapping(&self) -> ReadOnlyReadVersionMapping { @@ -529,17 +533,18 @@ impl HummockEventHandler { .await .expect("should not be empty"); let prev_version_id = latest_version_ref.id(); - let new_version = Self::resolve_version_update_info( + if let Some(new_version) = Self::resolve_version_update_info( latest_version_ref.clone(), version_update, None, - ); - info!( - ?prev_version_id, - new_version_id = ?new_version.id(), - "recv new version" - ); - latest_version = Some(new_version); + ) { + info!( + ?prev_version_id, + new_version_id = ?new_version.id(), + "recv new version" + ); + latest_version = Some(new_version); + } } self.apply_version_update( @@ -582,21 +587,21 @@ impl HummockEventHandler { .unwrap_or_else(|| self.uploader.hummock_version().clone()); let mut sst_delta_infos = vec![]; - let new_pinned_version = Self::resolve_version_update_info( + if let Some(new_pinned_version) = 
Self::resolve_version_update_info( pinned_version.clone(), version_payload, Some(&mut sst_delta_infos), - ); - - self.refiller - .start_cache_refill(sst_delta_infos, pinned_version, new_pinned_version); + ) { + self.refiller + .start_cache_refill(sst_delta_infos, pinned_version, new_pinned_version); + } } fn resolve_version_update_info( pinned_version: PinnedVersion, version_payload: HummockVersionUpdate, mut sst_delta_infos: Option<&mut Vec>, - ) -> PinnedVersion { + ) -> Option { let newly_pinned_version = match version_payload { HummockVersionUpdate::VersionDeltas(version_deltas) => { let mut version_to_apply = pinned_version.version().clone(); @@ -629,8 +634,9 @@ impl HummockEventHandler { .metrics .event_handler_on_apply_version_update .start_timer(); - self.pinned_version - .store(Arc::new(new_pinned_version.clone())); + self.recent_versions.rcu(|prev_recent_versions| { + prev_recent_versions.with_new_version(new_pinned_version.clone()) + }); { self.for_each_read_version( @@ -663,7 +669,10 @@ impl HummockEventHandler { // TODO: should we change the logic when supporting partial ckpt? if let Some(sstable_object_id_manager) = &self.sstable_object_id_manager { sstable_object_id_manager.remove_watermark_object_id(TrackerId::Epoch( - self.pinned_version.load().visible_table_committed_epoch(), + self.recent_versions + .load() + .latest_version() + .visible_table_committed_epoch(), )); } @@ -789,13 +798,13 @@ impl HummockEventHandler { is_replicated, vnodes, } => { - let pinned_version = self.pinned_version.load(); + let pinned_version = self.recent_versions.load().latest_version().clone(); let instance_id = self.generate_instance_id(); let basic_read_version = Arc::new(RwLock::new( HummockReadVersion::new_with_replication_option( table_id, instance_id, - (**pinned_version).clone(), + pinned_version, is_replicated, vnodes, ), @@ -992,7 +1001,7 @@ mod tests { ); let event_tx = event_handler.event_sender(); - let latest_version = event_handler.pinned_version.clone(); + let latest_version = event_handler.recent_versions.clone(); let latest_version_update_tx = event_handler.version_update_notifier_tx.clone(); let send_clear = |version_id| { @@ -1018,12 +1027,15 @@ mod tests { let (old_version, new_version, refill_finish_tx) = refill_task_rx.recv().await.unwrap(); assert_eq!(old_version.version(), initial_version.version()); assert_eq!(new_version.version(), &version1); - assert_eq!(latest_version.load().version(), initial_version.version()); + assert_eq!( + latest_version.load().latest_version().version(), + initial_version.version() + ); let mut changed = latest_version_update_tx.subscribe(); refill_finish_tx.send(()).unwrap(); changed.changed().await.unwrap(); - assert_eq!(latest_version.load().version(), &version1); + assert_eq!(latest_version.load().latest_version().version(), &version1); } // test recovery with pending refill task @@ -1050,11 +1062,11 @@ mod tests { refill_task_rx.recv().await.unwrap(); assert_eq!(old_version3.version(), &version2); assert_eq!(new_version3.version(), &version3); - assert_eq!(latest_version.load().version(), &version1); + assert_eq!(latest_version.load().latest_version().version(), &version1); let rx = send_clear(version3.id); rx.await.unwrap(); - assert_eq!(latest_version.load().version(), &version3); + assert_eq!(latest_version.load().latest_version().version(), &version3); } async fn assert_pending(fut: &mut (impl Future + Unpin)) { @@ -1081,7 +1093,7 @@ mod tests { ))) .unwrap(); rx.await.unwrap(); - assert_eq!(latest_version.load().version(), 
&version5); + assert_eq!(latest_version.load().latest_version().version(), &version5); } } diff --git a/src/storage/src/hummock/event_handler/uploader/mod.rs b/src/storage/src/hummock/event_handler/uploader/mod.rs index 4494049d93b0b..90e6a9306930a 100644 --- a/src/storage/src/hummock/event_handler/uploader/mod.rs +++ b/src/storage/src/hummock/event_handler/uploader/mod.rs @@ -1643,7 +1643,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert_eq!(epoch1, uploader.max_committed_epoch()); } @@ -1672,7 +1673,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert!(uploader.data().syncing_data.is_empty()); assert_eq!(epoch1, uploader.max_committed_epoch()); @@ -1706,7 +1708,8 @@ pub(crate) mod tests { let new_pinned_version = uploader .context .pinned_version - .new_pin_version(test_hummock_version(epoch1)); + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); uploader.update_pinned_version(new_pinned_version); assert!(uploader.data().syncing_data.is_empty()); assert_eq!(epoch1, uploader.max_committed_epoch()); @@ -1730,11 +1733,21 @@ pub(crate) mod tests { let epoch4 = epoch3.next_epoch(); let epoch5 = epoch4.next_epoch(); let epoch6 = epoch5.next_epoch(); - let version1 = initial_pinned_version.new_pin_version(test_hummock_version(epoch1)); - let version2 = initial_pinned_version.new_pin_version(test_hummock_version(epoch2)); - let version3 = initial_pinned_version.new_pin_version(test_hummock_version(epoch3)); - let version4 = initial_pinned_version.new_pin_version(test_hummock_version(epoch4)); - let version5 = initial_pinned_version.new_pin_version(test_hummock_version(epoch5)); + let version1 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch1)) + .unwrap(); + let version2 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch2)) + .unwrap(); + let version3 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch3)) + .unwrap(); + let version4 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch4)) + .unwrap(); + let version5 = initial_pinned_version + .new_pin_version(test_hummock_version(epoch5)) + .unwrap(); uploader.start_epochs_for_test([epoch6]); uploader.init_instance(TEST_LOCAL_INSTANCE_ID, TEST_TABLE_ID, epoch6); diff --git a/src/storage/src/hummock/hummock_meta_client.rs b/src/storage/src/hummock/hummock_meta_client.rs index d123558acc50b..038856a3ba2f3 100644 --- a/src/storage/src/hummock/hummock_meta_client.rs +++ b/src/storage/src/hummock/hummock_meta_client.rs @@ -80,7 +80,12 @@ impl HummockMetaClient for MonitoredHummockMetaClient { res } - async fn commit_epoch(&self, _epoch: HummockEpoch, _sync_result: SyncResult) -> Result<()> { + async fn commit_epoch( + &self, + _epoch: HummockEpoch, + _sync_result: SyncResult, + _is_log_store: bool, + ) -> Result<()> { panic!("Only meta service can commit_epoch in production.") } diff --git a/src/storage/src/hummock/iterator/concat_delete_range_iterator.rs b/src/storage/src/hummock/iterator/concat_delete_range_iterator.rs deleted file mode 100644 index a7c5215439bfb..0000000000000 --- a/src/storage/src/hummock/iterator/concat_delete_range_iterator.rs +++ 
/dev/null @@ -1,175 +0,0 @@ -// Copyright 2024 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::future::Future; - -use risingwave_hummock_sdk::key::{FullKey, PointRange, UserKey}; -use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::HummockEpoch; - -use crate::hummock::iterator::DeleteRangeIterator; -use crate::hummock::sstable_store::SstableStoreRef; -use crate::hummock::{HummockResult, SstableDeleteRangeIterator}; -use crate::monitor::StoreLocalStatistic; - -pub struct ConcatDeleteRangeIterator { - sstables: Vec, - current: Option, - idx: usize, - sstable_store: SstableStoreRef, - stats: StoreLocalStatistic, -} - -impl ConcatDeleteRangeIterator { - pub fn new(sstables: Vec, sstable_store: SstableStoreRef) -> Self { - Self { - sstables, - sstable_store, - stats: StoreLocalStatistic::default(), - idx: 0, - current: None, - } - } - - async fn next_inner(&mut self) -> HummockResult<()> { - if let Some(iter) = self.current.as_mut() { - if iter.is_valid() { - if iter.is_last_range() - && self.idx + 1 < self.sstables.len() - && self.sstables[self.idx + 1].range_tombstone_count > 0 - && iter - .next_extended_user_key() - .left_user_key - .eq(&FullKey::decode(&self.sstables[self.idx].key_range.right).user_key) - { - // When the last range of the current sstable is equal to the first range of the - // next sstable, the `next` method would return two same `PointRange`. So we - // must skip one. - let exclusive_range_start = iter.next_extended_user_key().is_exclude_left_key; - let last_key_in_sst_start = iter - .next_extended_user_key() - .left_user_key - .eq(&FullKey::decode(&self.sstables[self.idx + 1].key_range.left).user_key); - iter.next().await?; - if !iter.is_valid() && last_key_in_sst_start { - self.seek_idx(self.idx + 1, None).await?; - let next_range = self.next_extended_user_key(); - debug_assert!(self.is_valid()); - if next_range.is_exclude_left_key == exclusive_range_start - && next_range - .left_user_key - .eq(&FullKey::decode(&self.sstables[self.idx].key_range.left) - .user_key) - { - self.current.as_mut().unwrap().next().await?; - } - return Ok(()); - } - } else { - iter.next().await?; - } - let mut idx = self.idx; - while idx + 1 < self.sstables.len() && !self.is_valid() { - self.seek_idx(idx + 1, None).await?; - idx += 1; - } - } - } - Ok(()) - } - - /// Seeks to a table, and then seeks to the key if `seek_key` is given. 
- async fn seek_idx( - &mut self, - idx: usize, - seek_key: Option>, - ) -> HummockResult<()> { - self.current.take(); - if idx < self.sstables.len() { - if self.sstables[idx].range_tombstone_count == 0 { - return Ok(()); - } - let table = self - .sstable_store - .sstable(&self.sstables[idx], &mut self.stats) - .await?; - let mut sstable_iter = SstableDeleteRangeIterator::new(table); - - if let Some(key) = seek_key { - sstable_iter.seek(key).await?; - } else { - sstable_iter.rewind().await?; - } - self.current = Some(sstable_iter); - self.idx = idx; - } - Ok(()) - } -} - -impl DeleteRangeIterator for ConcatDeleteRangeIterator { - type NextFuture<'a> = impl Future> + 'a; - type RewindFuture<'a> = impl Future> + 'a; - type SeekFuture<'a> = impl Future> + 'a; - - fn next_extended_user_key(&self) -> PointRange<&[u8]> { - self.current.as_ref().unwrap().next_extended_user_key() - } - - fn current_epoch(&self) -> HummockEpoch { - self.current.as_ref().unwrap().current_epoch() - } - - fn next(&mut self) -> Self::NextFuture<'_> { - self.next_inner() - } - - fn rewind(&mut self) -> Self::RewindFuture<'_> { - async move { - let mut idx = 0; - self.seek_idx(idx, None).await?; - while idx + 1 < self.sstables.len() && !self.is_valid() { - self.seek_idx(idx + 1, None).await?; - idx += 1; - } - Ok(()) - } - } - - fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> Self::SeekFuture<'_> { - async move { - let mut idx = self - .sstables - .partition_point(|sst| { - FullKey::decode(&sst.key_range.left) - .user_key - .le(&target_user_key) - }) - .saturating_sub(1); // considering the boundary of 0 - self.seek_idx(idx, Some(target_user_key)).await?; - while idx + 1 < self.sstables.len() && !self.is_valid() { - self.seek_idx(idx + 1, None).await?; - idx += 1; - } - Ok(()) - } - } - - fn is_valid(&self) -> bool { - self.current - .as_ref() - .map(|iter| iter.is_valid()) - .unwrap_or(false) - } -} diff --git a/src/storage/src/hummock/iterator/delete_range_iterator.rs b/src/storage/src/hummock/iterator/delete_range_iterator.rs deleted file mode 100644 index bcc2f3e3ea26f..0000000000000 --- a/src/storage/src/hummock/iterator/delete_range_iterator.rs +++ /dev/null @@ -1,386 +0,0 @@ -// Copyright 2024 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::collections::{BTreeSet, BinaryHeap}; -use std::future::Future; - -use risingwave_common::util::epoch::is_max_epoch; -use risingwave_hummock_sdk::key::{PointRange, UserKey}; -use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::HummockEpoch; - -use crate::hummock::iterator::concat_delete_range_iterator::ConcatDeleteRangeIterator; -use crate::hummock::shared_buffer::shared_buffer_batch::SharedBufferDeleteRangeIterator; -use crate::hummock::sstable_store::SstableStoreRef; -use crate::hummock::{HummockResult, SstableDeleteRangeIterator}; - -/// `DeleteRangeIterator` defines the interface of all delete-range iterators, which is used to -/// filter keys deleted by some range tombstone -/// -/// After creating the iterator instance, -/// - if you want to iterate from the beginning, you need to then call its `rewind` method. -/// - if you want to iterate from some specific position, you need to then call its `seek` method. -pub trait DeleteRangeIterator { - type NextFuture<'a>: Future> + Send + 'a - where - Self: 'a; - type RewindFuture<'a>: Future> + Send + 'a - where - Self: 'a; - type SeekFuture<'a>: Future> + Send + 'a - where - Self: 'a; - /// Retrieves the next extended user key that changes current epoch. - /// - /// Note: - /// - Before calling this function, makes sure the iterator `is_valid`. - /// - This function should be straightforward and return immediately. - /// - /// # Panics - /// This function will panic if the iterator is invalid. - fn next_extended_user_key(&self) -> PointRange<&[u8]>; - - /// Retrieves the epoch of the current range delete. - /// It returns the epoch between the previous `next_user_key` (inclusive) and the current - /// `next_user_key` (not inclusive). When there is no range deletes, it will return - /// `HummockEpoch::MAX`. - /// - /// Note: - /// - Before calling this function, makes sure the iterator `is_valid`. - /// - This function should be straightforward and return immediately. - /// - /// # Panics - /// This function will panic if the iterator is invalid. - fn current_epoch(&self) -> HummockEpoch; - - /// Moves a valid iterator to the next tombstone. - /// - /// Note: - /// - Before calling this function, makes sure the iterator `is_valid`. - /// - After calling this function, you may first check whether the iterator `is_valid` again, - /// then get the new tombstone by calling `start_user_key`, `end_user_key` and - /// `current_epoch`. - /// - If the position after calling this is invalid, this function WON'T return an `Err`. You - /// should check `is_valid` before continuing the iteration. - /// - /// # Panics - /// This function will panic if the iterator is invalid. - fn next(&mut self) -> Self::NextFuture<'_>; - - /// Resets the position of the iterator. - /// - /// Note: - /// - Do not decide whether the position is valid or not by checking the returned error of this - /// function. This function WON'T return an `Err` if invalid. You should check `is_valid` - /// before starting iteration. - fn rewind(&mut self) -> Self::RewindFuture<'_>; - - /// Resets iterator and seeks to the first tombstone whose left-end >= provided key, we use this - /// method to skip tombstones which do not overlap with the provided key. - /// - /// Note: - /// - Do not decide whether the position is valid or not by checking the returned error of this - /// function. This function WON'T return an `Err` if invalid. You should check `is_valid` - /// before starting iteration. 
- fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> Self::SeekFuture<'_>; - - /// Indicates whether the iterator can be used. - /// - /// Note: - /// - ONLY call `next_user_key`, `current_epoch` and `next` if `is_valid` returns `true`. - /// - This function should be straightforward and return immediately. - fn is_valid(&self) -> bool; -} - -pub enum RangeIteratorTyped { - Sst(SstableDeleteRangeIterator), - Batch(SharedBufferDeleteRangeIterator), - Concat(ConcatDeleteRangeIterator), -} - -impl DeleteRangeIterator for RangeIteratorTyped { - type NextFuture<'a> = impl Future> + 'a; - type RewindFuture<'a> = impl Future> + 'a; - type SeekFuture<'a> = impl Future> + 'a; - - fn next_extended_user_key(&self) -> PointRange<&[u8]> { - match self { - RangeIteratorTyped::Sst(sst) => sst.next_extended_user_key(), - RangeIteratorTyped::Batch(batch) => batch.next_extended_user_key(), - RangeIteratorTyped::Concat(batch) => batch.next_extended_user_key(), - } - } - - fn current_epoch(&self) -> HummockEpoch { - match self { - RangeIteratorTyped::Sst(sst) => sst.current_epoch(), - RangeIteratorTyped::Batch(batch) => batch.current_epoch(), - RangeIteratorTyped::Concat(batch) => batch.current_epoch(), - } - } - - fn next(&mut self) -> Self::NextFuture<'_> { - async move { - match self { - RangeIteratorTyped::Sst(sst) => sst.next().await, - RangeIteratorTyped::Batch(batch) => batch.next().await, - RangeIteratorTyped::Concat(iter) => iter.next().await, - } - } - } - - fn rewind(&mut self) -> Self::RewindFuture<'_> { - async move { - match self { - RangeIteratorTyped::Sst(sst) => sst.rewind().await, - RangeIteratorTyped::Batch(batch) => batch.rewind().await, - RangeIteratorTyped::Concat(iter) => iter.rewind().await, - } - } - } - - fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> Self::SeekFuture<'_> { - async move { - match self { - RangeIteratorTyped::Sst(sst) => sst.seek(target_user_key).await, - RangeIteratorTyped::Batch(batch) => batch.seek(target_user_key).await, - RangeIteratorTyped::Concat(iter) => iter.seek(target_user_key).await, - } - } - } - - fn is_valid(&self) -> bool { - match self { - RangeIteratorTyped::Sst(sst) => sst.is_valid(), - RangeIteratorTyped::Batch(batch) => batch.is_valid(), - RangeIteratorTyped::Concat(iter) => iter.is_valid(), - } - } -} - -impl PartialEq for RangeIteratorTyped { - fn eq(&self, other: &Self) -> bool { - self.next_extended_user_key() - .eq(&other.next_extended_user_key()) - } -} - -impl PartialOrd for RangeIteratorTyped { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Eq for RangeIteratorTyped {} - -impl Ord for RangeIteratorTyped { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - other - .next_extended_user_key() - .cmp(&self.next_extended_user_key()) - } -} - -/// For each SST or batch delete range iterator, it represents the union set of delete ranges in the -/// corresponding SST/batch. Therefore delete ranges are then ordered and do not overlap with each -/// other in every `RangeIteratorTyped`. However, in each SST, since original delete ranges are -/// replaced with a union set of delete ranges, we lose exact information about whether a key -/// is deleted by a delete range in the same SST. Therefore we need to construct a -/// corresponding delete key (aka key tombstone) to represent this. 
-/// -/// In the `ForwardMergeRangeIterator`, assume that SST1 has delete range event -/// `<5, epoch1>`, `<8, epoch2>` and `<11, epoch3>` -/// and SST2 has delete range event -/// `<7, epoch4>`and `<9, epoch5>`. -/// Initially, `next_user_key` of `ForwardMergeRangeIterator` is 5, which is the earliest event and -/// current epochs is empty set at this time. -/// When `UserIterator` queries user key 5, `current_epochs` becomes `{epoch1}`, which means the key -/// fetched by `UserIterator` is deleted only if `key epoch <= epoch1 <= read_epoch`. -/// Simultaneously, `next_user_key` becomes 7, which means that the inequality will be kept until -/// the key fetched by `UserIterator` reaches user key 7. -/// For example, if the `UserIterator` queries user key 6 later, the delete condition is still -/// `key epoch <= epoch1 <= read_epoch`. -/// -/// When `UserIterator` queries user key 8, -/// `next_user_key` of SST1 is 11, `current_epoch` of SST1 is epoch2; -/// `next_user_key` of SST2 is 9, `current_epoch` of SST2 is epoch4; -/// Therefore `current_epochs` of `ForwardMergeRangeIterator` is `{epoch2, epoch4}`, -/// `next_user_key` of `ForwardMergeRangeIterator` is min(11, 9) == 9, -/// which means that `current_epochs` won't change until user key 9. -/// -/// We can then get the largest epoch which is not greater than read epoch in `{epoch2, epoch4}`. -/// The user key is deleted only if key epoch is below this epoch. -pub struct ForwardMergeRangeIterator { - heap: BinaryHeap, - unused_iters: Vec, - tmp_buffer: Vec, - read_epoch: HummockEpoch, - /// The correctness of the algorithm needs to be guaranteed by "the epoch of the - /// intervals covering each other must be different". - current_epochs: BTreeSet, -} - -impl Default for ForwardMergeRangeIterator { - fn default() -> Self { - ForwardMergeRangeIterator::new(HummockEpoch::MAX) - } -} - -impl ForwardMergeRangeIterator { - pub fn new(read_epoch: HummockEpoch) -> Self { - Self { - heap: BinaryHeap::new(), - unused_iters: vec![], - tmp_buffer: vec![], - read_epoch, - current_epochs: BTreeSet::new(), - } - } - - pub fn add_batch_iter(&mut self, iter: SharedBufferDeleteRangeIterator) { - self.unused_iters.push(RangeIteratorTyped::Batch(iter)); - } - - pub fn add_sst_iter(&mut self, iter: SstableDeleteRangeIterator) { - self.unused_iters.push(RangeIteratorTyped::Sst(iter)); - } - - pub fn add_concat_iter(&mut self, sstables: Vec, sstable_store: SstableStoreRef) { - self.unused_iters - .push(RangeIteratorTyped::Concat(ConcatDeleteRangeIterator::new( - sstables, - sstable_store, - ))) - } -} - -impl ForwardMergeRangeIterator { - pub async fn next_until(&mut self, target_user_key: UserKey<&[u8]>) -> HummockResult<()> { - let target_extended_user_key = PointRange::from_user_key(target_user_key, false); - while self.is_valid() && self.next_extended_user_key().le(&target_extended_user_key) { - self.next().await?; - } - Ok(()) - } - - pub fn earliest_delete_since(&self, epoch: HummockEpoch) -> HummockEpoch { - self.current_epochs - .range(epoch..) 
- .next() - .map_or(HummockEpoch::MAX, |ret| *ret) - } - - pub fn earliest_epoch(&self) -> HummockEpoch { - self.current_epochs - .first() - .map_or(HummockEpoch::MAX, |epoch| *epoch) - } -} - -impl DeleteRangeIterator for ForwardMergeRangeIterator { - type NextFuture<'a> = impl Future> + 'a; - type RewindFuture<'a> = impl Future> + 'a; - type SeekFuture<'a> = impl Future> + 'a; - - fn next_extended_user_key(&self) -> PointRange<&[u8]> { - self.heap.peek().unwrap().next_extended_user_key() - } - - fn current_epoch(&self) -> HummockEpoch { - self.current_epochs - .range(..=self.read_epoch) - .last() - .map_or(HummockEpoch::MIN, |epoch| *epoch) - } - - fn next(&mut self) -> Self::NextFuture<'_> { - async { - self.tmp_buffer - .push(self.heap.pop().expect("no inner iter")); - while let Some(node) = self.heap.peek() - && node.is_valid() - && node.next_extended_user_key() == self.tmp_buffer[0].next_extended_user_key() - { - self.tmp_buffer.push(self.heap.pop().unwrap()); - } - for node in &self.tmp_buffer { - let epoch = node.current_epoch(); - if !is_max_epoch(epoch) { - self.current_epochs.remove(&epoch); - } - } - // Correct because ranges in an epoch won't intersect. - for mut node in std::mem::take(&mut self.tmp_buffer) { - node.next().await?; - if node.is_valid() { - let epoch = node.current_epoch(); - if !is_max_epoch(epoch) { - self.current_epochs.insert(epoch); - } - self.heap.push(node); - } else { - // Put back to `unused_iters` - self.unused_iters.push(node); - } - } - Ok(()) - } - } - - fn rewind(&mut self) -> Self::RewindFuture<'_> { - async move { - self.current_epochs.clear(); - self.unused_iters.extend(self.heap.drain()); - for mut node in self.unused_iters.drain(..) { - node.rewind().await?; - if node.is_valid() { - let epoch = node.current_epoch(); - if !is_max_epoch(epoch) { - self.current_epochs.insert(epoch); - } - self.heap.push(node); - } - } - Ok(()) - } - } - - fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> Self::SeekFuture<'_> { - async move { - self.current_epochs.clear(); - let mut iters = std::mem::take(&mut self.unused_iters); - iters.extend(self.heap.drain()); - for mut node in iters { - node.seek(target_user_key).await?; - if node.is_valid() { - let epoch = node.current_epoch(); - if !is_max_epoch(epoch) { - self.current_epochs.insert(epoch); - } - self.heap.push(node); - } else { - self.unused_iters.push(node); - } - } - Ok(()) - } - } - - fn is_valid(&self) -> bool { - self.heap - .peek() - .map(|node| node.is_valid()) - .unwrap_or(false) - } -} diff --git a/src/storage/src/hummock/iterator/mod.rs b/src/storage/src/hummock/iterator/mod.rs index fdfcd26a3a592..a205baac0aa96 100644 --- a/src/storage/src/hummock/iterator/mod.rs +++ b/src/storage/src/hummock/iterator/mod.rs @@ -46,15 +46,10 @@ use risingwave_hummock_sdk::EpochWithGap; use crate::hummock::iterator::HummockIteratorUnion::{First, Fourth, Second, Third}; pub mod change_log; -mod concat_delete_range_iterator; -mod delete_range_iterator; mod skip_watermark; #[cfg(any(test, feature = "test"))] pub mod test_utils; -pub use delete_range_iterator::{ - DeleteRangeIterator, ForwardMergeRangeIterator, RangeIteratorTyped, -}; use risingwave_common::catalog::TableId; pub use skip_watermark::*; diff --git a/src/storage/src/hummock/local_version/mod.rs b/src/storage/src/hummock/local_version/mod.rs index 578e123c6574e..4a45c8dc9075c 100644 --- a/src/storage/src/hummock/local_version/mod.rs +++ b/src/storage/src/hummock/local_version/mod.rs @@ -13,3 +13,4 @@ // limitations under the License. 
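// Editor's sketch (not part of the patch): the epoch bookkeeping that the removed
// `ForwardMergeRangeIterator` above relies on reduces to a `BTreeSet<u64>`.
// `earliest_delete_since(epoch)` is the smallest tracked epoch >= `epoch`, and
// `current_epoch(read_epoch)` is the largest tracked epoch <= `read_epoch`.
// `EpochSet` and `MAX_EPOCH` are hypothetical stand-ins for the Hummock types.
use std::collections::BTreeSet;

const MAX_EPOCH: u64 = u64::MAX;

struct EpochSet(BTreeSet<u64>);

impl EpochSet {
    /// First delete epoch at or after `epoch`, or MAX_EPOCH if none is tracked.
    fn earliest_delete_since(&self, epoch: u64) -> u64 {
        self.0.range(epoch..).next().copied().unwrap_or(MAX_EPOCH)
    }

    /// Largest delete epoch visible to a reader at `read_epoch`, or 0 if none.
    fn current_epoch(&self, read_epoch: u64) -> u64 {
        self.0.range(..=read_epoch).next_back().copied().unwrap_or(0)
    }
}

fn main() {
    // Overlapping delete ranges currently in effect at epochs 3, 7 and 11.
    let set = EpochSet(BTreeSet::from([3, 7, 11]));
    assert_eq!(set.earliest_delete_since(5), 7);
    assert_eq!(set.current_epoch(10), 7);
    assert_eq!(set.earliest_delete_since(12), MAX_EPOCH);
}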
pub mod pinned_version; +pub mod recent_versions; diff --git a/src/storage/src/hummock/local_version/pinned_version.rs b/src/storage/src/hummock/local_version/pinned_version.rs index 5ef53edcd26ef..afaafdf7cbe8a 100644 --- a/src/storage/src/hummock/local_version/pinned_version.rs +++ b/src/storage/src/hummock/local_version/pinned_version.rs @@ -92,22 +92,25 @@ impl PinnedVersion { } } - pub fn new_pin_version(&self, version: HummockVersion) -> Self { + pub fn new_pin_version(&self, version: HummockVersion) -> Option { assert!( version.id >= self.version.id, "pinning a older version {}. Current is {}", version.id, self.version.id ); + if version.id == self.version.id { + return None; + } let version_id = version.id; - PinnedVersion { + Some(PinnedVersion { version: Arc::new(version), guard: Arc::new(PinnedVersionGuard::new( version_id, self.guard.pinned_version_manager_tx.clone(), )), - } + }) } pub fn id(&self) -> HummockVersionId { diff --git a/src/storage/src/hummock/local_version/recent_versions.rs b/src/storage/src/hummock/local_version/recent_versions.rs new file mode 100644 index 0000000000000..8d3f1a015ad6a --- /dev/null +++ b/src/storage/src/hummock/local_version/recent_versions.rs @@ -0,0 +1,321 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::cmp::Ordering; + +use risingwave_common::catalog::TableId; +use risingwave_hummock_sdk::HummockEpoch; + +use crate::hummock::local_version::pinned_version::PinnedVersion; + +pub struct RecentVersions { + latest_version: PinnedVersion, + is_latest_committed: bool, + recent_versions: Vec, // earlier version at the front + max_version_num: usize, +} + +impl RecentVersions { + pub fn new(version: PinnedVersion, max_version_num: usize) -> Self { + assert!(max_version_num > 0); + Self { + latest_version: version, + is_latest_committed: true, // The first version is always treated as committed epochs + recent_versions: Vec::new(), + max_version_num, + } + } + + fn has_table_committed(&self, new_version: &PinnedVersion) -> bool { + let mut has_table_committed = false; + for (table_id, info) in new_version.version().state_table_info.info() { + if let Some(prev_info) = self + .latest_version + .version() + .state_table_info + .info() + .get(table_id) + { + match info.committed_epoch.cmp(&prev_info.committed_epoch) { + Ordering::Less => { + unreachable!( + "table {} has regress committed epoch {}, prev committed epoch {}", + table_id, info.committed_epoch, prev_info.committed_epoch + ); + } + Ordering::Equal => {} + Ordering::Greater => { + has_table_committed = true; + } + } + } else { + has_table_committed = true; + } + } + has_table_committed + } + + #[must_use] + pub fn with_new_version(&self, version: PinnedVersion) -> Self { + assert!(version.version().id > self.latest_version.version().id); + let is_committed = self.has_table_committed(&version); + let recent_versions = if self.is_latest_committed { + let prev_recent_versions = if self.recent_versions.len() >= self.max_version_num { + assert_eq!(self.recent_versions.len(), self.max_version_num); + &self.recent_versions[1..] + } else { + &self.recent_versions[..] + }; + let mut recent_versions = Vec::with_capacity(prev_recent_versions.len() + 1); + recent_versions.extend(prev_recent_versions.iter().cloned()); + recent_versions.push(self.latest_version.clone()); + recent_versions + } else { + self.recent_versions.clone() + }; + Self { + latest_version: version, + is_latest_committed: is_committed, + recent_versions, + max_version_num: self.max_version_num, + } + } + + pub fn latest_version(&self) -> &PinnedVersion { + &self.latest_version + } + + /// Return the latest version that is safe to read `epoch` on `table_id`. 
+ /// + /// `safe to read` means that the `committed_epoch` of the `table_id` in the version won't be greater than the given `epoch` + pub fn get_safe_version( + &self, + table_id: TableId, + epoch: HummockEpoch, + ) -> Option { + if let Some(info) = self + .latest_version + .version() + .state_table_info + .info() + .get(&table_id) + { + if info.committed_epoch <= epoch { + Some(self.latest_version.clone()) + } else { + self.get_safe_version_from_recent(table_id, epoch) + } + } else { + None + } + } + + fn get_safe_version_from_recent( + &self, + table_id: TableId, + epoch: HummockEpoch, + ) -> Option { + if cfg!(debug_assertions) { + assert!( + epoch + < self + .latest_version + .version() + .state_table_info + .info() + .get(&table_id) + .expect("should exist") + .committed_epoch + ); + } + let result = self.recent_versions.binary_search_by(|version| { + let committed_epoch = version + .version() + .state_table_info + .info() + .get(&table_id) + .map(|info| info.committed_epoch); + if let Some(committed_epoch) = committed_epoch { + committed_epoch.cmp(&epoch) + } else { + // We have ensured that the table_id exists in the latest version, so if the table_id does not exist in a + // previous version, the table must have not created yet, and therefore has less ordering. + Ordering::Less + } + }); + match result { + Ok(index) => Some(self.recent_versions[index].clone()), + Err(index) => { + // `index` is index of the first version that has `committed_epoch` greater than `epoch` + // or `index` equals `recent_version.len()` when `epoch` is greater than all `committed_epoch` + let version = if index >= self.recent_versions.len() { + assert_eq!(index, self.recent_versions.len()); + self.recent_versions.last().cloned() + } else if index == 0 { + // The earliest version has a higher committed epoch + None + } else { + self.recent_versions.get(index - 1).cloned() + }; + version.and_then(|version| { + if version + .version() + .state_table_info + .info() + .contains_key(&table_id) + { + Some(version) + } else { + // if the table does not exist in the version, return `None` to try get a time travel version + None + } + }) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use risingwave_common::catalog::TableId; + use risingwave_hummock_sdk::version::HummockVersion; + use risingwave_pb::hummock::{PbHummockVersion, StateTableInfo}; + use tokio::sync::mpsc::unbounded_channel; + + use crate::hummock::local_version::pinned_version::PinnedVersion; + use crate::hummock::local_version::recent_versions::RecentVersions; + + const TEST_TABLE_ID1: TableId = TableId::new(233); + const TEST_TABLE_ID2: TableId = TableId::new(234); + + fn gen_pin_version( + version_id: u64, + table_committed_epoch: impl IntoIterator, + ) -> PinnedVersion { + PinnedVersion::new( + HummockVersion::from_rpc_protobuf(&PbHummockVersion { + id: version_id, + state_table_info: HashMap::from_iter(table_committed_epoch.into_iter().map( + |(table_id, committed_epoch)| { + ( + table_id.table_id, + StateTableInfo { + committed_epoch, + safe_epoch: 0, + compaction_group_id: 0, + }, + ) + }, + )), + ..Default::default() + }), + unbounded_channel().0, + ) + } + + fn assert_query_equal( + recent_version: &RecentVersions, + expected: &[(TableId, u64, Option<&PinnedVersion>)], + ) { + for (table_id, epoch, expected_version) in expected + .iter() + .cloned() + .chain([(TEST_TABLE_ID1, 0, None), (TEST_TABLE_ID2, 0, None)]) + { + let version = recent_version.get_safe_version(table_id, epoch); + assert_eq!( + 
version.as_ref().map(|version| version.id()), + expected_version.map(|version| version.id()) + ); + } + } + + #[test] + fn test_basic() { + let epoch1 = 233; + let epoch0 = epoch1 - 1; + let epoch2 = epoch1 + 1; + let epoch3 = epoch2 + 1; + let epoch4 = epoch3 + 1; + let version1 = gen_pin_version(1, [(TEST_TABLE_ID1, epoch1)]); + // with at most 2 historical versions + let recent_versions = RecentVersions::new(version1.clone(), 2); + assert!(recent_versions.recent_versions.is_empty()); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version1)), + ], + ); + + let recent_versions = + recent_versions.with_new_version(gen_pin_version(2, [(TEST_TABLE_ID1, epoch1)])); + assert_eq!(recent_versions.recent_versions.len(), 1); + assert!(!recent_versions.is_latest_committed); + + let version3 = gen_pin_version(3, [(TEST_TABLE_ID1, epoch2)]); + let recent_versions = recent_versions.with_new_version(version3.clone()); + assert_eq!(recent_versions.recent_versions.len(), 1); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version3)), + (TEST_TABLE_ID1, epoch3, Some(&version3)), + ], + ); + + let version4 = gen_pin_version(4, [(TEST_TABLE_ID2, epoch1), (TEST_TABLE_ID1, epoch2)]); + let recent_versions = recent_versions.with_new_version(version4.clone()); + assert_eq!(recent_versions.recent_versions.len(), 2); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, Some(&version1)), + (TEST_TABLE_ID1, epoch2, Some(&version4)), + (TEST_TABLE_ID1, epoch3, Some(&version4)), + (TEST_TABLE_ID2, epoch0, None), + (TEST_TABLE_ID2, epoch1, Some(&version4)), + (TEST_TABLE_ID2, epoch2, Some(&version4)), + ], + ); + + let version5 = gen_pin_version(5, [(TEST_TABLE_ID2, epoch1), (TEST_TABLE_ID1, epoch3)]); + let recent_versions = recent_versions.with_new_version(version5.clone()); + assert_eq!(recent_versions.recent_versions.len(), 2); + assert!(recent_versions.is_latest_committed); + assert_query_equal( + &recent_versions, + &[ + (TEST_TABLE_ID1, epoch0, None), + (TEST_TABLE_ID1, epoch1, None), + (TEST_TABLE_ID1, epoch2, Some(&version4)), + (TEST_TABLE_ID1, epoch3, Some(&version5)), + (TEST_TABLE_ID1, epoch4, Some(&version5)), + (TEST_TABLE_ID2, epoch0, None), + (TEST_TABLE_ID2, epoch1, Some(&version5)), + (TEST_TABLE_ID2, epoch2, Some(&version5)), + ], + ); + } +} diff --git a/src/storage/src/hummock/mod.rs b/src/storage/src/hummock/mod.rs index f10b6deee503e..8c5410f5c4cde 100644 --- a/src/storage/src/hummock/mod.rs +++ b/src/storage/src/hummock/mod.rs @@ -90,16 +90,6 @@ pub async fn get_from_sstable_info( local_stats, ) { - if !read_options.ignore_range_tombstone { - let delete_epoch = get_min_delete_range_epoch_from_sstable(&sstable, full_key.user_key); - if delete_epoch <= full_key.epoch_with_gap.pure_epoch() { - return Ok(Some(( - HummockValue::Delete, - EpochWithGap::new_from_epoch(delete_epoch), - ))); - } - } - return Ok(None); } @@ -113,17 +103,6 @@ pub async fn get_from_sstable_info( iter.seek(full_key).await?; // Iterator has sought passed the borders. 
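// Editor's sketch (not part of the patch): how `get_safe_version_from_recent`
// in the new recent_versions.rs above interprets `binary_search_by`. Recent
// versions are ordered by the table's committed epoch; the safe version for a
// read `epoch` is the last one whose committed epoch is <= `epoch`. The helper
// name `safe_index` and the plain u64 epochs are simplifications.
fn safe_index(committed_epochs: &[u64], epoch: u64) -> Option<usize> {
    match committed_epochs.binary_search(&epoch) {
        // Exact hit: that version committed precisely at `epoch`.
        Ok(i) => Some(i),
        // `i` is the first version whose committed epoch is greater than `epoch`.
        Err(0) => None,        // even the earliest retained version is too new
        Err(i) => Some(i - 1), // last version not newer than `epoch`
    }
}

fn main() {
    let committed = [10, 20, 30];
    assert_eq!(safe_index(&committed, 20), Some(1));
    assert_eq!(safe_index(&committed, 25), Some(1));
    assert_eq!(safe_index(&committed, 5), None); // caller falls back to time travel
    assert_eq!(safe_index(&committed, 99), Some(2));
}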
if !iter.is_valid() { - if !read_options.ignore_range_tombstone { - let delete_epoch = - get_min_delete_range_epoch_from_sstable(iter.sst(), full_key.user_key); - if delete_epoch <= full_key.epoch_with_gap.pure_epoch() { - return Ok(Some(( - HummockValue::Delete, - EpochWithGap::new_from_epoch(delete_epoch), - ))); - } - } - return Ok(None); } @@ -131,16 +110,6 @@ pub async fn get_from_sstable_info( // or key next to it. let value = if iter.key().user_key == full_key.user_key { Some((iter.value().to_bytes(), iter.key().epoch_with_gap)) - } else if !read_options.ignore_range_tombstone { - let delete_epoch = get_min_delete_range_epoch_from_sstable(iter.sst(), full_key.user_key); - if delete_epoch <= full_key.epoch_with_gap.pure_epoch() { - Some(( - HummockValue::Delete, - EpochWithGap::new_from_epoch(delete_epoch), - )) - } else { - None - } } else { None }; diff --git a/src/storage/src/hummock/shared_buffer/shared_buffer_batch.rs b/src/storage/src/hummock/shared_buffer/shared_buffer_batch.rs index 1f8b17fb6c662..53fccc922b2bc 100644 --- a/src/storage/src/hummock/shared_buffer/shared_buffer_batch.rs +++ b/src/storage/src/hummock/shared_buffer/shared_buffer_batch.rs @@ -14,7 +14,6 @@ use std::cmp::Ordering; use std::fmt::Debug; -use std::future::Future; use std::marker::PhantomData; use std::mem::size_of_val; use std::ops::Bound::Included; @@ -27,16 +26,15 @@ use bytes::Bytes; use prometheus::IntGauge; use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; -use risingwave_hummock_sdk::key::{FullKey, PointRange, TableKey, TableKeyRange, UserKey}; +use risingwave_hummock_sdk::key::{FullKey, TableKey, TableKeyRange, UserKey}; use risingwave_hummock_sdk::EpochWithGap; use crate::hummock::iterator::{ - Backward, DeleteRangeIterator, DirectionEnum, Forward, HummockIterator, - HummockIteratorDirection, ValueMeta, + Backward, DirectionEnum, Forward, HummockIterator, HummockIteratorDirection, ValueMeta, }; use crate::hummock::utils::{range_overlap, MemoryTracker}; use crate::hummock::value::HummockValue; -use crate::hummock::{HummockEpoch, HummockResult, MonotonicDeleteEvent}; +use crate::hummock::{HummockEpoch, HummockResult}; use crate::mem_table::ImmId; use crate::store::ReadOptions; @@ -844,122 +842,6 @@ impl HummockIterator } } -pub struct SharedBufferDeleteRangeIterator { - monotonic_tombstone_events: Vec, - next_idx: usize, -} - -impl SharedBufferDeleteRangeIterator { - #[cfg(any(test, feature = "test"))] - pub(crate) fn new( - epoch: HummockEpoch, - table_id: TableId, - delete_ranges: Vec<(Bound, Bound)>, - ) -> Self { - use itertools::Itertools; - let point_range_pairs = delete_ranges - .into_iter() - .map(|(left_bound, right_bound)| { - ( - match left_bound { - Bound::Excluded(x) => PointRange::from_user_key( - UserKey::new(table_id, TableKey(x.to_vec())), - true, - ), - Bound::Included(x) => PointRange::from_user_key( - UserKey::new(table_id, TableKey(x.to_vec())), - false, - ), - Bound::Unbounded => unreachable!(), - }, - match right_bound { - Bound::Excluded(x) => PointRange::from_user_key( - UserKey::new(table_id, TableKey(x.to_vec())), - false, - ), - Bound::Included(x) => PointRange::from_user_key( - UserKey::new(table_id, TableKey(x.to_vec())), - true, - ), - Bound::Unbounded => PointRange::from_user_key( - UserKey::new( - TableId::new(table_id.table_id() + 1), - TableKey::default(), - ), - false, - ), - }, - ) - }) - .collect_vec(); - let mut monotonic_tombstone_events = Vec::with_capacity(point_range_pairs.len() * 2); - for (start_point_range, 
end_point_range) in point_range_pairs { - monotonic_tombstone_events.push(MonotonicDeleteEvent { - event_key: start_point_range, - new_epoch: epoch, - }); - monotonic_tombstone_events.push(MonotonicDeleteEvent { - event_key: end_point_range, - new_epoch: HummockEpoch::MAX, - }); - } - Self { - monotonic_tombstone_events, - next_idx: 0, - } - } -} - -impl DeleteRangeIterator for SharedBufferDeleteRangeIterator { - type NextFuture<'a> = impl Future> + 'a; - type RewindFuture<'a> = impl Future> + 'a; - type SeekFuture<'a> = impl Future> + 'a; - - fn next_extended_user_key(&self) -> PointRange<&[u8]> { - self.monotonic_tombstone_events[self.next_idx] - .event_key - .as_ref() - } - - fn current_epoch(&self) -> HummockEpoch { - if self.next_idx > 0 { - self.monotonic_tombstone_events[self.next_idx - 1].new_epoch - } else { - HummockEpoch::MAX - } - } - - fn next(&mut self) -> Self::NextFuture<'_> { - async move { - self.next_idx += 1; - Ok(()) - } - } - - fn rewind(&mut self) -> Self::RewindFuture<'_> { - async move { - self.next_idx = 0; - Ok(()) - } - } - - fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> Self::SeekFuture<'a> { - async move { - let target_extended_user_key = PointRange::from_user_key(target_user_key, false); - self.next_idx = self.monotonic_tombstone_events.partition_point( - |MonotonicDeleteEvent { event_key, .. }| { - event_key.as_ref().le(&target_extended_user_key) - }, - ); - Ok(()) - } - } - - fn is_valid(&self) -> bool { - self.next_idx < self.monotonic_tombstone_events.len() - } -} - #[cfg(test)] mod tests { use std::ops::Bound::Excluded; diff --git a/src/storage/src/hummock/sstable/builder.rs b/src/storage/src/hummock/sstable/builder.rs index 32960c7b8f97d..49caf0ba02568 100644 --- a/src/storage/src/hummock/sstable/builder.rs +++ b/src/storage/src/hummock/sstable/builder.rs @@ -12,12 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
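// Editor's sketch (not part of the patch): the `seek` of the removed
// `SharedBufferDeleteRangeIterator` above boils down to `partition_point` over
// monotonically increasing event keys: the event at `idx - 1`, if any, is the
// one in effect at the target key. Keys and epochs are simplified to u64.
const MAX_EPOCH: u64 = u64::MAX;

struct Event {
    event_key: u64,
    new_epoch: u64,
}

/// Delete epoch covering `target`, or MAX_EPOCH if no range covers it.
fn delete_epoch_at(events: &[Event], target: u64) -> u64 {
    let idx = events.partition_point(|e| e.event_key <= target);
    if idx == 0 {
        MAX_EPOCH
    } else {
        events[idx - 1].new_epoch
    }
}

fn main() {
    // Range [5, 9) deleted at epoch 7: an enter event at 5 and an exit event at 9.
    let events = vec![
        Event { event_key: 5, new_epoch: 7 },
        Event { event_key: 9, new_epoch: MAX_EPOCH },
    ];
    assert_eq!(delete_epoch_at(&events, 4), MAX_EPOCH);
    assert_eq!(delete_epoch_at(&events, 6), 7);
    assert_eq!(delete_epoch_at(&events, 9), MAX_EPOCH);
}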
-use std::cmp; use std::collections::BTreeSet; use std::sync::Arc; use bytes::{Bytes, BytesMut}; -use risingwave_common::util::epoch::is_max_epoch; use risingwave_hummock_sdk::key::{user_key, FullKey, MAX_KEY_LEN}; use risingwave_hummock_sdk::key_range::KeyRange; use risingwave_hummock_sdk::sstable_info::SstableInfo; @@ -411,6 +409,7 @@ impl SstableBuilder { .map(|block_meta| block_meta.uncompressed_size as u64) .sum::(); + #[expect(deprecated)] let mut meta = SstableMeta { block_metas: self.block_metas, bloom_filter, @@ -450,21 +449,6 @@ impl SstableBuilder { ) }); - // Expand the epoch of the whole sst by tombstone epoch - let (tombstone_min_epoch, tombstone_max_epoch) = { - let mut tombstone_min_epoch = HummockEpoch::MAX; - let mut tombstone_max_epoch = u64::MIN; - - for monotonic_delete in &meta.monotonic_tombstone_events { - if !is_max_epoch(monotonic_delete.new_epoch) { - tombstone_min_epoch = cmp::min(tombstone_min_epoch, monotonic_delete.new_epoch); - tombstone_max_epoch = cmp::max(tombstone_max_epoch, monotonic_delete.new_epoch); - } - } - - (tombstone_min_epoch, tombstone_max_epoch) - }; - let (avg_key_size, avg_value_size) = if self.table_stats.is_empty() { (0, 0) } else { @@ -522,9 +506,9 @@ impl SstableBuilder { stale_key_count, total_key_count, uncompressed_file_size: uncompressed_file_size + meta.encoded_size() as u64, - min_epoch: cmp::min(min_epoch, tombstone_min_epoch), - max_epoch: cmp::max(max_epoch, tombstone_max_epoch), - range_tombstone_count: meta.monotonic_tombstone_events.len() as u64, + min_epoch, + max_epoch, + range_tombstone_count: 0, sst_size: meta.estimated_size as u64, }; tracing::trace!( diff --git a/src/storage/src/hummock/sstable/delete_range_aggregator.rs b/src/storage/src/hummock/sstable/delete_range_aggregator.rs deleted file mode 100644 index a0c01d8cb80ea..0000000000000 --- a/src/storage/src/hummock/sstable/delete_range_aggregator.rs +++ /dev/null @@ -1,475 +0,0 @@ -// Copyright 2024 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::future::Future; - -#[cfg(test)] -use risingwave_common::util::epoch::is_max_epoch; -use risingwave_hummock_sdk::key::{PointRange, UserKey}; -use risingwave_hummock_sdk::HummockEpoch; - -use super::MonotonicDeleteEvent; -use crate::hummock::iterator::{DeleteRangeIterator, ForwardMergeRangeIterator}; -use crate::hummock::sstable_store::TableHolder; -use crate::hummock::{HummockResult, Sstable}; - -pub struct CompactionDeleteRangeIterator { - inner: ForwardMergeRangeIterator, -} - -impl CompactionDeleteRangeIterator { - pub fn new(inner: ForwardMergeRangeIterator) -> Self { - Self { inner } - } - - pub async fn next(&mut self) -> HummockResult<()> { - self.inner.next().await - } - - #[cfg(test)] - pub async fn get_tombstone_between( - self, - smallest_user_key: UserKey<&[u8]>, - largest_user_key: UserKey<&[u8]>, - ) -> HummockResult> { - let mut iter = self; - iter.seek(smallest_user_key).await?; - let extended_smallest_user_key = PointRange::from_user_key(smallest_user_key, false); - let extended_largest_user_key = PointRange::from_user_key(largest_user_key, false); - let mut monotonic_events = vec![]; - if !is_max_epoch(iter.earliest_epoch()) { - monotonic_events.push(MonotonicDeleteEvent { - event_key: extended_smallest_user_key.to_vec(), - new_epoch: iter.earliest_epoch(), - }); - } - - while iter.is_valid() { - if !extended_largest_user_key.is_empty() && iter.key().ge(&extended_largest_user_key) { - if !monotonic_events.is_empty() { - monotonic_events.push(MonotonicDeleteEvent { - event_key: extended_largest_user_key.to_vec(), - new_epoch: HummockEpoch::MAX, - }); - } - break; - } - - let event_key = iter.key().to_vec(); - iter.next().await?; - - monotonic_events.push(MonotonicDeleteEvent { - new_epoch: iter.earliest_epoch(), - event_key, - }); - } - - monotonic_events.dedup_by(|a, b| { - a.event_key.left_user_key.table_id == b.event_key.left_user_key.table_id - && a.new_epoch == b.new_epoch - }); - if !monotonic_events.is_empty() { - assert!(!is_max_epoch(monotonic_events.first().unwrap().new_epoch)); - assert!(is_max_epoch(monotonic_events.last().unwrap().new_epoch)); - } - Ok(monotonic_events) - } - - /// Return the earliest range-tombstone which deletes target-key. - /// Target-key must be given in order. - #[cfg(test)] - pub async fn earliest_delete_which_can_see_key_for_test( - &mut self, - target_user_key: UserKey<&[u8]>, - epoch: HummockEpoch, - ) -> HummockResult { - let target_extended_user_key = PointRange::from_user_key(target_user_key, false); - while self.inner.is_valid() - && self - .inner - .next_extended_user_key() - .le(&target_extended_user_key) - { - self.inner.next().await?; - } - Ok(self.earliest_delete_since(epoch)) - } - - pub fn key(&self) -> PointRange<&[u8]> { - self.inner.next_extended_user_key() - } - - pub fn is_valid(&self) -> bool { - self.inner.is_valid() - } - - pub fn earliest_epoch(&self) -> HummockEpoch { - self.inner.earliest_epoch() - } - - pub fn earliest_delete_since(&self, epoch: HummockEpoch) -> HummockEpoch { - self.inner.earliest_delete_since(epoch) - } - - /// seek to the first key which larger than `target_user_key`. 
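// Editor's sketch (not part of the patch): `get_tombstone_between` above keeps
// only the first of consecutive events that carry the same epoch, because only
// that one changes the deletion state (the real code also requires matching
// table ids). `dedup_by` drops an element when the closure returns true for it
// and the previously retained element.
fn main() {
    // (event_key, new_epoch) pairs, simplified to u64.
    let mut events = vec![(1u64, 9u64), (3, 9), (5, u64::MAX)];
    events.dedup_by(|a, b| a.1 == b.1);
    assert_eq!(events, vec![(1, 9), (5, u64::MAX)]);
}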
- pub async fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> HummockResult<()> { - self.inner.seek(target_user_key).await - } - - pub async fn rewind(&mut self) -> HummockResult<()> { - self.inner.rewind().await - } -} - -pub struct SstableDeleteRangeIterator { - table: TableHolder, - next_idx: usize, -} - -impl SstableDeleteRangeIterator { - pub fn new(table: TableHolder) -> Self { - Self { table, next_idx: 0 } - } - - /// Retrieves whether `next_extended_user_key` is the last range of this SST file. - /// - /// Note: - /// - Before calling this function, makes sure the iterator `is_valid`. - /// - This function should return immediately. - /// - /// # Panics - /// This function will panic if the iterator is invalid. - pub fn is_last_range(&self) -> bool { - debug_assert!(self.next_idx < self.table.meta.monotonic_tombstone_events.len()); - self.next_idx + 1 == self.table.meta.monotonic_tombstone_events.len() - } -} - -impl DeleteRangeIterator for SstableDeleteRangeIterator { - type NextFuture<'a> = impl Future> + 'a; - type RewindFuture<'a> = impl Future> + 'a; - type SeekFuture<'a> = impl Future> + 'a; - - fn next_extended_user_key(&self) -> PointRange<&[u8]> { - self.table.meta.monotonic_tombstone_events[self.next_idx] - .event_key - .as_ref() - } - - fn current_epoch(&self) -> HummockEpoch { - if self.next_idx > 0 { - self.table.meta.monotonic_tombstone_events[self.next_idx - 1].new_epoch - } else { - HummockEpoch::MAX - } - } - - fn next(&mut self) -> Self::NextFuture<'_> { - async move { - self.next_idx += 1; - Ok(()) - } - } - - fn rewind(&mut self) -> Self::RewindFuture<'_> { - async move { - self.next_idx = 0; - Ok(()) - } - } - - fn seek<'a>(&'a mut self, target_user_key: UserKey<&'a [u8]>) -> Self::SeekFuture<'_> { - async move { - let target_extended_user_key = PointRange::from_user_key(target_user_key, false); - self.next_idx = self.table.meta.monotonic_tombstone_events.partition_point( - |MonotonicDeleteEvent { event_key, .. }| { - event_key.as_ref().le(&target_extended_user_key) - }, - ); - Ok(()) - } - } - - fn is_valid(&self) -> bool { - self.next_idx < self.table.meta.monotonic_tombstone_events.len() - } -} - -pub fn get_min_delete_range_epoch_from_sstable( - table: &Sstable, - query_user_key: UserKey<&[u8]>, -) -> HummockEpoch { - let query_extended_user_key = PointRange::from_user_key(query_user_key, false); - let idx = table.meta.monotonic_tombstone_events.partition_point( - |MonotonicDeleteEvent { event_key, .. 
}| event_key.as_ref().le(&query_extended_user_key), - ); - if idx == 0 { - HummockEpoch::MAX - } else { - table.meta.monotonic_tombstone_events[idx - 1].new_epoch - } -} - -#[cfg(test)] -mod tests { - use std::ops::Bound; - - use bytes::Bytes; - use risingwave_common::catalog::TableId; - use risingwave_common::util::epoch::test_epoch; - - use super::*; - use crate::hummock::test_utils::delete_range::CompactionDeleteRangesBuilder; - use crate::hummock::test_utils::test_user_key; - - #[tokio::test] - pub async fn test_compaction_delete_range_iterator() { - let mut builder = CompactionDeleteRangesBuilder::default(); - let table_id = TableId::default(); - builder.add_delete_events_for_test( - 9, - table_id, - vec![ - ( - Bound::Included(Bytes::copy_from_slice(b"aaaaaa")), - Bound::Excluded(Bytes::copy_from_slice(b"bbbddd")), - ), - ( - Bound::Included(Bytes::copy_from_slice(b"bbbfff")), - Bound::Excluded(Bytes::copy_from_slice(b"ffffff")), - ), - ( - Bound::Included(Bytes::copy_from_slice(b"gggggg")), - Bound::Excluded(Bytes::copy_from_slice(b"hhhhhh")), - ), - ], - ); - builder.add_delete_events_for_test( - 12, - table_id, - vec![( - Bound::Included(Bytes::copy_from_slice(b"aaaaaa")), - Bound::Excluded(Bytes::copy_from_slice(b"bbbccc")), - )], - ); - builder.add_delete_events_for_test( - 8, - table_id, - vec![( - Bound::Excluded(Bytes::copy_from_slice(b"bbbeee")), - Bound::Included(Bytes::copy_from_slice(b"eeeeee")), - )], - ); - builder.add_delete_events_for_test( - 6, - table_id, - vec![( - Bound::Included(Bytes::copy_from_slice(b"bbbaab")), - Bound::Excluded(Bytes::copy_from_slice(b"bbbdddf")), - )], - ); - builder.add_delete_events_for_test( - 7, - table_id, - vec![( - Bound::Excluded(Bytes::copy_from_slice(b"hhhhhh")), - Bound::Unbounded, - )], - ); - let mut iter = builder.build_for_compaction(); - iter.rewind().await.unwrap(); - - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbb").as_ref(), - test_epoch(13) - ) - .await - .unwrap(), - HummockEpoch::MAX, - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbb").as_ref(), - test_epoch(11) - ) - .await - .unwrap(), - test_epoch(12) - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbb").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - test_epoch(9) - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbbaaa").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - test_epoch(9) - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbbccd").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - test_epoch(9) - ); - - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbbddd").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - HummockEpoch::MAX, - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbbeee").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - HummockEpoch::MAX, - ); - - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"bbbeef").as_ref(), - test_epoch(10) - ) - .await - .unwrap(), - HummockEpoch::MAX, - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"eeeeee").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - test_epoch(8) - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"gggggg").as_ref(), - test_epoch(8) - ) - .await - .unwrap(), - test_epoch(9) - ); - assert_eq!( - 
iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"hhhhhh").as_ref(), - test_epoch(6) - ) - .await - .unwrap(), - HummockEpoch::MAX, - ); - assert_eq!( - iter.earliest_delete_which_can_see_key_for_test( - test_user_key(b"iiiiii").as_ref(), - test_epoch(6) - ) - .await - .unwrap(), - test_epoch(7) - ); - } - - #[tokio::test] - pub async fn test_delete_range_split() { - let table_id = TableId::default(); - let mut builder = CompactionDeleteRangesBuilder::default(); - builder.add_delete_events_for_test( - 13, - table_id, - vec![( - Bound::Included(Bytes::copy_from_slice(b"aaaa")), - Bound::Excluded(Bytes::copy_from_slice(b"cccc")), - )], - ); - builder.add_delete_events_for_test( - 10, - table_id, - vec![( - Bound::Excluded(Bytes::copy_from_slice(b"cccc")), - Bound::Excluded(Bytes::copy_from_slice(b"dddd")), - )], - ); - builder.add_delete_events_for_test( - 12, - table_id, - vec![( - Bound::Included(Bytes::copy_from_slice(b"cccc")), - Bound::Included(Bytes::copy_from_slice(b"eeee")), - )], - ); - builder.add_delete_events_for_test( - 15, - table_id, - vec![( - Bound::Excluded(Bytes::copy_from_slice(b"eeee")), - Bound::Excluded(Bytes::copy_from_slice(b"ffff")), - )], - ); - let compaction_delete_range = builder.build_for_compaction(); - let split_ranges = compaction_delete_range - .get_tombstone_between( - test_user_key(b"bbbb").as_ref(), - test_user_key(b"eeeeee").as_ref(), - ) - .await - .unwrap(); - assert_eq!(6, split_ranges.len()); - assert_eq!( - PointRange::from_user_key(test_user_key(b"bbbb"), false), - split_ranges[0].event_key - ); - assert_eq!( - PointRange::from_user_key(test_user_key(b"cccc"), false), - split_ranges[1].event_key - ); - assert_eq!( - PointRange::from_user_key(test_user_key(b"cccc"), true), - split_ranges[2].event_key - ); - assert_eq!( - PointRange::from_user_key(test_user_key(b"dddd"), false), - split_ranges[3].event_key - ); - assert_eq!( - PointRange::from_user_key(test_user_key(b"eeee"), true), - split_ranges[4].event_key - ); - assert_eq!( - PointRange::from_user_key(test_user_key(b"eeeeee"), false), - split_ranges[5].event_key - ); - } -} diff --git a/src/storage/src/hummock/sstable/forward_sstable_iterator.rs b/src/storage/src/hummock/sstable/forward_sstable_iterator.rs index fe258db352143..287c387dd3270 100644 --- a/src/storage/src/hummock/sstable/forward_sstable_iterator.rs +++ b/src/storage/src/hummock/sstable/forward_sstable_iterator.rs @@ -101,10 +101,6 @@ impl SstableIterator { } } - pub(crate) fn sst(&self) -> &TableHolder { - &self.sst - } - /// Seeks to a block, and then seeks to the key if `seek_key` is given. 
async fn seek_idx( &mut self, diff --git a/src/storage/src/hummock/sstable/mod.rs b/src/storage/src/hummock/sstable/mod.rs index 1125cc919bd58..8460f179d9527 100644 --- a/src/storage/src/hummock/sstable/mod.rs +++ b/src/storage/src/hummock/sstable/mod.rs @@ -18,7 +18,7 @@ mod block; use std::collections::HashSet; -use std::fmt::{Debug, Display, Formatter}; +use std::fmt::{Debug, Formatter}; use std::ops::{BitXor, Bound, Range}; pub use block::*; @@ -43,20 +43,13 @@ pub use forward_sstable_iterator::*; use tracing::warn; mod backward_sstable_iterator; pub use backward_sstable_iterator::*; -use risingwave_hummock_sdk::key::{ - FullKey, KeyPayloadType, PointRange, TableKey, UserKey, UserKeyRangeRef, -}; +use risingwave_hummock_sdk::key::{FullKey, KeyPayloadType, UserKey, UserKeyRangeRef}; use risingwave_hummock_sdk::{HummockEpoch, HummockSstableObjectId}; -mod delete_range_aggregator; mod filter; mod sstable_object_id_manager; mod utils; -pub use delete_range_aggregator::{ - get_min_delete_range_epoch_from_sstable, CompactionDeleteRangeIterator, - SstableDeleteRangeIterator, -}; pub use filter::FilterBuilder; pub use sstable_object_id_manager::*; pub use utils::CompressionAlgorithm; @@ -72,69 +65,6 @@ const MAGIC: u32 = 0x5785ab73; const OLD_VERSION: u32 = 1; const VERSION: u32 = 2; -#[derive(Clone, PartialEq, Eq, Debug)] -// delete keys located in [start_user_key, end_user_key) -pub struct DeleteRangeTombstone { - pub start_user_key: PointRange>, - pub end_user_key: PointRange>, - pub sequence: HummockEpoch, -} - -impl PartialOrd for DeleteRangeTombstone { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for DeleteRangeTombstone { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.start_user_key - .cmp(&other.start_user_key) - .then_with(|| self.end_user_key.cmp(&other.end_user_key)) - .then_with(|| other.sequence.cmp(&self.sequence)) - } -} - -impl DeleteRangeTombstone { - pub fn new( - table_id: TableId, - start_table_key: Vec, - is_left_open: bool, - end_table_key: Vec, - is_right_close: bool, - sequence: HummockEpoch, - ) -> Self { - Self { - start_user_key: PointRange::from_user_key( - UserKey::new(table_id, TableKey(start_table_key)), - is_left_open, - ), - end_user_key: PointRange::from_user_key( - UserKey::new(table_id, TableKey(end_table_key)), - is_right_close, - ), - sequence, - } - } - - #[cfg(test)] - pub fn new_for_test( - table_id: TableId, - start_table_key: Vec, - end_table_key: Vec, - sequence: HummockEpoch, - ) -> Self { - Self::new( - table_id, - start_table_key, - false, - end_table_key, - false, - sequence, - ) - } -} - /// Assume that watermark1 is 5, watermark2 is 7, watermark3 is 11, delete ranges /// `{ [0, wmk1) in epoch1, [wmk1, wmk2) in epoch2, [wmk2, wmk3) in epoch3 }` /// can be transformed into events below: @@ -148,24 +78,14 @@ impl DeleteRangeTombstone { /// next event key wmk2 (7) (not inclusive). /// If there is no range deletes between current event key and next event key, `new_epoch` will be /// `HummockEpoch::MAX`. 
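// Editor's sketch (not part of the patch): the watermark example from the doc
// comment above, spelled out with plain u64 keys and epochs. Each boundary key
// becomes one monotonic event, and the last boundary closes the final range
// with the max epoch.
const MAX_EPOCH: u64 = u64::MAX;

fn main() {
    let (epoch1, epoch2, epoch3) = (1u64, 2, 3);
    // Ranges [0, 5) @ epoch1, [5, 7) @ epoch2, [7, 11) @ epoch3 as (event_key, new_epoch):
    // `new_epoch` applies from its event key up to (not including) the next event key.
    let events = [(0u64, epoch1), (5, epoch2), (7, epoch3), (11, MAX_EPOCH)];
    // The epoch in effect at `key` comes from the last event whose key is <= `key`.
    let epoch_at = |key: u64| {
        let mut current = MAX_EPOCH;
        for &(event_key, new_epoch) in &events {
            if event_key <= key {
                current = new_epoch;
            }
        }
        current
    };
    assert_eq!(epoch_at(4), epoch1);     // inside [0, 5)
    assert_eq!(epoch_at(6), epoch2);     // inside [5, 7)
    assert_eq!(epoch_at(11), MAX_EPOCH); // past the last range
}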
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct MonotonicDeleteEvent { - pub event_key: PointRange>, + pub event_key: + risingwave_hummock_sdk::key::range_delete_backward_compatibility_serde_struct::PointRange, pub new_epoch: HummockEpoch, } impl MonotonicDeleteEvent { - #[cfg(test)] - pub fn new(table_id: TableId, event_key: Vec, new_epoch: HummockEpoch) -> Self { - Self { - event_key: PointRange::from_user_key( - UserKey::new(table_id, TableKey(event_key)), - false, - ), - new_epoch, - } - } - pub fn encode(&self, mut buf: impl BufMut) { self.event_key .left_user_key @@ -179,6 +99,7 @@ impl MonotonicDeleteEvent { } pub fn decode(buf: &mut &[u8]) -> Self { + use risingwave_hummock_sdk::key::range_delete_backward_compatibility_serde_struct::*; let user_key = UserKey::decode_length_prefixed(buf); let exclude_left_key_flag = buf.get_u8(); let is_exclude_left_key = match exclude_left_key_flag { @@ -188,26 +109,13 @@ impl MonotonicDeleteEvent { }; let new_epoch = buf.get_u64_le(); Self { - event_key: PointRange::from_user_key(user_key, is_exclude_left_key), + event_key: PointRange { + left_user_key: user_key, + is_exclude_left_key, + }, new_epoch, } } - - #[inline] - pub fn encoded_size(&self) -> usize { - // length prefixed requires 4B more than its `encoded_len()` - 4 + self.event_key.left_user_key.encoded_len() + 1 + 8 - } -} - -impl Display for MonotonicDeleteEvent { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Event key {:?} epoch {:?}", - self.event_key, self.new_epoch - ) - } } #[derive(Serialize, Deserialize)] @@ -387,6 +295,7 @@ pub struct SstableMeta { /// key wmk1 (5) and till the next event key wmk2 (7) (not inclusive). /// If there is no range deletes between current event key and next event key, `new_epoch` will /// be `HummockEpoch::MAX`. + #[deprecated] pub monotonic_tombstone_events: Vec, /// Format version, for further compatibility. 
pub version: u32, @@ -435,6 +344,7 @@ impl SstableMeta { buf.put_u32_le(self.key_count); put_length_prefixed_slice(&mut buf, &self.smallest_key); put_length_prefixed_slice(&mut buf, &self.largest_key); + #[expect(deprecated)] buf.put_u32_le( utils::checked_into_u32(self.monotonic_tombstone_events.len()).unwrap_or_else(|_| { let tmp_full_key = FullKey::decode(&self.smallest_key); @@ -445,6 +355,7 @@ impl SstableMeta { ) }), ); + #[expect(deprecated)] for monotonic_tombstone_event in &self.monotonic_tombstone_events { monotonic_tombstone_event.encode(&mut buf); } @@ -513,6 +424,7 @@ impl SstableMeta { "read non-empty range tombstones"); } + #[expect(deprecated)] Ok(Self { block_metas, bloom_filter, @@ -535,11 +447,6 @@ impl SstableMeta { .map(|block_meta| block_meta.encoded_size()) .sum::() + 4 // monotonic tombstone events len - + self - .monotonic_tombstone_events - .iter() - .map(|event| event.encoded_size()) - .sum::() + 4 // bloom filter len + self.bloom_filter.len() + 4 // estimated size @@ -585,6 +492,7 @@ mod tests { #[test] fn test_sstable_meta_enc_dec() { + #[expect(deprecated)] let meta = SstableMeta { block_metas: vec![ BlockMeta { diff --git a/src/storage/src/hummock/sstable/writer.rs b/src/storage/src/hummock/sstable/writer.rs index 99347411fef6e..9c0f3f06f1c28 100644 --- a/src/storage/src/hummock/sstable/writer.rs +++ b/src/storage/src/hummock/sstable/writer.rs @@ -324,7 +324,7 @@ impl SstableWriter for StreamingUploadWriter { t }); - assert!(!meta.block_metas.is_empty() || !meta.monotonic_tombstone_events.is_empty()); + assert!(!meta.block_metas.is_empty()); // Upload data to object store. self.object_uploader.finish().await?; @@ -496,6 +496,7 @@ mod tests { }); blocks.push(data.slice((i * 1000) as usize..((i + 1) * 1000) as usize)); } + #[expect(deprecated)] let meta = SstableMeta { block_metas, bloom_filter: vec![], diff --git a/src/storage/src/hummock/store/hummock_storage.rs b/src/storage/src/hummock/store/hummock_storage.rs index 82b98c5f4fb39..888de0db1af1c 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -14,7 +14,7 @@ use std::collections::HashSet; use std::future::Future; -use std::ops::{Bound, Deref}; +use std::ops::Bound; use std::sync::Arc; use arc_swap::ArcSwap; @@ -50,9 +50,10 @@ use crate::hummock::event_handler::{ }; use crate::hummock::iterator::change_log::ChangeLogIterator; use crate::hummock::local_version::pinned_version::{start_pinned_version_worker, PinnedVersion}; +use crate::hummock::local_version::recent_versions::RecentVersions; use crate::hummock::observer_manager::HummockObserverNode; use crate::hummock::time_travel_version_cache::SimpleTimeTravelVersionCache; -use crate::hummock::utils::{validate_safe_epoch, wait_for_epoch}; +use crate::hummock::utils::wait_for_epoch; use crate::hummock::write_limiter::{WriteLimiter, WriteLimiterRef}; use crate::hummock::{ HummockEpoch, HummockError, HummockResult, HummockStorageIterator, HummockStorageRevIterator, @@ -97,7 +98,7 @@ pub struct HummockStorage { version_update_notifier_tx: Arc>, - pinned_version: Arc>, + recent_versions: Arc>, hummock_version_reader: HummockVersionReader, @@ -223,7 +224,7 @@ impl HummockStorage { version_update_notifier_tx: hummock_event_handler.version_update_notifier_tx(), hummock_event_sender: event_tx.clone(), _version_update_sender: version_update_tx, - pinned_version: hummock_event_handler.pinned_version(), + recent_versions: hummock_event_handler.recent_versions(), hummock_version_reader: 
HummockVersionReader::new( sstable_store, state_store_metrics.clone(), @@ -260,15 +261,9 @@ impl HummockStorage { ) -> StorageResult> { let key_range = (Bound::Included(key.clone()), Bound::Included(key.clone())); - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? - }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; if is_empty_key_range(&key_range) { return Ok(None); @@ -285,15 +280,9 @@ impl HummockStorage { epoch: u64, read_options: ReadOptions, ) -> StorageResult { - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? - }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; self.hummock_version_reader .iter(key_range, epoch, read_options, read_version_tuple) @@ -306,27 +295,20 @@ impl HummockStorage { epoch: u64, read_options: ReadOptions, ) -> StorageResult { - let (key_range, read_version_tuple) = if read_options.read_version_from_time_travel { - self.build_read_version_by_time_travel(epoch, read_options.table_id, key_range) - .await? - } else if read_options.read_version_from_backup { - self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) - .await? - } else { - self.build_read_version_tuple(epoch, read_options.table_id, key_range)? 
- }; + let (key_range, read_version_tuple) = self + .build_read_version_tuple(epoch, key_range, &read_options) + .await?; self.hummock_version_reader .rev_iter(key_range, epoch, read_options, read_version_tuple, None) .await } - async fn build_read_version_by_time_travel( + async fn get_time_travel_version( &self, epoch: u64, table_id: TableId, - key_range: TableKeyRange, - ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + ) -> StorageResult { let fetch = async { let pb_version = self .hummock_meta_client @@ -335,7 +317,6 @@ impl HummockStorage { .inspect_err(|e| tracing::error!("{}", e.to_report_string())) .map_err(|e| HummockError::meta_error(e.to_report_string()))?; let version = HummockVersion::from_rpc_protobuf(&pb_version); - validate_safe_epoch(&version, table_id, epoch)?; let (tx, _rx) = unbounded_channel(); Ok(PinnedVersion::new(version, tx)) }; @@ -343,9 +324,24 @@ impl HummockStorage { .simple_time_travel_version_cache .get_or_insert(epoch, fetch) .await?; - Ok(get_committed_read_version_tuple( - version, table_id, key_range, epoch, - )) + Ok(version) + } + + async fn build_read_version_tuple( + &self, + epoch: u64, + key_range: TableKeyRange, + read_options: &ReadOptions, + ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + if read_options.read_version_from_backup { + self.build_read_version_tuple_from_backup(epoch, read_options.table_id, key_range) + .await + } else if read_options.read_committed { + self.build_read_version_tuple_from_committed(epoch, read_options.table_id, key_range) + .await + } else { + self.build_read_version_tuple_from_all(epoch, read_options.table_id, key_range) + } } async fn build_read_version_tuple_from_backup( @@ -359,16 +355,12 @@ impl HummockStorage { .try_get_hummock_version(table_id, epoch) .await { - Ok(Some(backup_version)) => { - validate_safe_epoch(backup_version.version(), table_id, epoch)?; - - Ok(get_committed_read_version_tuple( - backup_version, - table_id, - key_range, - epoch, - )) - } + Ok(Some(backup_version)) => Ok(get_committed_read_version_tuple( + backup_version, + table_id, + key_range, + epoch, + )), Ok(None) => Err(HummockError::read_backup_error(format!( "backup include epoch {} not found", epoch @@ -378,27 +370,47 @@ impl HummockStorage { } } - fn build_read_version_tuple( + async fn build_read_version_tuple_from_committed( &self, epoch: u64, table_id: TableId, key_range: TableKeyRange, ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { - let pinned_version = self.pinned_version.load(); - validate_safe_epoch(pinned_version.version(), table_id, epoch)?; - let table_committed_epoch = pinned_version + let version = match self + .recent_versions + .load() + .get_safe_version(table_id, epoch) + { + Some(version) => version, + None => self.get_time_travel_version(epoch, table_id).await?, + }; + Ok(get_committed_read_version_tuple( + version, table_id, key_range, epoch, + )) + } + + fn build_read_version_tuple_from_all( + &self, + epoch: u64, + table_id: TableId, + key_range: TableKeyRange, + ) -> StorageResult<(TableKeyRange, ReadVersionTuple)> { + let pinned_version = self.recent_versions.load().latest_version().clone(); + let info = pinned_version .version() .state_table_info .info() - .get(&table_id) - .map(|info| info.committed_epoch); + .get(&table_id); // check epoch if lower mce - let ret = if let Some(table_committed_epoch) = table_committed_epoch - && epoch <= table_committed_epoch + let ret = if let Some(info) = info + && epoch <= info.committed_epoch { + if epoch < info.safe_epoch { + return 
Err(HummockError::expired_epoch(table_id, info.safe_epoch, epoch).into()); + } // read committed_version directly without build snapshot - get_committed_read_version_tuple((**pinned_version).clone(), table_id, key_range, epoch) + get_committed_read_version_tuple(pinned_version, table_id, key_range, epoch) } else { let vnode = vnode(&key_range); let mut matched_replicated_read_version_cnt = 0; @@ -431,6 +443,7 @@ impl HummockStorage { // When the system has just started and no state has been created, the memory state // may be empty if read_version_vec.is_empty() { + let table_committed_epoch = info.map(|info| info.committed_epoch); if matched_replicated_read_version_cnt > 0 { tracing::warn!( "Read(table_id={} vnode={} epoch={}) is not allowed on replicated read version ({} found). Fall back to committed version (epoch={:?})", @@ -449,12 +462,7 @@ impl HummockStorage { table_committed_epoch ); } - get_committed_read_version_tuple( - (**pinned_version).clone(), - table_id, - key_range, - epoch, - ) + get_committed_read_version_tuple(pinned_version, table_id, key_range, epoch) } else { if read_version_vec.len() != 1 { let read_version_vnodes = read_version_vec @@ -538,7 +546,7 @@ impl HummockStorage { } pub fn get_pinned_version(&self) -> PinnedVersion { - self.pinned_version.load().deref().deref().clone() + self.recent_versions.load().latest_version().clone() } pub fn backup_reader(&self) -> BackupReaderRef { @@ -604,7 +612,7 @@ impl StateStoreRead for HummockStorage { key_range: TableKeyRange, options: ReadLogOptions, ) -> StorageResult { - let version = (**self.pinned_version.load()).clone(); + let version = self.recent_versions.load().latest_version().clone(); let iter = self .hummock_version_reader .iter_log(version, epoch_range, key_range, options) @@ -653,16 +661,8 @@ impl HummockStorage { pub async fn seal_and_sync_epoch( &self, epoch: u64, + table_ids: HashSet, ) -> StorageResult { - let table_ids = self - .pinned_version - .load() - .version() - .state_table_info - .info() - .keys() - .cloned() - .collect(); self.sync(epoch, table_ids).await } @@ -675,7 +675,7 @@ impl HummockStorage { .send(HummockVersionUpdate::PinnedVersion(Box::new(version))) .unwrap(); loop { - if self.pinned_version.load().id() >= version_id { + if self.recent_versions.load().latest_version().id() >= version_id { break; } @@ -686,7 +686,7 @@ impl HummockStorage { pub async fn wait_version(&self, version: HummockVersion) { use tokio::task::yield_now; loop { - if self.pinned_version.load().id() >= version.id { + if self.recent_versions.load().latest_version().id() >= version.id { break; } @@ -736,7 +736,7 @@ impl HummockStorage { pub async fn wait_version_update(&self, old_id: HummockVersionId) -> HummockVersionId { use tokio::task::yield_now; loop { - let cur_id = self.pinned_version.load().id(); + let cur_id = self.recent_versions.load().latest_version().id(); if cur_id > old_id { return cur_id; } diff --git a/src/storage/src/hummock/test_utils.rs b/src/storage/src/hummock/test_utils.rs index 8d96e29f5426d..7f3bad46b644a 100644 --- a/src/storage/src/hummock/test_utils.rs +++ b/src/storage/src/hummock/test_utils.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
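// Editor's sketch (not part of the patch): the epoch checks that
// `build_read_version_tuple_from_all` above performs, reduced to a pure
// function. `ReadPath` and `pick_read_path` are hypothetical names; the real
// code returns a read-version tuple or a storage error rather than an enum.
#[derive(Debug, PartialEq)]
enum ReadPath {
    Expired,   // epoch < safe_epoch: rejected with an expired-epoch error
    Committed, // safe_epoch <= epoch <= committed_epoch: serve from the committed version
    Snapshot,  // epoch > committed_epoch: merge in uncommitted local read versions
}

fn pick_read_path(epoch: u64, safe_epoch: u64, committed_epoch: u64) -> ReadPath {
    if epoch <= committed_epoch {
        if epoch < safe_epoch {
            ReadPath::Expired
        } else {
            ReadPath::Committed
        }
    } else {
        ReadPath::Snapshot
    }
}

fn main() {
    assert_eq!(pick_read_path(5, 3, 10), ReadPath::Committed);
    assert_eq!(pick_read_path(2, 3, 10), ReadPath::Expired);
    assert_eq!(pick_read_path(12, 3, 10), ReadPath::Snapshot);
}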
-use std::collections::BTreeSet; -use std::ops::Bound; use std::sync::Arc; use bytes::Bytes; @@ -26,26 +24,23 @@ use risingwave_common::catalog::TableId; use risingwave_common::config::EvictionConfig; use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::test_epoch; -use risingwave_hummock_sdk::key::{FullKey, PointRange, TableKey, UserKey}; +use risingwave_hummock_sdk::key::{FullKey, TableKey, UserKey}; use risingwave_hummock_sdk::key_range::KeyRange; use risingwave_hummock_sdk::sstable_info::SstableInfo; use risingwave_hummock_sdk::{EpochWithGap, HummockEpoch, HummockSstableObjectId}; use super::iterator::test_utils::iterator_test_table_key_of; use super::{ - HummockResult, InMemWriter, MonotonicDeleteEvent, SstableMeta, SstableWriterOptions, - DEFAULT_RESTART_INTERVAL, + HummockResult, InMemWriter, SstableMeta, SstableWriterOptions, DEFAULT_RESTART_INTERVAL, }; use crate::filter_key_extractor::{FilterKeyExtractorImpl, FullKeyFilterKeyExtractor}; -use crate::hummock::iterator::ForwardMergeRangeIterator; use crate::hummock::shared_buffer::shared_buffer_batch::{ SharedBufferBatch, SharedBufferItem, SharedBufferValue, }; use crate::hummock::value::HummockValue; use crate::hummock::{ - BlockedXor16FilterBuilder, CachePolicy, CompactionDeleteRangeIterator, DeleteRangeTombstone, - FilterBuilder, LruCache, Sstable, SstableBuilder, SstableBuilderOptions, SstableStoreRef, - SstableWriter, TableHolder, Xor16FilterBuilder, + BlockedXor16FilterBuilder, CachePolicy, FilterBuilder, LruCache, Sstable, SstableBuilder, + SstableBuilderOptions, SstableStoreRef, SstableWriter, TableHolder, Xor16FilterBuilder, }; use crate::monitor::StoreLocalStatistic; use crate::opts::StorageOpts; @@ -387,167 +382,3 @@ where .await .unwrap() } - -pub mod delete_range { - use super::*; - use crate::hummock::shared_buffer::shared_buffer_batch::SharedBufferDeleteRangeIterator; - - #[derive(Default)] - pub struct CompactionDeleteRangesBuilder { - iter: ForwardMergeRangeIterator, - } - - impl CompactionDeleteRangesBuilder { - pub fn add_delete_events_for_test( - &mut self, - epoch: HummockEpoch, - table_id: TableId, - delete_ranges: Vec<(Bound, Bound)>, - ) { - self.iter - .add_batch_iter(SharedBufferDeleteRangeIterator::new( - test_epoch(epoch), - table_id, - delete_ranges, - )); - } - - pub fn build_for_compaction(self) -> CompactionDeleteRangeIterator { - CompactionDeleteRangeIterator::new(self.iter) - } - } - - /// Assume that watermark1 is 5, watermark2 is 7, watermark3 is 11, delete ranges - /// `{ [0, wmk1) in epoch1, [wmk1, wmk2) in epoch2, [wmk2, wmk3) in epoch3 }` - /// can be transformed into events below: - /// `{ <0, +epoch1> }` - fn build_events( - delete_tombstones: &Vec, - ) -> Vec { - let tombstone_len = delete_tombstones.len(); - let mut events = Vec::with_capacity(tombstone_len * 2); - for DeleteRangeTombstone { - start_user_key, - end_user_key, - sequence, - } in delete_tombstones - { - events.push((start_user_key, 1, *sequence)); - events.push((end_user_key, 0, *sequence)); - } - events.sort(); - - let mut result = Vec::with_capacity(events.len()); - for (user_key, group) in &events.into_iter().group_by(|(user_key, _, _)| *user_key) { - let (mut exit, mut enter) = (vec![], vec![]); - for (_, op, sequence) in group { - match op { - 0 => exit.push(TombstoneEnterExitEvent { - tombstone_epoch: sequence, - }), - 1 => { - enter.push(TombstoneEnterExitEvent { - tombstone_epoch: sequence, - }); - } - _ => unreachable!(), - } - } - result.push((user_key.clone(), exit, enter)); - } - - result 
- } - - pub fn create_monotonic_events( - mut delete_range_tombstones: Vec, - ) -> Vec { - delete_range_tombstones.sort(); - let events = build_events(&delete_range_tombstones); - create_monotonic_events_from_compaction_delete_events(events) - } - - fn create_monotonic_events_from_compaction_delete_events( - compaction_delete_range_events: Vec, - ) -> Vec { - let mut epochs = BTreeSet::new(); - let mut monotonic_tombstone_events = - Vec::with_capacity(compaction_delete_range_events.len()); - for event in compaction_delete_range_events { - apply_event(&mut epochs, &event); - monotonic_tombstone_events.push(MonotonicDeleteEvent { - event_key: event.0, - new_epoch: epochs.first().map_or(HummockEpoch::MAX, |epoch| *epoch), - }); - } - monotonic_tombstone_events.dedup_by(|a, b| { - a.event_key.left_user_key.table_id == b.event_key.left_user_key.table_id - && a.new_epoch == b.new_epoch - }); - monotonic_tombstone_events - } - - #[derive(Clone)] - #[cfg(any(test, feature = "test"))] - pub struct TombstoneEnterExitEvent { - pub(crate) tombstone_epoch: HummockEpoch, - } - - #[cfg(any(test, feature = "test"))] - pub type CompactionDeleteRangeEvent = ( - // event key - PointRange>, - // Old tombstones which exits at the event key - Vec, - // New tombstones which enters at the event key - Vec, - ); - /// We introduce `event` to avoid storing excessive range tombstones after compaction if there are - /// overlaps among range tombstones among different SSTs/batchs in compaction. - /// The core idea contains two parts: - /// 1) we only need to keep the smallest epoch of the overlapping - /// range tomstone intervals since the key covered by the range tombstone in lower level must have - /// smaller epoches; - /// 2) due to 1), we lose the information to delete a key by tombstone in a single - /// SST so we add a tombstone key in the data block. - /// We leverage `events` to calculate the epoch information mentioned above. - /// - /// e.g. Delete range [1, 5) at epoch1, delete range [3, 7) at epoch2 and delete range [10, 12) at - /// epoch3 will first be transformed into `events` below: - /// - /// `<1, +epoch1> <5, -epoch1> <3, +epoch2> <7, -epoch2> <10, +epoch3> <12, -epoch3>` - /// - /// Then `events` are sorted by user key: - /// - /// `<1, +epoch1> <3, +epoch2> <5, -epoch1> <7, -epoch2> <10, +epoch3> <12, -epoch3>` - /// - /// We rely on the fact that keys met in compaction are in order. - /// - /// When user key 0 comes, no events have happened yet so no range delete epoch. (will be - /// represented as range delete epoch MAX EPOCH) - /// - /// When user key 1 comes, event `<1, +epoch1>` happens so there is currently one range delete - /// epoch: epoch1. - /// - /// When user key 2 comes, no more events happen so the set remains `{epoch1}`. - /// - /// When user key 3 comes, event `<3, +epoch2>` happens so the range delete epoch set is now - /// `{epoch1, epoch2}`. - /// - /// When user key 5 comes, event `<5, -epoch1>` happens so epoch1 exits the set, - /// therefore the current range delete epoch set is `{epoch2}`. - /// - /// When user key 11 comes, events `<7, -epoch2>` and `<10, +epoch3>` - /// both happen, one after another. The set changes to `{epoch3}` from `{epoch2}`. - pub fn apply_event(epochs: &mut BTreeSet, event: &CompactionDeleteRangeEvent) { - let (_, exit, enter) = event; - // Correct because ranges in an epoch won't intersect. 
- for TombstoneEnterExitEvent { tombstone_epoch } in exit { - epochs.remove(tombstone_epoch); - } - for TombstoneEnterExitEvent { tombstone_epoch } in enter { - epochs.insert(*tombstone_epoch); - } - } -} diff --git a/src/storage/src/hummock/utils.rs b/src/storage/src/hummock/utils.rs index 3f2d1f989f529..c2f6cbafed79b 100644 --- a/src/storage/src/hummock/utils.rs +++ b/src/storage/src/hummock/utils.rs @@ -30,11 +30,10 @@ use risingwave_hummock_sdk::key::{ bound_table_key_range, EmptySliceRef, FullKey, TableKey, UserKey, }; use risingwave_hummock_sdk::sstable_info::SstableInfo; -use risingwave_hummock_sdk::version::HummockVersion; use risingwave_hummock_sdk::{can_concat, HummockEpoch}; use tokio::sync::oneshot::{channel, Receiver, Sender}; -use super::{HummockError, HummockResult, SstableStoreRef}; +use super::{HummockError, SstableStoreRef}; use crate::error::StorageResult; use crate::hummock::CachePolicy; use crate::mem_table::{KeyOp, MemTableError}; @@ -72,24 +71,6 @@ where !too_left && !too_right } -pub fn validate_safe_epoch( - version: &HummockVersion, - table_id: TableId, - epoch: u64, -) -> HummockResult<()> { - if let Some(info) = version.state_table_info.info().get(&table_id) - && epoch < info.safe_epoch - { - return Err(HummockError::expired_epoch( - table_id, - info.safe_epoch, - epoch, - )); - } - - Ok(()) -} - pub fn filter_single_sst(info: &SstableInfo, table_id: TableId, table_key_range: &R) -> bool where R: RangeBounds>, diff --git a/src/storage/src/opts.rs b/src/storage/src/opts.rs index f6d6f31fb3a4f..a3a787f55c97d 100644 --- a/src/storage/src/opts.rs +++ b/src/storage/src/opts.rs @@ -63,6 +63,8 @@ pub struct StorageOpts { /// max memory usage for large query. pub prefetch_buffer_capacity_mb: usize, + pub max_cached_recent_versions_number: usize, + pub max_prefetch_block_number: usize, pub disable_remote_compactor: bool, @@ -170,6 +172,10 @@ impl From<(&RwConfig, &SystemParamsReader, &StorageMemoryConfig)> for StorageOpt meta_cache_shard_num: s.meta_cache_shard_num, meta_cache_eviction_config: s.meta_cache_eviction_config.clone(), prefetch_buffer_capacity_mb: s.prefetch_buffer_capacity_mb, + max_cached_recent_versions_number: c + .storage + .max_cached_recent_versions_number + .unwrap_or(60), max_prefetch_block_number: c.storage.max_prefetch_block_number, disable_remote_compactor: c.storage.disable_remote_compactor, share_buffer_upload_concurrency: c.storage.share_buffer_upload_concurrency, diff --git a/src/storage/src/store.rs b/src/storage/src/store.rs index 91f79231f6939..db21faa78c6cf 100644 --- a/src/storage/src/store.rs +++ b/src/storage/src/store.rs @@ -493,7 +493,6 @@ pub struct ReadOptions { /// If the `prefix_hint` is not None, it should be included in /// `key` or `key_range` in the read API. pub prefix_hint: Option, - pub ignore_range_tombstone: bool, pub prefetch_options: PrefetchOptions, pub cache_policy: CachePolicy, @@ -502,20 +501,19 @@ pub struct ReadOptions { /// Read from historical hummock version of meta snapshot backup. /// It should only be used by `StorageTable` for batch query. 
pub read_version_from_backup: bool, - pub read_version_from_time_travel: bool, + pub read_committed: bool, } impl From for ReadOptions { fn from(value: TracedReadOptions) -> Self { Self { prefix_hint: value.prefix_hint.map(|b| b.into()), - ignore_range_tombstone: value.ignore_range_tombstone, prefetch_options: value.prefetch_options.into(), cache_policy: value.cache_policy.into(), retention_seconds: value.retention_seconds, table_id: value.table_id.into(), read_version_from_backup: value.read_version_from_backup, - read_version_from_time_travel: value.read_version_from_time_travel, + read_committed: value.read_committed, } } } @@ -524,13 +522,12 @@ impl From for TracedReadOptions { fn from(value: ReadOptions) -> Self { Self { prefix_hint: value.prefix_hint.map(|b| b.into()), - ignore_range_tombstone: value.ignore_range_tombstone, prefetch_options: value.prefetch_options.into(), cache_policy: value.cache_policy.into(), retention_seconds: value.retention_seconds, table_id: value.table_id.into(), read_version_from_backup: value.read_version_from_backup, - read_version_from_time_travel: value.read_version_from_time_travel, + read_committed: value.read_committed, } } } diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index 7a0ad76cce4a5..254e8e73095b1 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -361,7 +361,10 @@ impl StorageTableInner { ) -> StorageResult> { let epoch = wait_epoch.get_epoch(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); - let read_time_travel = matches!(wait_epoch, HummockReadEpoch::TimeTravel(_)); + let read_committed = matches!( + wait_epoch, + HummockReadEpoch::TimeTravel(_) | HummockReadEpoch::Committed(_) + ); self.store.try_wait_epoch(wait_epoch).await?; let serialized_pk = serialize_pk_with_vnode( &pk, @@ -382,7 +385,7 @@ impl StorageTableInner { retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, read_version_from_backup: read_backup, - read_version_from_time_travel: read_time_travel, + read_committed, cache_policy: CachePolicy::Fill(CacheContext::Default), ..Default::default() }; @@ -487,17 +490,19 @@ impl StorageTableInner { let iterators: Vec<_> = try_join_all(table_key_ranges.map(|table_key_range| { let prefix_hint = prefix_hint.clone(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); - let read_time_travel = matches!(wait_epoch, HummockReadEpoch::TimeTravel(_)); + let read_committed = matches!( + wait_epoch, + HummockReadEpoch::TimeTravel(_) | HummockReadEpoch::Committed(_) + ); async move { let read_options = ReadOptions { prefix_hint, retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, read_version_from_backup: read_backup, - read_version_from_time_travel: read_time_travel, + read_committed, prefetch_options, cache_policy, - ..Default::default() }; let pk_serializer = match self.output_row_in_key_indices.is_empty() { true => None, diff --git a/src/stream/benches/bench_state_table.rs b/src/stream/benches/bench_state_table.rs index 06a5be2fb3d3b..ceaabdec5b637 100644 --- a/src/stream/benches/bench_state_table.rs +++ b/src/stream/benches/bench_state_table.rs @@ -24,6 +24,7 @@ use risingwave_common::util::epoch::{test_epoch, EpochPair}; use risingwave_common::util::sort_util::OrderType; use risingwave_storage::memory::MemoryStateStore; use 
risingwave_stream::common::table::state_table::WatermarkCacheParameterizedStateTable; +use risingwave_stream::common::table::test_utils::gen_pbtable; use tokio::runtime::Runtime; type TestStateTable = @@ -72,12 +73,10 @@ async fn create_state_table( let pk_indices = (0..key_length).collect(); let store = MemoryStateStore::new(); - TestStateTable::::new_without_distribution_inconsistent_op( + TestStateTable::::from_table_catalog_inconsistent_op( + &gen_pbtable(TEST_TABLE_ID, column_descs, order_types, pk_indices, 0), store, - TEST_TABLE_ID, - column_descs, - order_types, - pk_indices, + None, ) .await } diff --git a/src/stream/spill_test/src/test_mem_table.rs b/src/stream/spill_test/src/test_mem_table.rs index 9f21ce5ae60be..75407feeaa286 100644 --- a/src/stream/spill_test/src/test_mem_table.rs +++ b/src/stream/spill_test/src/test_mem_table.rs @@ -19,7 +19,7 @@ use risingwave_common::util::epoch::{test_epoch, EpochPair}; use risingwave_common::util::sort_util::OrderType; use risingwave_hummock_test::test_utils::prepare_hummock_test_env; use risingwave_stream::common::table::state_table::StateTable; -use risingwave_stream::common::table::test_utils::gen_prost_table; +use risingwave_stream::common::table::test_utils::gen_pbtable; #[tokio::test] async fn test_mem_table_spill_in_streaming() { @@ -51,7 +51,7 @@ async fn test_mem_table_spill_in_streaming() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -181,7 +181,7 @@ async fn test_mem_table_spill_in_streaming_multiple_times() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, diff --git a/src/stream/src/common/log_store_impl/kv_log_store/mod.rs b/src/stream/src/common/log_store_impl/kv_log_store/mod.rs index f4e62e429effa..440c7188d2fa1 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/mod.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/mod.rs @@ -526,7 +526,11 @@ mod tests { let epoch3 = epoch2.next_epoch(); writer.flush_current_epoch(epoch3, true).await.unwrap(); - let sync_result = test_env.storage.seal_and_sync_epoch(epoch2).await.unwrap(); + let sync_result = test_env + .storage + .seal_and_sync_epoch(epoch2, HashSet::from_iter([table.id.into()])) + .await + .unwrap(); assert!(!sync_result.uncommitted_ssts.is_empty()); reader.init().await.unwrap(); diff --git a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs index 3114c22e63323..812f80f28990c 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs @@ -24,7 +24,7 @@ use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_pb::catalog::PbTable; use crate::common::log_store_impl::kv_log_store::KvLogStorePkInfo; -use crate::common::table::test_utils::gen_prost_table_with_dist_key; +use crate::common::table::test_utils::gen_pbtable_with_dist_key; pub(crate) const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; pub(crate) const TEST_DATA_SIZE: usize = 10; @@ -164,7 +164,7 @@ pub(crate) fn gen_test_log_store_table(pk_info: &'static KvLogStorePkInfo) -> Pb let order_types = pk_info.pk_orderings.to_vec(); let pk_index = (0..pk_info.pk_len()).collect(); let 
read_prefix_len_hint = 0; - gen_prost_table_with_dist_key( + gen_pbtable_with_dist_key( TEST_TABLE_ID, schema, order_types, diff --git a/src/stream/src/common/table/state_table.rs b/src/stream/src/common/table/state_table.rs index df85aa7f260ce..83040884fc4cf 100644 --- a/src/stream/src/common/table/state_table.rs +++ b/src/stream/src/common/table/state_table.rs @@ -523,196 +523,6 @@ where } } - /// Create a state table without distribution, used for unit tests. - pub async fn new_without_distribution( - store: S, - table_id: TableId, - columns: Vec, - order_types: Vec, - pk_indices: Vec, - ) -> Self { - Self::new_with_distribution( - store, - table_id, - columns, - order_types, - pk_indices, - TableDistribution::singleton(), - None, - ) - .await - } - - /// Create a state table without distribution, with given `value_indices`, used for unit tests. - pub async fn new_without_distribution_with_value_indices( - store: S, - table_id: TableId, - columns: Vec, - order_types: Vec, - pk_indices: Vec, - value_indices: Vec, - ) -> Self { - Self::new_with_distribution( - store, - table_id, - columns, - order_types, - pk_indices, - TableDistribution::singleton(), - Some(value_indices), - ) - .await - } - - /// Create a state table without distribution, used for unit tests. - pub async fn new_without_distribution_inconsistent_op( - store: S, - table_id: TableId, - columns: Vec, - order_types: Vec, - pk_indices: Vec, - ) -> Self { - Self::new_with_distribution_inner( - store, - table_id, - columns, - order_types, - pk_indices, - TableDistribution::singleton(), - None, - false, - ) - .await - } - - /// Create a state table with distribution specified with `distribution`. Should use - /// `Distribution::fallback()` for tests. - pub async fn new_with_distribution( - store: S, - table_id: TableId, - table_columns: Vec, - order_types: Vec, - pk_indices: Vec, - distribution: TableDistribution, - value_indices: Option>, - ) -> Self { - Self::new_with_distribution_inner( - store, - table_id, - table_columns, - order_types, - pk_indices, - distribution, - value_indices, - true, - ) - .await - } - - /// Create a state table with distribution and without sanity check, used for unit tests. 
- pub async fn new_with_distribution_inconsistent_op( - store: S, - table_id: TableId, - table_columns: Vec, - order_types: Vec, - pk_indices: Vec, - distribution: TableDistribution, - value_indices: Option>, - ) -> Self { - Self::new_with_distribution_inner( - store, - table_id, - table_columns, - order_types, - pk_indices, - distribution, - value_indices, - false, - ) - .await - } - - #[allow(clippy::too_many_arguments)] - async fn new_with_distribution_inner( - store: S, - table_id: TableId, - table_columns: Vec, - order_types: Vec, - pk_indices: Vec, - distribution: TableDistribution, - value_indices: Option>, - is_consistent_op: bool, - ) -> Self { - let make_row_serde = || { - SD::new( - Arc::from( - value_indices - .clone() - .unwrap_or_else(|| (0..table_columns.len()).collect_vec()) - .into_boxed_slice(), - ), - Arc::from(table_columns.clone().into_boxed_slice()), - ) - }; - let op_consistency_level = if is_consistent_op { - let row_serde = make_row_serde(); - consistent_old_value_op(row_serde, false) - } else { - OpConsistencyLevel::Inconsistent - }; - let local_state_store = store - .new_local(NewLocalOptions::new( - table_id, - op_consistency_level, - TableOption::default(), - distribution.vnodes().clone(), - )) - .await; - let row_serde = make_row_serde(); - let data_types: Vec = table_columns - .iter() - .map(|col| col.data_type.clone()) - .collect(); - let pk_data_types = pk_indices - .iter() - .map(|i| table_columns[*i].data_type.clone()) - .collect(); - let pk_serde = OrderedRowSerde::new(pk_data_types, order_types); - - // TODO: let's not restore watermark in unit tests for now, to avoid complexity. - let committed_watermark = None; - - let watermark_cache = if USE_WATERMARK_CACHE { - StateTableWatermarkCache::new(WATERMARK_CACHE_ENTRIES) - } else { - StateTableWatermarkCache::new(0) - }; - - Self { - table_id, - local_store: local_state_store, - store, - pk_serde, - row_serde, - pk_indices, - distribution, - prefix_hint_len: 0, - table_option: Default::default(), - value_indices, - pending_watermark: None, - committed_watermark, - watermark_cache, - data_types, - output_indices: vec![], - i2o_mapping: ColIndexMapping::new(vec![], 0), - op_consistency_level: if is_consistent_op { - StateTableOpConsistencyLevel::ConsistentOldValue - } else { - StateTableOpConsistencyLevel::Inconsistent - }, - } - } - pub fn get_data_types(&self) -> &[DataType] { &self.data_types } diff --git a/src/stream/src/common/table/test_state_table.rs b/src/stream/src/common/table/test_state_table.rs index dde0d8a581406..99036e968f046 100644 --- a/src/stream/src/common/table/test_state_table.rs +++ b/src/stream/src/common/table/test_state_table.rs @@ -32,7 +32,7 @@ use risingwave_storage::table::SINGLETON_VNODE; use crate::common::table::state_table::{ ReplicatedStateTable, StateTable, WatermarkCacheStateTable, }; -use crate::common::table::test_utils::{gen_prost_table, gen_prost_table_with_value_indices}; +use crate::common::table::test_utils::{gen_pbtable, gen_pbtable_with_value_indices}; #[tokio::test] async fn test_state_table_update_insert() { @@ -48,7 +48,7 @@ async fn test_state_table_update_insert() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -241,7 +241,7 @@ async fn test_state_table_iter_with_prefix() { ]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = 
gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -376,7 +376,7 @@ async fn test_state_table_iter_with_pk_range() { ]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -516,7 +516,7 @@ async fn test_mem_table_assertion() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -560,7 +560,7 @@ async fn test_state_table_iter_with_value_indices() { ]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs, order_types, @@ -733,7 +733,7 @@ async fn test_state_table_iter_with_shuffle_value_indices() { ]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs, order_types, @@ -981,7 +981,7 @@ async fn test_state_table_write_chunk() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -1113,7 +1113,7 @@ async fn test_state_table_write_chunk_visibility() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -1239,7 +1239,7 @@ async fn test_state_table_write_chunk_value_indices() { let order_types = vec![OrderType::ascending()]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs, order_types, @@ -1338,7 +1338,7 @@ async fn test_state_table_watermark_cache_ignore_null() { let order_types = vec![OrderType::ascending(), OrderType::descending()]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -1464,7 +1464,7 @@ async fn test_state_table_watermark_cache_write_chunk() { let order_types = vec![OrderType::ascending(), OrderType::descending()]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -1639,7 +1639,7 @@ async fn test_state_table_watermark_cache_refill() { let order_types = vec![OrderType::ascending(), OrderType::descending()]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -1735,7 +1735,7 @@ async fn test_state_table_iter_prefix_and_sub_range() { ]; let pk_index = vec![0_usize, 1_usize]; let read_prefix_len_hint = 0; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, @@ -1921,7 +1921,7 @@ async fn test_replicated_state_table_replication() { ]; let pk_index = vec![0_usize]; let read_prefix_len_hint = 1; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs, order_types, diff --git a/src/stream/src/common/table/test_storage_table.rs 
b/src/stream/src/common/table/test_storage_table.rs index 1eb552271dced..be09fa1e91833 100644 --- a/src/stream/src/common/table/test_storage_table.rs +++ b/src/stream/src/common/table/test_storage_table.rs @@ -28,7 +28,7 @@ use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_storage::table::TableIter; use crate::common::table::state_table::StateTable; -use crate::common::table::test_utils::{gen_prost_table, gen_prost_table_with_value_indices}; +use crate::common::table::test_utils::{gen_pbtable, gen_pbtable_with_value_indices}; /// There are three struct in relational layer, StateTable, MemTable and StorageTable. /// `StateTable` provides read/write interfaces to the upper layer streaming operator. @@ -56,7 +56,7 @@ async fn test_storage_table_value_indices() { let order_types = vec![OrderType::ascending(), OrderType::descending()]; let value_indices = vec![1, 3, 4]; let read_prefix_len_hint = 2; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs.clone(), order_types.clone(), @@ -76,7 +76,7 @@ async fn test_storage_table_value_indices() { column_descs.clone(), order_types.clone(), pk_indices, - value_indices.into_iter().map(|v| v as usize).collect_vec(), + value_indices, ); let mut epoch = EpochPair::new_test_epoch(test_epoch(1)); test_env @@ -191,7 +191,7 @@ async fn test_shuffled_column_id_for_storage_table_get_row() { let order_types = vec![OrderType::ascending(), OrderType::descending()]; let pk_indices = vec![0_usize, 1_usize]; let read_prefix_len_hint = 2; - let table = gen_prost_table( + let table = gen_pbtable( TEST_TABLE_ID, column_descs.clone(), order_types.clone(), @@ -299,13 +299,13 @@ async fn test_row_based_storage_table_point_get_in_batch_mode() { let order_types = vec![OrderType::ascending(), OrderType::descending()]; let value_indices: Vec = vec![0, 1, 2]; let read_prefix_len_hint = 0; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs.clone(), order_types.clone(), pk_indices.clone(), read_prefix_len_hint, - value_indices.iter().map(|v| *v as i32).collect_vec(), + value_indices.clone(), ); test_env.register_table(table.clone()).await; @@ -409,13 +409,13 @@ async fn test_batch_scan_with_value_indices() { let pk_indices = vec![0_usize, 2_usize]; let value_indices: Vec = vec![1, 3]; let read_prefix_len_hint = 0; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs.clone(), order_types.clone(), pk_indices.clone(), read_prefix_len_hint, - value_indices.iter().map(|v| *v as i32).collect_vec(), + value_indices.clone(), ); test_env.register_table(table.clone()).await; @@ -509,13 +509,13 @@ async fn test_batch_scan_chunk_with_value_indices() { let pk_indices = vec![0_usize, 2_usize]; let value_indices: Vec = vec![1, 3]; let read_prefix_len_hint = 0; - let table = gen_prost_table_with_value_indices( + let table = gen_pbtable_with_value_indices( TEST_TABLE_ID, column_descs.clone(), order_types.clone(), pk_indices.clone(), read_prefix_len_hint, - value_indices.iter().map(|v| *v as i32).collect_vec(), + value_indices.clone(), ); test_env.register_table(table.clone()).await; diff --git a/src/stream/src/common/table/test_utils.rs b/src/stream/src/common/table/test_utils.rs index c037b6de8b922..1b379f044cc65 100644 --- a/src/stream/src/common/table/test_utils.rs +++ b/src/stream/src/common/table/test_utils.rs @@ -20,73 +20,73 @@ use 
risingwave_pb::catalog::PbTable; use risingwave_pb::common::PbColumnOrder; use risingwave_pb::plan_common::ColumnCatalog; -pub fn gen_prost_table( +pub fn gen_pbtable( table_id: TableId, column_descs: Vec, order_types: Vec, - pk_index: Vec, - read_prefix_len_hint: u32, + pk_indices: Vec, + read_prefix_len_hint: usize, ) -> PbTable { - let col_len = column_descs.len() as i32; - gen_prost_table_with_value_indices( + let value_indices = (0..column_descs.len()).collect_vec(); + gen_pbtable_with_value_indices( table_id, column_descs, order_types, - pk_index, + pk_indices, read_prefix_len_hint, - (0..col_len).collect_vec(), + value_indices, ) } -pub fn gen_prost_table_with_dist_key( +pub fn gen_pbtable_with_dist_key( table_id: TableId, column_descs: Vec, order_types: Vec, - pk_index: Vec, - read_prefix_len_hint: u32, + pk_indices: Vec, + read_prefix_len_hint: usize, distribution_key: Vec, ) -> PbTable { - let col_len = column_descs.len() as i32; - gen_prost_table_inner( + let value_indices = (0..column_descs.len()).collect_vec(); + gen_pbtable_inner( table_id, column_descs, order_types, - pk_index, + pk_indices, read_prefix_len_hint, - (0..col_len).collect_vec(), + value_indices, distribution_key, ) } -pub fn gen_prost_table_with_value_indices( +pub fn gen_pbtable_with_value_indices( table_id: TableId, column_descs: Vec, order_types: Vec, - pk_index: Vec, - read_prefix_len_hint: u32, - value_indices: Vec, + pk_indices: Vec, + read_prefix_len_hint: usize, + value_indices: Vec, ) -> PbTable { - gen_prost_table_inner( + gen_pbtable_inner( table_id, column_descs, order_types, - pk_index, + pk_indices, read_prefix_len_hint, value_indices, Vec::default(), ) } -pub fn gen_prost_table_inner( +pub fn gen_pbtable_inner( table_id: TableId, column_descs: Vec, order_types: Vec, - pk_index: Vec, - read_prefix_len_hint: u32, - value_indices: Vec, + pk_indices: Vec, + read_prefix_len_hint: usize, + value_indices: Vec, distribution_key: Vec, ) -> PbTable { - let prost_pk = pk_index + let prost_pk = pk_indices .iter() .zip_eq_fast(order_types.iter()) .map(|(idx, order)| PbColumnOrder { @@ -102,13 +102,14 @@ pub fn gen_prost_table_inner( }) .collect(); + let value_indices = value_indices.into_iter().map(|i| i as i32).collect_vec(); let distribution_key = distribution_key.into_iter().map(|i| i as i32).collect_vec(); PbTable { id: table_id.table_id(), columns: prost_columns, pk: prost_pk, - read_prefix_len_hint, + read_prefix_len_hint: read_prefix_len_hint as u32, value_indices, distribution_key, ..Default::default() diff --git a/src/stream/src/executor/actor.rs b/src/stream/src/executor/actor.rs index 4e56e3b0c2262..b1b167567ed33 100644 --- a/src/stream/src/executor/actor.rs +++ b/src/stream/src/executor/actor.rs @@ -56,7 +56,7 @@ pub struct ActorContext { /// This is the number of dispatchers when the actor is created. It will not be updated during runtime when new downstreams are added. pub initial_dispatch_num: usize, // mv_table_id to subscription id - pub related_subscriptions: HashMap>, + pub related_subscriptions: Arc>>, // Meta client. currently used for auto schema change. 
`None` for test only pub meta_client: Option, @@ -78,7 +78,7 @@ impl ActorContext { streaming_metrics: Arc::new(StreamingMetrics::unused()), // Set 1 for test to enable sanity check on table initial_dispatch_num: 1, - related_subscriptions: HashMap::new(), + related_subscriptions: HashMap::new().into(), meta_client: None, streaming_config: Arc::new(StreamingConfig::default()), }) @@ -89,7 +89,7 @@ impl ActorContext { total_mem_val: Arc>, streaming_metrics: Arc, initial_dispatch_num: usize, - related_subscriptions: HashMap>, + related_subscriptions: Arc>>, meta_client: Option, streaming_config: Arc, ) -> ActorContextRef { diff --git a/src/stream/src/executor/aggregation/distinct.rs b/src/stream/src/executor/aggregation/distinct.rs index ee54f524e592d..3ec5aae2e97bb 100644 --- a/src/stream/src/executor/aggregation/distinct.rs +++ b/src/stream/src/executor/aggregation/distinct.rs @@ -296,6 +296,7 @@ mod tests { use risingwave_storage::memory::MemoryStateStore; use super::*; + use crate::common::table::test_utils::gen_pbtable_with_value_indices; async fn infer_dedup_tables( agg_calls: &[AggCall], @@ -338,16 +339,17 @@ mod tests { add_column_desc(DataType::Int64); } - let n_columns = columns.len(); - let table = StateTable::new_without_distribution_with_value_indices( - store.clone(), + let pk_indices = (0..(group_key_types.len() + 1)).collect::>(); + let value_indices = ((group_key_types.len() + 1)..columns.len()).collect::>(); + let table_pb = gen_pbtable_with_value_indices( TableId::new(2333 + distinct_col as u32), columns, order_types, - (0..(group_key_types.len() + 1)).collect(), - ((group_key_types.len() + 1)..n_columns).collect(), - ) - .await; + pk_indices, + 0, + value_indices, + ); + let table = StateTable::from_table_catalog(&table_pb, store.clone(), None).await; dedup_tables.insert(distinct_col, table); } diff --git a/src/stream/src/executor/aggregation/minput.rs b/src/stream/src/executor/aggregation/minput.rs index 66ba0abc83fad..393be1878412b 100644 --- a/src/stream/src/executor/aggregation/minput.rs +++ b/src/stream/src/executor/aggregation/minput.rs @@ -312,6 +312,7 @@ mod tests { use super::MaterializedInputState; use crate::common::table::state_table::StateTable; + use crate::common::table::test_utils::gen_pbtable; use crate::common::StateTableColumnMapping; use crate::executor::aggregation::GroupKey; use crate::executor::{PkIndices, StreamExecutorResult}; @@ -341,12 +342,10 @@ mod tests { .collect_vec(); let mapping = StateTableColumnMapping::new(upstream_columns, None); let pk_len = order_types.len(); - let table = StateTable::new_without_distribution( + let table = StateTable::from_table_catalog( + &gen_pbtable(table_id, columns, order_types, (0..pk_len).collect(), 0), MemoryStateStore::new(), - table_id, - columns, - order_types, - (0..pk_len).collect(), + None, ) .await; (table, mapping) diff --git a/src/stream/src/executor/approx_percentile/global_state.rs b/src/stream/src/executor/approx_percentile/global_state.rs index 790d89699e781..58011a8450c88 100644 --- a/src/stream/src/executor/approx_percentile/global_state.rs +++ b/src/stream/src/executor/approx_percentile/global_state.rs @@ -299,7 +299,7 @@ impl BucketTableCache { } pub fn get_output(&self, row_count: i64, quantile: f64, base: f64) -> Datum { - let quantile_count = (row_count as f64 * quantile).floor() as i64; + let quantile_count = ((row_count - 1) as f64 * quantile).floor() as i64; let mut acc_count = 0; for (bucket_id, count) in self.neg_buckets.iter().rev() { acc_count += count; diff --git 
a/src/stream/src/executor/asof_join.rs b/src/stream/src/executor/asof_join.rs new file mode 100644 index 0000000000000..b2cd81a04d3b4 --- /dev/null +++ b/src/stream/src/executor/asof_join.rs @@ -0,0 +1,1386 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +use std::collections::{BTreeMap, HashSet}; +use std::ops::Bound; +use std::time::Duration; + +use either::Either; +use itertools::Itertools; +use multimap::MultiMap; +use risingwave_common::array::Op; +use risingwave_common::hash::{HashKey, NullBitmap}; +use risingwave_common::util::epoch::EpochPair; +use risingwave_common::util::iter_util::ZipEqDebug; +use tokio::time::Instant; + +use self::builder::JoinChunkBuilder; +use super::barrier_align::*; +use super::join::hash_join::*; +use super::join::*; +use super::watermark::*; +use crate::executor::join::builder::JoinStreamChunkBuilder; +use crate::executor::prelude::*; + +/// Evict the cache every n rows. +const EVICT_EVERY_N_ROWS: u32 = 16; + +fn is_subset(vec1: Vec, vec2: Vec) -> bool { + HashSet::::from_iter(vec1).is_subset(&vec2.into_iter().collect()) +} + +pub struct JoinParams { + /// Indices of the join keys + pub join_key_indices: Vec, + /// Indices of the input pk after dedup + pub deduped_pk_indices: Vec, +} + +impl JoinParams { + pub fn new(join_key_indices: Vec, deduped_pk_indices: Vec) -> Self { + Self { + join_key_indices, + deduped_pk_indices, + } + } +} + +struct JoinSide { + /// Store all data from a one side stream + ht: JoinHashMap, + /// Indices of the join key columns + join_key_indices: Vec, + /// The data type of all columns without degree. + all_data_types: Vec, + /// The start position for the side in output new columns + start_pos: usize, + /// The mapping from input indices of a side to output columes. + i2o_mapping: Vec<(usize, usize)>, + i2o_mapping_indexed: MultiMap, + /// Whether degree table is needed for this side. + need_degree_table: bool, +} + +impl std::fmt::Debug for JoinSide { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("JoinSide") + .field("join_key_indices", &self.join_key_indices) + .field("col_types", &self.all_data_types) + .field("start_pos", &self.start_pos) + .field("i2o_mapping", &self.i2o_mapping) + .field("need_degree_table", &self.need_degree_table) + .finish() + } +} + +impl JoinSide { + // WARNING: Please do not call this until we implement it. + fn is_dirty(&self) -> bool { + unimplemented!() + } + + #[expect(dead_code)] + fn clear_cache(&mut self) { + assert!( + !self.is_dirty(), + "cannot clear cache while states of hash join are dirty" + ); + + // TODO: not working with rearranged chain + // self.ht.clear(); + } + + pub fn init(&mut self, epoch: EpochPair) { + self.ht.init(epoch); + } +} + +/// `AsOfJoinExecutor` takes two input streams and runs equal hash join on them. +/// The output columns are the concatenation of left and right columns. 
+pub struct AsOfJoinExecutor { + ctx: ActorContextRef, + info: ExecutorInfo, + + /// Left input executor + input_l: Option, + /// Right input executor + input_r: Option, + /// The data types of the formed new columns + actual_output_data_types: Vec, + /// The parameters of the left join executor + side_l: JoinSide, + /// The parameters of the right join executor + side_r: JoinSide, + + metrics: Arc, + /// The maximum size of the chunk produced by executor at a time + chunk_size: usize, + /// Count the messages received, clear to 0 when counted to `EVICT_EVERY_N_MESSAGES` + cnt_rows_received: u32, + + /// watermark column index -> `BufferedWatermarks` + watermark_buffers: BTreeMap>, + + high_join_amplification_threshold: usize, + /// `AsOf` join description + asof_desc: AsOfDesc, +} + +impl std::fmt::Debug + for AsOfJoinExecutor +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AsOfJoinExecutor") + .field("join_type", &T) + .field("input_left", &self.input_l.as_ref().unwrap().identity()) + .field("input_right", &self.input_r.as_ref().unwrap().identity()) + .field("side_l", &self.side_l) + .field("side_r", &self.side_r) + .field("pk_indices", &self.info.pk_indices) + .field("schema", &self.info.schema) + .field("actual_output_data_types", &self.actual_output_data_types) + .finish() + } +} + +impl Execute + for AsOfJoinExecutor +{ + fn execute(self: Box) -> BoxedMessageStream { + self.into_stream().boxed() + } +} + +struct EqJoinArgs<'a, K: HashKey, S: StateStore> { + ctx: &'a ActorContextRef, + side_l: &'a mut JoinSide, + side_r: &'a mut JoinSide, + asof_desc: &'a AsOfDesc, + actual_output_data_types: &'a [DataType], + // inequality_watermarks: &'a Watermark, + chunk: StreamChunk, + chunk_size: usize, + cnt_rows_received: &'a mut u32, + high_join_amplification_threshold: usize, +} + +impl AsOfJoinExecutor { + #[allow(clippy::too_many_arguments)] + pub fn new( + ctx: ActorContextRef, + info: ExecutorInfo, + input_l: Executor, + input_r: Executor, + params_l: JoinParams, + params_r: JoinParams, + null_safe: Vec, + output_indices: Vec, + state_table_l: StateTable, + degree_state_table_l: StateTable, + state_table_r: StateTable, + degree_state_table_r: StateTable, + watermark_epoch: AtomicU64Ref, + metrics: Arc, + chunk_size: usize, + high_join_amplification_threshold: usize, + asof_desc: AsOfDesc, + ) -> Self { + let side_l_column_n = input_l.schema().len(); + + let schema_fields = [ + input_l.schema().fields.clone(), + input_r.schema().fields.clone(), + ] + .concat(); + + let original_output_data_types = schema_fields + .iter() + .map(|field| field.data_type()) + .collect_vec(); + let actual_output_data_types = output_indices + .iter() + .map(|&idx| original_output_data_types[idx].clone()) + .collect_vec(); + + // Data types of of hash join state. 
+ let state_all_data_types_l = input_l.schema().data_types(); + let state_all_data_types_r = input_r.schema().data_types(); + + let state_pk_indices_l = input_l.pk_indices().to_vec(); + let state_pk_indices_r = input_r.pk_indices().to_vec(); + + let state_order_key_indices_l = state_table_l.pk_indices(); + let state_order_key_indices_r = state_table_r.pk_indices(); + + let state_join_key_indices_l = params_l.join_key_indices; + let state_join_key_indices_r = params_r.join_key_indices; + + let degree_join_key_indices_l = (0..state_join_key_indices_l.len()).collect_vec(); + let degree_join_key_indices_r = (0..state_join_key_indices_r.len()).collect_vec(); + + let degree_pk_indices_l = (state_join_key_indices_l.len() + ..state_join_key_indices_l.len() + params_l.deduped_pk_indices.len()) + .collect_vec(); + let degree_pk_indices_r = (state_join_key_indices_r.len() + ..state_join_key_indices_r.len() + params_r.deduped_pk_indices.len()) + .collect_vec(); + + // If pk is contained in join key. + let pk_contained_in_jk_l = + is_subset(state_pk_indices_l.clone(), state_join_key_indices_l.clone()); + let pk_contained_in_jk_r = + is_subset(state_pk_indices_r.clone(), state_join_key_indices_r.clone()); + + let join_key_data_types_l = state_join_key_indices_l + .iter() + .map(|idx| state_all_data_types_l[*idx].clone()) + .collect_vec(); + + let join_key_data_types_r = state_join_key_indices_r + .iter() + .map(|idx| state_all_data_types_r[*idx].clone()) + .collect_vec(); + + assert_eq!(join_key_data_types_l, join_key_data_types_r); + + let degree_all_data_types_l = state_order_key_indices_l + .iter() + .map(|idx| state_all_data_types_l[*idx].clone()) + .collect_vec(); + let degree_all_data_types_r = state_order_key_indices_r + .iter() + .map(|idx| state_all_data_types_r[*idx].clone()) + .collect_vec(); + + let null_matched = K::Bitmap::from_bool_vec(null_safe); + + let need_degree_table_l = false; + let need_degree_table_r = false; + + let (left_to_output, right_to_output) = { + let (left_len, right_len) = if is_left_semi_or_anti(T) { + (state_all_data_types_l.len(), 0usize) + } else if is_right_semi_or_anti(T) { + (0usize, state_all_data_types_r.len()) + } else { + (state_all_data_types_l.len(), state_all_data_types_r.len()) + }; + JoinStreamChunkBuilder::get_i2o_mapping(&output_indices, left_len, right_len) + }; + + let l2o_indexed = MultiMap::from_iter(left_to_output.iter().copied()); + let r2o_indexed = MultiMap::from_iter(right_to_output.iter().copied()); + + // handle inequality watermarks + // https://github.com/risingwavelabs/risingwave/issues/18503 + // let inequality_watermarks = None; + let watermark_buffers = BTreeMap::new(); + + let inequal_key_idx_l = Some(asof_desc.left_idx); + let inequal_key_idx_r = Some(asof_desc.right_idx); + + Self { + ctx: ctx.clone(), + info, + input_l: Some(input_l), + input_r: Some(input_r), + actual_output_data_types, + side_l: JoinSide { + ht: JoinHashMap::new( + watermark_epoch.clone(), + join_key_data_types_l, + state_join_key_indices_l.clone(), + state_all_data_types_l.clone(), + state_table_l, + params_l.deduped_pk_indices, + degree_join_key_indices_l, + degree_all_data_types_l, + degree_state_table_l, + degree_pk_indices_l, + null_matched.clone(), + need_degree_table_l, + pk_contained_in_jk_l, + inequal_key_idx_l, + metrics.clone(), + ctx.id, + ctx.fragment_id, + "left", + ), + join_key_indices: state_join_key_indices_l, + all_data_types: state_all_data_types_l, + i2o_mapping: left_to_output, + i2o_mapping_indexed: l2o_indexed, + start_pos: 0, + 
need_degree_table: need_degree_table_l, + }, + side_r: JoinSide { + ht: JoinHashMap::new( + watermark_epoch, + join_key_data_types_r, + state_join_key_indices_r.clone(), + state_all_data_types_r.clone(), + state_table_r, + params_r.deduped_pk_indices, + degree_join_key_indices_r, + degree_all_data_types_r, + degree_state_table_r, + degree_pk_indices_r, + null_matched, + need_degree_table_r, + pk_contained_in_jk_r, + inequal_key_idx_r, + metrics.clone(), + ctx.id, + ctx.fragment_id, + "right", + ), + join_key_indices: state_join_key_indices_r, + all_data_types: state_all_data_types_r, + start_pos: side_l_column_n, + i2o_mapping: right_to_output, + i2o_mapping_indexed: r2o_indexed, + need_degree_table: need_degree_table_r, + }, + metrics, + chunk_size, + cnt_rows_received: 0, + watermark_buffers, + high_join_amplification_threshold, + asof_desc, + } + } + + #[try_stream(ok = Message, error = StreamExecutorError)] + async fn into_stream(mut self) { + let input_l = self.input_l.take().unwrap(); + let input_r = self.input_r.take().unwrap(); + let aligned_stream = barrier_align( + input_l.execute(), + input_r.execute(), + self.ctx.id, + self.ctx.fragment_id, + self.metrics.clone(), + "Join", + ); + pin_mut!(aligned_stream); + + let barrier = expect_first_barrier_from_aligned_stream(&mut aligned_stream).await?; + self.side_l.init(barrier.epoch); + self.side_r.init(barrier.epoch); + + // The first barrier message should be propagated. + yield Message::Barrier(barrier); + let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.fragment_id.to_string(); + + // initialized some metrics + let join_actor_input_waiting_duration_ns = self + .metrics + .join_actor_input_waiting_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str]); + let left_join_match_duration_ns = self + .metrics + .join_match_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "left"]); + let right_join_match_duration_ns = self + .metrics + .join_match_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "right"]); + + let barrier_join_match_duration_ns = self + .metrics + .join_match_duration_ns + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "barrier"]); + + let left_join_cached_entry_count = self + .metrics + .join_cached_entry_count + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "left"]); + + let right_join_cached_entry_count = self + .metrics + .join_cached_entry_count + .with_guarded_label_values(&[&actor_id_str, &fragment_id_str, "right"]); + + let mut start_time = Instant::now(); + + while let Some(msg) = aligned_stream + .next() + .instrument_await("hash_join_barrier_align") + .await + { + join_actor_input_waiting_duration_ns.inc_by(start_time.elapsed().as_nanos() as u64); + match msg? { + AlignedMessage::WatermarkLeft(watermark) => { + for watermark_to_emit in self.handle_watermark(SideType::Left, watermark)? { + yield Message::Watermark(watermark_to_emit); + } + } + AlignedMessage::WatermarkRight(watermark) => { + for watermark_to_emit in self.handle_watermark(SideType::Right, watermark)? 
{ + yield Message::Watermark(watermark_to_emit); + } + } + AlignedMessage::Left(chunk) => { + let mut left_time = Duration::from_nanos(0); + let mut left_start_time = Instant::now(); + #[for_await] + for chunk in Self::eq_join_left(EqJoinArgs { + ctx: &self.ctx, + side_l: &mut self.side_l, + side_r: &mut self.side_r, + asof_desc: &self.asof_desc, + actual_output_data_types: &self.actual_output_data_types, + // inequality_watermarks: &self.inequality_watermarks, + chunk, + chunk_size: self.chunk_size, + cnt_rows_received: &mut self.cnt_rows_received, + high_join_amplification_threshold: self.high_join_amplification_threshold, + }) { + left_time += left_start_time.elapsed(); + yield Message::Chunk(chunk?); + left_start_time = Instant::now(); + } + left_time += left_start_time.elapsed(); + left_join_match_duration_ns.inc_by(left_time.as_nanos() as u64); + self.try_flush_data().await?; + } + AlignedMessage::Right(chunk) => { + let mut right_time = Duration::from_nanos(0); + let mut right_start_time = Instant::now(); + #[for_await] + for chunk in Self::eq_join_right(EqJoinArgs { + ctx: &self.ctx, + side_l: &mut self.side_l, + side_r: &mut self.side_r, + asof_desc: &self.asof_desc, + actual_output_data_types: &self.actual_output_data_types, + // inequality_watermarks: &self.inequality_watermarks, + chunk, + chunk_size: self.chunk_size, + cnt_rows_received: &mut self.cnt_rows_received, + high_join_amplification_threshold: self.high_join_amplification_threshold, + }) { + right_time += right_start_time.elapsed(); + yield Message::Chunk(chunk?); + right_start_time = Instant::now(); + } + right_time += right_start_time.elapsed(); + right_join_match_duration_ns.inc_by(right_time.as_nanos() as u64); + self.try_flush_data().await?; + } + AlignedMessage::Barrier(barrier) => { + let barrier_start_time = Instant::now(); + self.flush_data(barrier.epoch).await?; + + // Update the vnode bitmap for state tables of both sides if asked. + if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(self.ctx.id) { + if self.side_l.ht.update_vnode_bitmap(vnode_bitmap.clone()) { + self.watermark_buffers + .values_mut() + .for_each(|buffers| buffers.clear()); + // self.inequality_watermarks.fill(None); + } + self.side_r.ht.update_vnode_bitmap(vnode_bitmap); + } + + // Report metrics of cached join rows/entries + for (join_cached_entry_count, ht) in [ + (&left_join_cached_entry_count, &self.side_l.ht), + (&right_join_cached_entry_count, &self.side_r.ht), + ] { + join_cached_entry_count.set(ht.entry_count() as i64); + } + + barrier_join_match_duration_ns + .inc_by(barrier_start_time.elapsed().as_nanos() as u64); + yield Message::Barrier(barrier); + } + } + start_time = Instant::now(); + } + } + + async fn flush_data(&mut self, epoch: EpochPair) -> StreamExecutorResult<()> { + // All changes to the state has been buffered in the mem-table of the state table. Just + // `commit` them here. + self.side_l.ht.flush(epoch).await?; + self.side_r.ht.flush(epoch).await?; + Ok(()) + } + + async fn try_flush_data(&mut self) -> StreamExecutorResult<()> { + // All changes to the state has been buffered in the mem-table of the state table. Just + // `commit` them here. + self.side_l.ht.try_flush().await?; + self.side_r.ht.try_flush().await?; + Ok(()) + } + + // We need to manually evict the cache. 
+ fn evict_cache( + side_update: &mut JoinSide, + side_match: &mut JoinSide, + cnt_rows_received: &mut u32, + ) { + *cnt_rows_received += 1; + if *cnt_rows_received == EVICT_EVERY_N_ROWS { + side_update.ht.evict(); + side_match.ht.evict(); + *cnt_rows_received = 0; + } + } + + fn handle_watermark( + &mut self, + side: SideTypePrimitive, + watermark: Watermark, + ) -> StreamExecutorResult> { + let (side_update, side_match) = if side == SideType::Left { + (&mut self.side_l, &mut self.side_r) + } else { + (&mut self.side_r, &mut self.side_l) + }; + + // State cleaning + if side_update.join_key_indices[0] == watermark.col_idx { + side_match.ht.update_watermark(watermark.val.clone()); + } + + // Select watermarks to yield. + let wm_in_jk = side_update + .join_key_indices + .iter() + .positions(|idx| *idx == watermark.col_idx); + let mut watermarks_to_emit = vec![]; + for idx in wm_in_jk { + let buffers = self + .watermark_buffers + .entry(idx) + .or_insert_with(|| BufferedWatermarks::with_ids([SideType::Left, SideType::Right])); + if let Some(selected_watermark) = buffers.handle_watermark(side, watermark.clone()) { + let empty_indices = vec![]; + let output_indices = side_update + .i2o_mapping_indexed + .get_vec(&side_update.join_key_indices[idx]) + .unwrap_or(&empty_indices) + .iter() + .chain( + side_match + .i2o_mapping_indexed + .get_vec(&side_match.join_key_indices[idx]) + .unwrap_or(&empty_indices), + ); + for output_idx in output_indices { + watermarks_to_emit.push(selected_watermark.clone().with_idx(*output_idx)); + } + }; + } + Ok(watermarks_to_emit) + } + + /// the data the hash table and match the coming + /// data chunk with the executor state + async fn hash_eq_match( + key: &K, + ht: &mut JoinHashMap, + ) -> StreamExecutorResult> { + if !key.null_bitmap().is_subset(ht.null_matched()) { + Ok(None) + } else { + ht.take_state(key).await.map(Some) + } + } + + #[try_stream(ok = StreamChunk, error = StreamExecutorError)] + async fn eq_join_left(args: EqJoinArgs<'_, K, S>) { + let EqJoinArgs { + ctx: _, + side_l, + side_r, + asof_desc, + actual_output_data_types, + // inequality_watermarks, + chunk, + chunk_size, + cnt_rows_received, + high_join_amplification_threshold: _, + } = args; + + let (side_update, side_match) = (side_l, side_r); + + let mut join_chunk_builder = + JoinChunkBuilder::::new(JoinStreamChunkBuilder::new( + chunk_size, + actual_output_data_types.to_vec(), + side_update.i2o_mapping.clone(), + side_match.i2o_mapping.clone(), + )); + + let keys = K::build_many(&side_update.join_key_indices, chunk.data_chunk()); + for (r, key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { + let Some((op, row)) = r else { + continue; + }; + Self::evict_cache(side_update, side_match, cnt_rows_received); + + let matched_rows = if !side_update.ht.check_inequal_key_null(&row) { + Self::hash_eq_match(key, &mut side_match.ht).await? 
+ } else { + None + }; + let inequal_key = side_update.ht.serialize_inequal_key_from_row(row); + + if let Some(matched_rows) = matched_rows { + let matched_row_by_inequality = match asof_desc.inequality_type { + AsOfInequalityType::Lt => matched_rows.lower_bound_by_inequality( + Bound::Excluded(&inequal_key), + &side_match.all_data_types, + ), + AsOfInequalityType::Le => matched_rows.lower_bound_by_inequality( + Bound::Included(&inequal_key), + &side_match.all_data_types, + ), + AsOfInequalityType::Gt => matched_rows.upper_bound_by_inequality( + Bound::Excluded(&inequal_key), + &side_match.all_data_types, + ), + AsOfInequalityType::Ge => matched_rows.upper_bound_by_inequality( + Bound::Included(&inequal_key), + &side_match.all_data_types, + ), + }; + match op { + Op::Insert | Op::UpdateInsert => { + if let Some(matched_row_by_inequality) = matched_row_by_inequality { + let matched_row = matched_row_by_inequality?; + + if let Some(chunk) = + join_chunk_builder.with_match_on_insert(&row, &matched_row) + { + yield chunk; + } + } else if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Insert, row) + { + yield chunk; + } + side_update.ht.insert_row(key, row).await?; + } + Op::Delete | Op::UpdateDelete => { + if let Some(matched_row_by_inequality) = matched_row_by_inequality { + let matched_row = matched_row_by_inequality?; + + if let Some(chunk) = + join_chunk_builder.with_match_on_delete(&row, &matched_row) + { + yield chunk; + } + } else if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Delete, row) + { + yield chunk; + } + side_update.ht.delete_row(key, row)?; + } + } + // Insert back the state taken from ht. + side_match.ht.update_state(key, matched_rows); + } else { + // Row which violates null-safe bitmap will never be matched so we need not + // store. + match op { + Op::Insert | Op::UpdateInsert => { + if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Insert, row) + { + yield chunk; + } + } + Op::Delete | Op::UpdateDelete => { + if let Some(chunk) = + join_chunk_builder.forward_if_not_matched(Op::Delete, row) + { + yield chunk; + } + } + } + } + } + if let Some(chunk) = join_chunk_builder.take() { + yield chunk; + } + } + + #[try_stream(ok = StreamChunk, error = StreamExecutorError)] + async fn eq_join_right(args: EqJoinArgs<'_, K, S>) { + let EqJoinArgs { + ctx, + side_l, + side_r, + asof_desc, + actual_output_data_types, + // inequality_watermarks, + chunk, + chunk_size, + cnt_rows_received, + high_join_amplification_threshold, + } = args; + + let (side_update, side_match) = (side_r, side_l); + + let mut join_chunk_builder = JoinStreamChunkBuilder::new( + chunk_size, + actual_output_data_types.to_vec(), + side_update.i2o_mapping.clone(), + side_match.i2o_mapping.clone(), + ); + + let join_matched_rows_metrics = ctx + .streaming_metrics + .join_matched_join_keys + .with_guarded_label_values(&[ + &ctx.id.to_string(), + &ctx.fragment_id.to_string(), + &side_update.ht.table_id().to_string(), + ]); + + let keys = K::build_many(&side_update.join_key_indices, chunk.data_chunk()); + for (r, key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { + let Some((op, row)) = r else { + continue; + }; + let mut join_matched_rows_cnt = 0; + + Self::evict_cache(side_update, side_match, cnt_rows_received); + + let matched_rows = if !side_update.ht.check_inequal_key_null(&row) { + Self::hash_eq_match(key, &mut side_match.ht).await? 
+ } else { + None + }; + let inequal_key = side_update.ht.serialize_inequal_key_from_row(row); + + if let Some(matched_rows) = matched_rows { + let update_rows = Self::hash_eq_match(key, &mut side_update.ht).await?.expect("None is not expected because we have checked null in key when getting matched_rows"); + let right_inequality_index = update_rows.inequality_index(); + let (row_to_delete_r, row_to_insert_r) = + if let Some(pks) = right_inequality_index.get(&inequal_key) { + assert!(!pks.is_empty()); + let row_pk = side_match.ht.serialize_pk_from_row(row); + match op { + Op::Insert | Op::UpdateInsert => { + // If there are multiple rows match the inequality key in the right table, we use one with smallest pk. + let smallest_pk = pks.first_key_sorted().unwrap(); + if smallest_pk > &row_pk { + // smallest_pk is in the cache index, so it must exist in the cache. + if let Some(to_delete_row) = update_rows + .get_by_indexed_pk(smallest_pk, &side_update.all_data_types) + { + ( + Some(Either::Left(to_delete_row?.row)), + Some(Either::Right(row)), + ) + } else { + // Something wrong happened. Ignore this row in non strict consistency mode. + (None, None) + } + } else { + // No affected row in the right table. + (None, None) + } + } + Op::Delete | Op::UpdateDelete => { + let smallest_pk = pks.first_key_sorted().unwrap(); + if smallest_pk == &row_pk { + if let Some(second_smallest_pk) = pks.second_key_sorted() { + if let Some(to_insert_row) = update_rows.get_by_indexed_pk( + second_smallest_pk, + &side_update.all_data_types, + ) { + ( + Some(Either::Right(row)), + Some(Either::Left(to_insert_row?.row)), + ) + } else { + // Something wrong happened. Ignore this row in non strict consistency mode. + (None, None) + } + } else { + (Some(Either::Right(row)), None) + } + } else { + // No affected row in the right table. + (None, None) + } + } + } + } else { + match op { + // Decide the row_to_delete later + Op::Insert | Op::UpdateInsert => (None, Some(Either::Right(row))), + // Decide the row_to_insert later + Op::Delete | Op::UpdateDelete => (Some(Either::Right(row)), None), + } + }; + + // 4 cases for row_to_delete_r and row_to_insert_r: + // 1. Some(_), Some(_): delete row_to_delete_r and insert row_to_insert_r + // 2. None, Some(_) : row_to_delete to be decided by the nearest inequality key + // 3. Some(_), None : row_to_insert to be decided by the nearest inequality key + // 4. None, None : do nothing + if row_to_delete_r.is_none() && row_to_insert_r.is_none() { + // no row to delete or insert. + } else { + let prev_inequality_key = + right_inequality_index.upper_bound_key(Bound::Excluded(&inequal_key)); + let next_inequality_key = + right_inequality_index.lower_bound_key(Bound::Excluded(&inequal_key)); + let affected_row_r = match asof_desc.inequality_type { + AsOfInequalityType::Lt | AsOfInequalityType::Le => next_inequality_key + .and_then(|k| { + update_rows.get_first_by_inequality(k, &side_update.all_data_types) + }), + AsOfInequalityType::Gt | AsOfInequalityType::Ge => prev_inequality_key + .and_then(|k| { + update_rows.get_first_by_inequality(k, &side_update.all_data_types) + }), + } + .transpose()? 
+ .map(|r| Either::Left(r.row)); + + let (row_to_delete_r, row_to_insert_r) = + match (&row_to_delete_r, &row_to_insert_r) { + (Some(_), Some(_)) => (row_to_delete_r, row_to_insert_r), + (None, Some(_)) => (affected_row_r, row_to_insert_r), + (Some(_), None) => (row_to_delete_r, affected_row_r), + (None, None) => unreachable!(), + }; + let range = match asof_desc.inequality_type { + AsOfInequalityType::Lt => ( + prev_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Included), + Bound::Excluded(&inequal_key), + ), + AsOfInequalityType::Le => ( + prev_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Excluded), + Bound::Included(&inequal_key), + ), + AsOfInequalityType::Gt => ( + Bound::Excluded(&inequal_key), + next_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Included), + ), + AsOfInequalityType::Ge => ( + Bound::Included(&inequal_key), + next_inequality_key.map_or_else(|| Bound::Unbounded, Bound::Excluded), + ), + }; + + let rows_l = + matched_rows.range_by_inequality(range, &side_match.all_data_types); + for row_l in rows_l { + join_matched_rows_cnt += 1; + let row_l = row_l?.row; + if let Some(row_to_delete_r) = &row_to_delete_r { + if let Some(chunk) = + join_chunk_builder.append_row(Op::Delete, row_to_delete_r, &row_l) + { + yield chunk; + } + } else if is_as_of_left_outer(T) { + if let Some(chunk) = + join_chunk_builder.append_row_matched(Op::Delete, &row_l) + { + yield chunk; + } + } + if let Some(row_to_insert_r) = &row_to_insert_r { + if let Some(chunk) = + join_chunk_builder.append_row(Op::Insert, row_to_insert_r, &row_l) + { + yield chunk; + } + } else if is_as_of_left_outer(T) { + if let Some(chunk) = + join_chunk_builder.append_row_matched(Op::Insert, &row_l) + { + yield chunk; + } + } + } + } + // Insert back the state taken from ht. + side_match.ht.update_state(key, matched_rows); + side_update.ht.update_state(key, update_rows); + + match op { + Op::Insert | Op::UpdateInsert => { + side_update.ht.insert_row(key, row).await?; + } + Op::Delete | Op::UpdateDelete => { + side_update.ht.delete_row(key, row)?; + } + } + } else { + // Row which violates null-safe bitmap will never be matched so we need not + // store. + // Noop here because we only support left outer AsOf join. 
+ } + join_matched_rows_metrics.observe(join_matched_rows_cnt as _); + if join_matched_rows_cnt > high_join_amplification_threshold { + let join_key_data_types = side_update.ht.join_key_data_types(); + let key = key.deserialize(join_key_data_types)?; + tracing::warn!(target: "high_join_amplification", + matched_rows_len = join_matched_rows_cnt, + update_table_id = side_update.ht.table_id(), + match_table_id = side_match.ht.table_id(), + join_key = ?key, + actor_id = ctx.id, + fragment_id = ctx.fragment_id, + "large rows matched for join key when AsOf join updating right side", + ); + } + } + if let Some(chunk) = join_chunk_builder.take() { + yield chunk; + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicU64; + + use risingwave_common::array::*; + use risingwave_common::catalog::{ColumnDesc, ColumnId, Field, TableId}; + use risingwave_common::hash::Key64; + use risingwave_common::util::epoch::test_epoch; + use risingwave_common::util::sort_util::OrderType; + use risingwave_storage::memory::MemoryStateStore; + + use super::*; + use crate::common::table::test_utils::gen_pbtable; + use crate::executor::test_utils::{MessageSender, MockSource, StreamExecutorTestExt}; + + async fn create_in_memory_state_table( + mem_state: MemoryStateStore, + data_types: &[DataType], + order_types: &[OrderType], + pk_indices: &[usize], + table_id: u32, + ) -> (StateTable, StateTable) { + let column_descs = data_types + .iter() + .enumerate() + .map(|(id, data_type)| ColumnDesc::unnamed(ColumnId::new(id as i32), data_type.clone())) + .collect_vec(); + let state_table = StateTable::from_table_catalog( + &gen_pbtable( + TableId::new(table_id), + column_descs, + order_types.to_vec(), + pk_indices.to_vec(), + 0, + ), + mem_state.clone(), + None, + ) + .await; + + // Create degree table + let mut degree_table_column_descs = vec![]; + pk_indices.iter().enumerate().for_each(|(pk_id, idx)| { + degree_table_column_descs.push(ColumnDesc::unnamed( + ColumnId::new(pk_id as i32), + data_types[*idx].clone(), + )) + }); + degree_table_column_descs.push(ColumnDesc::unnamed( + ColumnId::new(pk_indices.len() as i32), + DataType::Int64, + )); + let degree_state_table = StateTable::from_table_catalog( + &gen_pbtable( + TableId::new(table_id + 1), + degree_table_column_descs, + order_types.to_vec(), + pk_indices.to_vec(), + 0, + ), + mem_state, + None, + ) + .await; + (state_table, degree_state_table) + } + + async fn create_executor( + asof_desc: AsOfDesc, + ) -> (MessageSender, MessageSender, BoxedMessageStream) { + let schema = Schema { + fields: vec![ + Field::unnamed(DataType::Int64), // join key + Field::unnamed(DataType::Int64), + Field::unnamed(DataType::Int64), + ], + }; + let (tx_l, source_l) = MockSource::channel(); + let source_l = source_l.into_executor(schema.clone(), vec![1]); + let (tx_r, source_r) = MockSource::channel(); + let source_r = source_r.into_executor(schema, vec![1]); + let params_l = JoinParams::new(vec![0], vec![1]); + let params_r = JoinParams::new(vec![0], vec![1]); + + let mem_state = MemoryStateStore::new(); + + let (state_l, degree_state_l) = create_in_memory_state_table( + mem_state.clone(), + &[DataType::Int64, DataType::Int64, DataType::Int64], + &[ + OrderType::ascending(), + OrderType::ascending(), + OrderType::ascending(), + ], + &[0, asof_desc.left_idx, 1], + 0, + ) + .await; + + let (state_r, degree_state_r) = create_in_memory_state_table( + mem_state, + &[DataType::Int64, DataType::Int64, DataType::Int64], + &[ + OrderType::ascending(), + OrderType::ascending(), + 
OrderType::ascending(), + ], + &[0, asof_desc.right_idx, 1], + 2, + ) + .await; + + let schema: Schema = [source_l.schema().fields(), source_r.schema().fields()] + .concat() + .into_iter() + .collect(); + let schema_len = schema.len(); + let info = ExecutorInfo { + schema, + pk_indices: vec![1], + identity: "HashJoinExecutor".to_string(), + }; + + let executor = AsOfJoinExecutor::::new( + ActorContext::for_test(123), + info, + source_l, + source_r, + params_l, + params_r, + vec![false], + (0..schema_len).collect_vec(), + state_l, + degree_state_l, + state_r, + degree_state_r, + Arc::new(AtomicU64::new(0)), + Arc::new(StreamingMetrics::unused()), + 1024, + 2048, + asof_desc, + ); + (tx_l, tx_r, executor.boxed().execute()) + } + + #[tokio::test] + async fn test_as_of_inner_join() -> StreamExecutorResult<()> { + let asof_desc = AsOfDesc { + left_idx: 0, + right_idx: 2, + inequality_type: AsOfInequalityType::Lt, + }; + + let chunk_l1 = StreamChunk::from_pretty( + " I I I + + 1 4 7 + + 2 5 8 + + 3 6 9", + ); + let chunk_l2 = StreamChunk::from_pretty( + " I I I + + 3 8 1 + - 3 8 1", + ); + let chunk_r1 = StreamChunk::from_pretty( + " I I I + + 2 1 7 + + 2 2 1 + + 2 3 4 + + 2 4 2 + + 6 1 9 + + 6 2 9", + ); + let chunk_r2 = StreamChunk::from_pretty( + " I I I + - 2 3 4", + ); + let chunk_r3 = StreamChunk::from_pretty( + " I I I + + 2 3 3", + ); + let chunk_l3 = StreamChunk::from_pretty( + " I I I + - 2 5 8", + ); + let chunk_l4 = StreamChunk::from_pretty( + " I I I + + 6 3 1 + + 6 4 1", + ); + let chunk_r4 = StreamChunk::from_pretty( + " I I I + - 6 1 9", + ); + + let (mut tx_l, mut tx_r, mut hash_join) = + create_executor::<{ AsOfJoinType::Inner }>(asof_desc).await; + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(1), false); + tx_r.push_barrier(test_epoch(1), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 1st left chunk + tx_l.push_chunk(chunk_l1); + hash_join.next_unwrap_pending(); + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(2), false); + tx_r.push_barrier(test_epoch(2), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 2nd left chunk + tx_l.push_chunk(chunk_l2); + hash_join.next_unwrap_pending(); + + // push the 1st right chunk + tx_r.push_chunk(chunk_r1); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 2 5 8 2 1 7 + - 2 5 8 2 1 7 + + 2 5 8 2 3 4" + ) + ); + + // push the 2nd right chunk + tx_r.push_chunk(chunk_r2); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 3 4 + + 2 5 8 2 1 7" + ) + ); + + // push the 3rd right chunk + tx_r.push_chunk(chunk_r3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 1 7 + + 2 5 8 2 3 3" + ) + ); + + // push the 3rd left chunk + tx_l.push_chunk(chunk_l3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 3 3" + ) + ); + + // push the 4th left chunk + tx_l.push_chunk(chunk_l4); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 6 3 1 6 1 9 + + 6 4 1 6 1 9" + ) + ); + + // push the 4th right chunk + tx_r.push_chunk(chunk_r4); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 6 3 1 6 1 9 + + 6 3 1 6 2 9 + - 
6 4 1 6 1 9 + + 6 4 1 6 2 9" + ) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_as_of_left_outer_join() -> StreamExecutorResult<()> { + let asof_desc = AsOfDesc { + left_idx: 1, + right_idx: 2, + inequality_type: AsOfInequalityType::Ge, + }; + + let chunk_l1 = StreamChunk::from_pretty( + " I I I + + 1 4 7 + + 2 5 8 + + 3 6 9", + ); + let chunk_l2 = StreamChunk::from_pretty( + " I I I + + 3 8 1 + - 3 8 1", + ); + let chunk_r1 = StreamChunk::from_pretty( + " I I I + + 2 3 4 + + 2 2 5 + + 2 1 5 + + 6 1 8 + + 6 2 9", + ); + let chunk_r2 = StreamChunk::from_pretty( + " I I I + - 2 3 4 + - 2 1 5 + - 2 2 5", + ); + let chunk_l3 = StreamChunk::from_pretty( + " I I I + + 6 8 9", + ); + let chunk_r3 = StreamChunk::from_pretty( + " I I I + - 6 1 8", + ); + + let (mut tx_l, mut tx_r, mut hash_join) = + create_executor::<{ AsOfJoinType::LeftOuter }>(asof_desc).await; + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(1), false); + tx_r.push_barrier(test_epoch(1), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 1st left chunk + tx_l.push_chunk(chunk_l1); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 1 4 7 . . . + + 2 5 8 . . . + + 3 6 9 . . ." + ) + ); + + // push the init barrier for left and right + tx_l.push_barrier(test_epoch(2), false); + tx_r.push_barrier(test_epoch(2), false); + hash_join.next_unwrap_ready_barrier()?; + + // push the 2nd left chunk + tx_l.push_chunk(chunk_l2); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 3 8 1 . . . + - 3 8 1 . . ." + ) + ); + + // push the 1st right chunk + tx_r.push_chunk(chunk_r1); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 . . . + + 2 5 8 2 3 4 + - 2 5 8 2 3 4 + + 2 5 8 2 2 5 + - 2 5 8 2 2 5 + + 2 5 8 2 1 5" + ) + ); + + // push the 2nd right chunk + tx_r.push_chunk(chunk_r2); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 2 5 8 2 1 5 + + 2 5 8 2 2 5 + - 2 5 8 2 2 5 + + 2 5 8 . . ." + ) + ); + + // push the 3rd left chunk + tx_l.push_chunk(chunk_l3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + + 6 8 9 6 1 8" + ) + ); + + // push the 3rd right chunk + tx_r.push_chunk(chunk_r3); + let chunk = hash_join.next_unwrap_ready_chunk()?; + assert_eq!( + chunk, + StreamChunk::from_pretty( + " I I I I I I + - 6 8 9 6 1 8 + + 6 8 9 . . ." 
+ ) + ); + Ok(()) + } +} diff --git a/src/stream/src/executor/dedup/append_only_dedup.rs b/src/stream/src/executor/dedup/append_only_dedup.rs index fa110b9f84f17..0a38c82f294fd 100644 --- a/src/stream/src/executor/dedup/append_only_dedup.rs +++ b/src/stream/src/executor/dedup/append_only_dedup.rs @@ -188,6 +188,7 @@ mod tests { use risingwave_storage::memory::MemoryStateStore; use super::*; + use crate::common::table::test_utils::gen_pbtable; use crate::executor::test_utils::MockSource; #[tokio::test] @@ -206,12 +207,10 @@ mod tests { let order_types = vec![OrderType::ascending()]; let state_store = MemoryStateStore::new(); - let state_table = StateTable::new_without_distribution( + let state_table = StateTable::from_table_catalog( + &gen_pbtable(table_id, column_descs, order_types, pk_indices.clone(), 0), state_store, - table_id, - column_descs, - order_types, - pk_indices.clone(), + None, ) .await; diff --git a/src/stream/src/executor/dynamic_filter.rs b/src/stream/src/executor/dynamic_filter.rs index fa0a5624b8ad5..d34de5c009ddb 100644 --- a/src/stream/src/executor/dynamic_filter.rs +++ b/src/stream/src/executor/dynamic_filter.rs @@ -527,27 +527,32 @@ mod tests { use risingwave_storage::table::batch_table::storage_table::StorageTable; use super::*; + use crate::common::table::test_utils::gen_pbtable; use crate::executor::test_utils::{MessageSender, MockSource, StreamExecutorTestExt}; async fn create_in_memory_state_table( mem_state: MemoryStateStore, ) -> (StateTable, StateTable) { - let column_descs = ColumnDesc::unnamed(ColumnId::new(0), DataType::Int64); + let column_descs = vec![ColumnDesc::unnamed(ColumnId::new(0), DataType::Int64)]; + let order_types = vec![OrderType::ascending()]; + let pk_indices = vec![0]; // TODO: use consistent operations for dynamic filter - let state_table_l = StateTable::new_without_distribution_inconsistent_op( + let state_table_l = StateTable::from_table_catalog( + &gen_pbtable( + TableId::new(0), + column_descs.clone(), + order_types.clone(), + pk_indices.clone(), + 0, + ), mem_state.clone(), - TableId::new(0), - vec![column_descs.clone()], - vec![OrderType::ascending()], - vec![0], + None, ) .await; - let state_table_r = StateTable::new_without_distribution_inconsistent_op( + let state_table_r = StateTable::from_table_catalog( + &gen_pbtable(TableId::new(1), column_descs, order_types, pk_indices, 0), mem_state, - TableId::new(1), - vec![column_descs], - vec![OrderType::ascending()], - vec![0], + None, ) .await; (state_table_l, state_table_r) diff --git a/src/stream/src/executor/hash_join.rs b/src/stream/src/executor/hash_join.rs index e1a1b177bcfcc..2920d5799feb7 100644 --- a/src/stream/src/executor/hash_join.rs +++ b/src/stream/src/executor/hash_join.rs @@ -396,6 +396,7 @@ impl HashJoinExecutor HashJoinExecutor; +type InequalKeyType = Vec; pub type StateValueType = EncodedJoinRow; pub type HashValueType = Box; @@ -154,6 +157,21 @@ impl JoinHashMapMetrics { } } +/// Inequality key description for `AsOf` join. +struct InequalityKeyDesc { + idx: usize, + serializer: OrderedRowSerde, +} + +impl InequalityKeyDesc { + /// Serialize the inequality key from a row. + pub fn serialize_inequal_key_from_row(&self, row: impl Row) -> InequalKeyType { + let indices = vec![self.idx]; + let inequality_key = row.project(&indices); + inequality_key.memcmp_serialize(&self.serializer) + } +} + pub struct JoinHashMap { /// Store the join states. inner: JoinHashMapInner, @@ -182,6 +200,8 @@ pub struct JoinHashMap { need_degree_table: bool, /// Pk is part of the join key. 
pk_contained_in_jk: bool, + /// Inequality key description for `AsOf` join. + inequality_key_desc: Option, /// Metrics of the hash map metrics: JoinHashMapMetrics, } @@ -230,6 +250,7 @@ impl JoinHashMap { null_matched: K::Bitmap, need_degree_table: bool, pk_contained_in_jk: bool, + inequality_key_idx: Option, metrics: Arc, actor_id: ActorId, fragment_id: FragmentId, @@ -246,6 +267,14 @@ impl JoinHashMap { vec![OrderType::ascending(); state_pk_indices.len()], ); + let inequality_key_desc = inequality_key_idx.map(|idx| { + let serializer = OrderedRowSerde::new( + vec![state_all_data_types[idx].clone()], + vec![OrderType::ascending()], + ); + InequalityKeyDesc { idx, serializer } + }); + let join_table_id = state_table.table_id(); let state = TableInner { pk_indices: state_pk_indices, @@ -286,6 +315,7 @@ impl JoinHashMap { degree_state, need_degree_table, pk_contained_in_jk, + inequality_key_desc, metrics: JoinHashMapMetrics::new(&metrics, actor_id, fragment_id, side, join_table_id), } } @@ -427,11 +457,16 @@ impl JoinHashMap { let degree_i64 = degree_row .datum_at(degree_row.len() - 1) .expect("degree should not be NULL"); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(row.row())); entry_state .insert( pk, JoinRow::new(row.row(), degree_i64.into_int64() as u64) .encode(), + inequality_key, ) .with_context(|| self.state.error_context(row.row()))?; } @@ -459,6 +494,10 @@ impl JoinHashMap { .as_ref() .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(row.row())); let degree_i64 = degree_row .datum_at(degree_row.len() - 1) .expect("degree should not be NULL"); @@ -466,6 +505,7 @@ impl JoinHashMap { .insert( pk, JoinRow::new(row.row(), degree_i64.into_int64() as u64).encode(), + inequality_key, ) .with_context(|| self.state.error_context(row.row()))?; } @@ -486,8 +526,12 @@ impl JoinHashMap { .as_ref() .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(row.row())); entry_state - .insert(pk, JoinRow::new(row.row(), 0).encode()) + .insert(pk, JoinRow::new(row.row(), 0).encode(), inequality_key) .with_context(|| self.state.error_context(row.row()))?; } }; @@ -511,9 +555,12 @@ impl JoinHashMap { /// Insert a join row #[allow(clippy::unused_async)] pub async fn insert(&mut self, key: &K, value: JoinRow) -> StreamExecutorResult<()> { - let pk = (&value.row) - .project(&self.state.pk_indices) - .memcmp_serialize(&self.pk_serializer); + let pk = self.serialize_pk_from_row(&value.row); + + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value.row)); // TODO(yuhao): avoid this `contains`. // https://github.com/risingwavelabs/risingwave/issues/9233 @@ -521,14 +568,14 @@ impl JoinHashMap { // Update cache let mut entry = self.inner.get_mut(key).unwrap(); entry - .insert(pk, value.encode()) + .insert(pk, value.encode(), inequality_key) .with_context(|| self.state.error_context(&value.row))?; } else if self.pk_contained_in_jk { // Refill cache when the join key exist in neither cache or storage. 
self.metrics.insert_cache_miss_count += 1; let mut state = JoinEntryState::default(); state - .insert(pk, value.encode()) + .insert(pk, value.encode(), inequality_key) .with_context(|| self.state.error_context(&value.row))?; self.update_state(key, state.into()); } @@ -545,24 +592,25 @@ impl JoinHashMap { #[allow(clippy::unused_async)] pub async fn insert_row(&mut self, key: &K, value: impl Row) -> StreamExecutorResult<()> { let join_row = JoinRow::new(&value, 0); - let pk = (&value) - .project(&self.state.pk_indices) - .memcmp_serialize(&self.pk_serializer); - + let pk = self.serialize_pk_from_row(&value); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value)); // TODO(yuhao): avoid this `contains`. // https://github.com/risingwavelabs/risingwave/issues/9233 if self.inner.contains(key) { // Update cache let mut entry = self.inner.get_mut(key).unwrap(); entry - .insert(pk, join_row.encode()) + .insert(pk, join_row.encode(), inequality_key) .with_context(|| self.state.error_context(&value))?; } else if self.pk_contained_in_jk { // Refill cache when the join key exist in neither cache or storage. self.metrics.insert_cache_miss_count += 1; let mut state = JoinEntryState::default(); state - .insert(pk, join_row.encode()) + .insert(pk, join_row.encode(), inequality_key) .with_context(|| self.state.error_context(&value))?; self.update_state(key, state.into()); } @@ -578,8 +626,12 @@ impl JoinHashMap { let pk = (&value.row) .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value.row)); entry - .remove(pk) + .remove(pk, inequality_key.as_ref()) .with_context(|| self.state.error_context(&value.row))?; } @@ -597,8 +649,13 @@ impl JoinHashMap { let pk = (&value) .project(&self.state.pk_indices) .memcmp_serialize(&self.pk_serializer); + + let inequality_key = self + .inequality_key_desc + .as_ref() + .map(|desc| desc.serialize_inequal_key_from_row(&value)); entry - .remove(pk) + .remove(pk, inequality_key.as_ref()) .with_context(|| self.state.error_context(&value))?; } @@ -680,6 +737,29 @@ impl JoinHashMap { pub fn join_key_data_types(&self) -> &[DataType] { &self.join_key_data_types } + + /// Return true if the inequality key is null. + /// # Panics + /// Panics if the inequality key is not set. + pub fn check_inequal_key_null(&self, row: &impl Row) -> bool { + let desc = self.inequality_key_desc.as_ref().unwrap(); + row.datum_at(desc.idx).is_none() + } + + /// Serialize the inequality key from a row. + /// # Panics + /// Panics if the inequality key is not set. + pub fn serialize_inequal_key_from_row(&self, row: impl Row) -> InequalKeyType { + self.inequality_key_desc + .as_ref() + .unwrap() + .serialize_inequal_key_from_row(&row) + } + + pub fn serialize_pk_from_row(&self, row: impl Row) -> PkType { + row.project(&self.state.pk_indices) + .memcmp_serialize(&self.pk_serializer) + } } use risingwave_common_estimate_size::KvSize; @@ -695,7 +775,9 @@ use super::*; #[derive(Default)] pub struct JoinEntryState { /// The full copy of the state. - cached: join_row_set::JoinRowSet, + cached: JoinRowSet, + /// Index used for AS OF join. The key is inequal column value. The value is the primary key in `cached`. 
+ inequality_index: JoinRowSet>, kv_heap_size: KvSize, } @@ -710,9 +792,11 @@ impl EstimateSize for JoinEntryState { #[derive(Error, Debug)] pub enum JoinEntryError { #[error("double inserting a join state entry")] - OccupiedError, + Occupied, #[error("removing a join state entry but it is not in the cache")] - RemoveError, + Remove, + #[error("retrieving a pk from the inequality index but it is not in the cache")] + InequalIndex, } impl JoinEntryState { @@ -721,11 +805,15 @@ impl JoinEntryState { &mut self, key: PkType, value: StateValueType, + inequality_key: Option, ) -> Result<&mut StateValueType, JoinEntryError> { let mut removed = false; if !enable_strict_consistency() { // strict consistency is off, let's remove existing (if any) first if let Some(old_value) = self.cached.remove(&key) { + if let Some(inequality_key) = inequality_key.as_ref() { + self.remove_pk_from_inequality_index(&key, inequality_key); + } self.kv_heap_size.sub(&key, &old_value); removed = true; } @@ -733,6 +821,9 @@ impl JoinEntryState { self.kv_heap_size.add(&key, &value); + if let Some(inequality_key) = inequality_key { + self.insert_pk_to_inequality_index(key.clone(), inequality_key); + } let ret = self.cached.try_insert(key.clone(), value); if !enable_strict_consistency() { @@ -743,22 +834,77 @@ impl JoinEntryState { } } - ret.map_err(|_| JoinEntryError::OccupiedError) + ret.map_err(|_| JoinEntryError::Occupied) } /// Delete from the cache. - pub fn remove(&mut self, pk: PkType) -> Result<(), JoinEntryError> { + pub fn remove( + &mut self, + pk: PkType, + inequality_key: Option<&InequalKeyType>, + ) -> Result<(), JoinEntryError> { if let Some(value) = self.cached.remove(&pk) { self.kv_heap_size.sub(&pk, &value); + if let Some(inequality_key) = inequality_key { + self.remove_pk_from_inequality_index(&pk, inequality_key); + } Ok(()) } else if enable_strict_consistency() { - Err(JoinEntryError::RemoveError) + Err(JoinEntryError::Remove) } else { consistency_error!(?pk, "removing a join state entry but it's not in the cache"); Ok(()) } } + fn remove_pk_from_inequality_index(&mut self, pk: &PkType, inequality_key: &InequalKeyType) { + if let Some(pk_set) = self.inequality_index.get_mut(inequality_key) { + if pk_set.remove(pk).is_none() { + if enable_strict_consistency() { + panic!("removing a pk that it not in the inequality index"); + } else { + consistency_error!(?pk, "removing a pk that it not in the inequality index"); + }; + } else { + self.kv_heap_size.sub(pk, &()); + } + if pk_set.is_empty() { + self.inequality_index.remove(inequality_key); + } + } + } + + fn insert_pk_to_inequality_index(&mut self, pk: PkType, inequality_key: InequalKeyType) { + if let Some(pk_set) = self.inequality_index.get_mut(&inequality_key) { + let pk_size = pk.estimated_size(); + if pk_set.try_insert(pk, ()).is_err() { + if enable_strict_consistency() { + panic!("inserting a pk that it already in the inequality index"); + } else { + consistency_error!("inserting a pk that it already in the inequality index"); + }; + } else { + self.kv_heap_size.add_size(pk_size); + } + } else { + let mut pk_set = JoinRowSet::default(); + pk_set.try_insert(pk, ()).unwrap(); + self.inequality_index + .try_insert(inequality_key, pk_set) + .unwrap(); + } + } + + pub fn get( + &self, + pk: &PkType, + data_types: &[DataType], + ) -> Option>> { + self.cached + .get(pk) + .map(|encoded| encoded.decode(data_types)) + } + /// Note: the first item in the tuple is the mutable reference to the value in this entry, while /// the second item is the decoded value. 
To mutate the degree, one **must not** forget to apply /// the changes to the first item. @@ -782,6 +928,92 @@ impl JoinEntryState { pub fn len(&self) -> usize { self.cached.len() } + + /// Range scan the cache using the inequality index. + pub fn range_by_inequality<'a, R>( + &'a self, + range: R, + data_types: &'a [DataType], + ) -> impl Iterator>> + 'a + where + R: RangeBounds + 'a, + { + self.inequality_index.range(range).flat_map(|(_, pk_set)| { + pk_set + .keys() + .flat_map(|pk| self.get_by_indexed_pk(pk, data_types)) + }) + } + + /// Get the records whose inequality key upper bound satisfy the given bound. + pub fn upper_bound_by_inequality<'a>( + &'a self, + bound: Bound<&InequalKeyType>, + data_types: &'a [DataType], + ) -> Option>> { + if let Some((_, pk_set)) = self.inequality_index.upper_bound(bound) { + if let Some(pk) = pk_set.first_key_sorted() { + self.get_by_indexed_pk(pk, data_types) + } else { + panic!("pk set for a index record must has at least one element"); + } + } else { + None + } + } + + pub fn get_by_indexed_pk( + &self, + pk: &PkType, + data_types: &[DataType], + ) -> Option>> +where { + if let Some(value) = self.cached.get(pk) { + Some(value.decode(data_types)) + } else if enable_strict_consistency() { + Some(Err(anyhow!(JoinEntryError::InequalIndex).into())) + } else { + consistency_error!(?pk, "{}", JoinEntryError::InequalIndex.as_report()); + None + } + } + + /// Get the records whose inequality key lower bound satisfy the given bound. + pub fn lower_bound_by_inequality<'a>( + &'a self, + bound: Bound<&InequalKeyType>, + data_types: &'a [DataType], + ) -> Option>> { + if let Some((_, pk_set)) = self.inequality_index.lower_bound(bound) { + if let Some(pk) = pk_set.first_key_sorted() { + self.get_by_indexed_pk(pk, data_types) + } else { + panic!("pk set for a index record must has at least one element"); + } + } else { + None + } + } + + pub fn get_first_by_inequality<'a>( + &'a self, + inequality_key: &InequalKeyType, + data_types: &'a [DataType], + ) -> Option>> { + if let Some(pk_set) = self.inequality_index.get(inequality_key) { + if let Some(pk) = pk_set.first_key_sorted() { + self.get_by_indexed_pk(pk, data_types) + } else { + panic!("pk set for a index record must has at least one element"); + } + } else { + None + } + } + + pub fn inequality_index(&self) -> &JoinRowSet> { + &self.inequality_index + } } #[cfg(test)] @@ -795,16 +1027,36 @@ mod tests { fn insert_chunk( managed_state: &mut JoinEntryState, pk_indices: &[usize], + col_types: &[DataType], + inequality_key_idx: Option, data_chunk: &DataChunk, ) { + let pk_col_type = pk_indices + .iter() + .map(|idx| col_types[*idx].clone()) + .collect_vec(); + let pk_serializer = + OrderedRowSerde::new(pk_col_type, vec![OrderType::ascending(); pk_indices.len()]); + let inequality_key_type = inequality_key_idx.map(|idx| col_types[idx].clone()); + let inequality_key_serializer = inequality_key_type + .map(|data_type| OrderedRowSerde::new(vec![data_type], vec![OrderType::ascending()])); for row_ref in data_chunk.rows() { let row: OwnedRow = row_ref.into_owned_row(); let value_indices = (0..row.len() - 1).collect_vec(); let pk = pk_indices.iter().map(|idx| row[*idx].clone()).collect_vec(); // Pk is only a `i64` here, so encoding method does not matter. 
- let pk = OwnedRow::new(pk).project(&value_indices).value_serialize(); + let pk = OwnedRow::new(pk) + .project(&value_indices) + .memcmp_serialize(&pk_serializer); + let inequality_key = inequality_key_idx.map(|idx| { + (&row) + .project(&[idx]) + .memcmp_serialize(inequality_key_serializer.as_ref().unwrap()) + }); let join_row = JoinRow { row, degree: 0 }; - managed_state.insert(pk, join_row.encode()).unwrap(); + managed_state + .insert(pk, join_row.encode(), inequality_key) + .unwrap(); } } @@ -826,7 +1078,7 @@ mod tests { } #[tokio::test] - async fn test_managed_all_or_none_state() { + async fn test_managed_join_state() { let mut managed_state = JoinEntryState::default(); let col_types = vec![DataType::Int64, DataType::Int64]; let pk_indices = [0]; @@ -841,7 +1093,13 @@ mod tests { ); // `Vec` in state - insert_chunk(&mut managed_state, &pk_indices, &data_chunk1); + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + None, + &data_chunk1, + ); check(&mut managed_state, &col_types, &col1, &col2); // `BtreeMap` in state @@ -852,7 +1110,76 @@ mod tests { 5 8 4 9", ); - insert_chunk(&mut managed_state, &pk_indices, &data_chunk2); + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + None, + &data_chunk2, + ); check(&mut managed_state, &col_types, &col1, &col2); } + + #[tokio::test] + async fn test_managed_join_state_w_inequality_index() { + let mut managed_state = JoinEntryState::default(); + let col_types = vec![DataType::Int64, DataType::Int64]; + let pk_indices = [0]; + let inequality_key_idx = Some(1); + let inequality_key_serializer = + OrderedRowSerde::new(vec![DataType::Int64], vec![OrderType::ascending()]); + + let col1 = [3, 2, 1]; + let col2 = [4, 5, 5]; + let data_chunk1 = DataChunk::from_pretty( + "I I + 3 4 + 2 5 + 1 5", + ); + + // `Vec` in state + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + inequality_key_idx, + &data_chunk1, + ); + check(&mut managed_state, &col_types, &col1, &col2); + let bound = OwnedRow::new(vec![Some(ScalarImpl::Int64(5))]) + .memcmp_serialize(&inequality_key_serializer); + let row = managed_state + .upper_bound_by_inequality(Bound::Included(&bound), &col_types) + .unwrap() + .unwrap(); + assert_eq!(row.row[0], Some(ScalarImpl::Int64(1))); + let row = managed_state + .upper_bound_by_inequality(Bound::Excluded(&bound), &col_types) + .unwrap() + .unwrap(); + assert_eq!(row.row[0], Some(ScalarImpl::Int64(3))); + + // `BtreeMap` in state + let col1 = [1, 2, 3, 4, 5]; + let col2 = [5, 5, 4, 4, 8]; + let data_chunk2 = DataChunk::from_pretty( + "I I + 5 8 + 4 4", + ); + insert_chunk( + &mut managed_state, + &pk_indices, + &col_types, + inequality_key_idx, + &data_chunk2, + ); + check(&mut managed_state, &col_types, &col1, &col2); + + let bound = OwnedRow::new(vec![Some(ScalarImpl::Int64(8))]) + .memcmp_serialize(&inequality_key_serializer); + let row = managed_state.lower_bound_by_inequality(Bound::Excluded(&bound), &col_types); + assert!(row.is_none()); + } } diff --git a/src/stream/src/executor/join/join_row_set.rs b/src/stream/src/executor/join/join_row_set.rs index de6f5ce2f0279..b34e163410eec 100644 --- a/src/stream/src/executor/join/join_row_set.rs +++ b/src/stream/src/executor/join/join_row_set.rs @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
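`insert_pk_to_inequality_index` and `remove_pk_from_inequality_index` above keep a secondary index from the serialized inequality key to the pks of cached rows sharing it, and drop a key once its pk set becomes empty. A stripped-down sketch of that bookkeeping, assuming std `BTreeMap`/`BTreeSet` as stand-ins for `JoinRowSet` and byte vectors as stand-ins for the serialized keys:

use std::collections::{BTreeMap, BTreeSet};

type Key = Vec<u8>; // memcmp-serialized inequality key
type Pk = Vec<u8>; // memcmp-serialized primary key

/// Stand-in for the `inequality_index` field: inequality key -> pk set.
#[derive(Default)]
struct InequalityIndex {
    index: BTreeMap<Key, BTreeSet<Pk>>,
}

impl InequalityIndex {
    fn insert(&mut self, key: Key, pk: Pk) {
        // Several cached rows may carry the same inequality value.
        self.index.entry(key).or_default().insert(pk);
    }

    fn remove(&mut self, key: &[u8], pk: &[u8]) {
        if let Some(pks) = self.index.get_mut(key) {
            pks.remove(pk);
            // Drop the key once its pk set is empty so that range scans and
            // bound lookups never see stale inequality keys.
            if pks.is_empty() {
                self.index.remove(key);
            }
        }
    }

    /// Smallest pk stored under `key`, mirroring `first_key_sorted`.
    fn first_pk(&self, key: &[u8]) -> Option<&Pk> {
        self.index.get(key).and_then(|pks| pks.iter().next())
    }
}

fn main() {
    let mut idx = InequalityIndex::default();
    idx.insert(b"k5".to_vec(), b"pk2".to_vec());
    idx.insert(b"k5".to_vec(), b"pk1".to_vec());
    assert_eq!(idx.first_pk(b"k5"), Some(&b"pk1".to_vec()));
    idx.remove(b"k5", b"pk1");
    idx.remove(b"k5", b"pk2");
    assert!(idx.first_pk(b"k5").is_none());
}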
+use std::borrow::Borrow; use std::collections::btree_map::OccupiedError as BTreeMapOccupiedError; use std::collections::BTreeMap; use std::fmt::Debug; use std::mem; +use std::ops::{Bound, RangeBounds}; use auto_enums::auto_enum; use enum_as_inner::EnumAsInner; @@ -110,6 +112,13 @@ impl JoinRowSet { } } + pub fn is_empty(&self) -> bool { + match self { + Self::BTree(inner) => inner.is_empty(), + Self::Vec(inner) => inner.is_empty(), + } + } + #[auto_enum(Iterator)] pub fn values_mut(&mut self) -> impl Iterator { match self { @@ -117,4 +126,161 @@ impl JoinRowSet { Self::Vec(inner) => inner.iter_mut().map(|(_, v)| v), } } + + #[auto_enum(Iterator)] + pub fn keys(&self) -> impl Iterator { + match self { + Self::BTree(inner) => inner.keys(), + Self::Vec(inner) => inner.iter().map(|(k, _v)| k), + } + } + + #[auto_enum(Iterator)] + pub fn range(&self, range: R) -> impl Iterator + where + T: Ord + ?Sized, + K: Borrow + Ord, + R: RangeBounds, + { + match self { + Self::BTree(inner) => inner.range(range), + Self::Vec(inner) => inner + .iter() + .filter(move |(k, _)| range.contains(k.borrow())) + .map(|(k, v)| (k, v)), + } + } + + pub fn lower_bound_key(&self, bound: Bound<&K>) -> Option<&K> { + self.lower_bound(bound).map(|(k, _v)| k) + } + + pub fn upper_bound_key(&self, bound: Bound<&K>) -> Option<&K> { + self.upper_bound(bound).map(|(k, _v)| k) + } + + pub fn lower_bound(&self, bound: Bound<&K>) -> Option<(&K, &V)> { + match self { + Self::BTree(inner) => inner.lower_bound(bound).next(), + Self::Vec(inner) => inner + .iter() + .filter(|(k, _)| (bound, Bound::Unbounded).contains(k)) + .min_by_key(|(k, _)| k) + .map(|(k, v)| (k, v)), + } + } + + pub fn upper_bound(&self, bound: Bound<&K>) -> Option<(&K, &V)> { + match self { + Self::BTree(inner) => inner.upper_bound(bound).prev(), + Self::Vec(inner) => inner + .iter() + .filter(|(k, _)| (Bound::Unbounded, bound).contains(k)) + .max_by_key(|(k, _)| k) + .map(|(k, v)| (k, v)), + } + } + + pub fn get_mut(&mut self, key: &K) -> Option<&mut V> { + match self { + Self::BTree(inner) => inner.get_mut(key), + Self::Vec(inner) => inner.iter_mut().find(|(k, _)| k == key).map(|(_, v)| v), + } + } + + pub fn get(&self, key: &K) -> Option<&V> { + match self { + Self::BTree(inner) => inner.get(key), + Self::Vec(inner) => inner.iter().find(|(k, _)| k == key).map(|(_, v)| v), + } + } + + /// Returns the key-value pair with smallest key in the map. + pub fn first_key_sorted(&self) -> Option<&K> { + match self { + Self::BTree(inner) => inner.first_key_value().map(|(k, _)| k), + Self::Vec(inner) => inner.iter().map(|(k, _)| k).min(), + } + } + + /// Returns the key-value pair with the second smallest key in the map. 
+ pub fn second_key_sorted(&self) -> Option<&K> { + match self { + Self::BTree(inner) => inner.iter().nth(1).map(|(k, _)| k), + Self::Vec(inner) => { + let mut res = None; + let mut smallest = None; + for (k, _) in inner { + if let Some(smallest_k) = smallest { + if k < smallest_k { + res = Some(smallest_k); + smallest = Some(k); + } else if let Some(res_k) = res { + if k < res_k { + res = Some(k); + } + } else { + res = Some(k); + } + } else { + smallest = Some(k); + } + } + res + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_join_row_set_bounds() { + let mut join_row_set: JoinRowSet = JoinRowSet::default(); + + // Insert elements + assert!(join_row_set.try_insert(1, 10).is_ok()); + assert!(join_row_set.try_insert(2, 20).is_ok()); + assert!(join_row_set.try_insert(3, 30).is_ok()); + + // Check lower bound + assert_eq!(join_row_set.lower_bound_key(Bound::Included(&2)), Some(&2)); + assert_eq!(join_row_set.lower_bound_key(Bound::Excluded(&2)), Some(&3)); + + // Check upper bound + assert_eq!(join_row_set.upper_bound_key(Bound::Included(&2)), Some(&2)); + assert_eq!(join_row_set.upper_bound_key(Bound::Excluded(&2)), Some(&1)); + } + + #[test] + fn test_join_row_set_first_and_second_key_sorted() { + { + let mut join_row_set: JoinRowSet = JoinRowSet::default(); + + // Insert elements + assert!(join_row_set.try_insert(3, 30).is_ok()); + assert!(join_row_set.try_insert(1, 10).is_ok()); + assert!(join_row_set.try_insert(2, 20).is_ok()); + + // Check first key sorted + assert_eq!(join_row_set.first_key_sorted(), Some(&1)); + + // Check second key sorted + assert_eq!(join_row_set.second_key_sorted(), Some(&2)); + } + { + let mut join_row_set: JoinRowSet = JoinRowSet::default(); + + // Insert elements + assert!(join_row_set.try_insert(1, 10).is_ok()); + assert!(join_row_set.try_insert(2, 20).is_ok()); + + // Check first key sorted + assert_eq!(join_row_set.first_key_sorted(), Some(&1)); + + // Check second key sorted + assert_eq!(join_row_set.second_key_sorted(), Some(&2)); + } + } } diff --git a/src/stream/src/executor/join/mod.rs b/src/stream/src/executor/join/mod.rs index b8bd5ff84d95f..ea53a7992f265 100644 --- a/src/stream/src/executor/join/mod.rs +++ b/src/stream/src/executor/join/mod.rs @@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
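The `lower_bound`/`upper_bound` helpers added to `JoinRowSet` follow the usual tree-map cursor semantics: the smallest entry at or above the bound, and the largest entry at or below it. On stable Rust the same lookups can be expressed with `range`, which is effectively what the `Vec` fallback computes; a small sketch reproducing the assertions of `test_join_row_set_bounds` on a plain `BTreeMap`:

use std::collections::BTreeMap;
use std::ops::Bound;

/// Smallest key satisfying `bound`, like `lower_bound_key`.
fn lower_bound_key(map: &BTreeMap<u32, u64>, bound: Bound<u32>) -> Option<u32> {
    map.range((bound, Bound::Unbounded)).next().map(|(k, _)| *k)
}

/// Largest key satisfying `bound`, like `upper_bound_key`.
fn upper_bound_key(map: &BTreeMap<u32, u64>, bound: Bound<u32>) -> Option<u32> {
    map.range((Bound::Unbounded, bound)).next_back().map(|(k, _)| *k)
}

fn main() {
    let map = BTreeMap::from([(1u32, 10u64), (2, 20), (3, 30)]);
    assert_eq!(lower_bound_key(&map, Bound::Included(2)), Some(2));
    assert_eq!(lower_bound_key(&map, Bound::Excluded(2)), Some(3));
    assert_eq!(upper_bound_key(&map, Bound::Included(2)), Some(2));
    assert_eq!(upper_bound_key(&map, Bound::Excluded(2)), Some(1));
}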
+use risingwave_expr::bail; +use risingwave_pb::plan_common::{AsOfJoinDesc, AsOfJoinInequalityType}; + +use crate::error::StreamResult; + pub mod builder; pub mod hash_join; pub mod join_row_set; @@ -35,6 +40,15 @@ pub mod JoinType { pub const RightAnti: JoinTypePrimitive = 7; } +pub type AsOfJoinTypePrimitive = u8; + +#[allow(non_snake_case, non_upper_case_globals)] +pub mod AsOfJoinType { + use super::AsOfJoinTypePrimitive; + pub const Inner: AsOfJoinTypePrimitive = 0; + pub const LeftOuter: AsOfJoinTypePrimitive = 1; +} + pub type SideTypePrimitive = u8; #[allow(non_snake_case, non_upper_case_globals)] pub mod SideType { @@ -43,6 +57,38 @@ pub mod SideType { pub const Right: SideTypePrimitive = 1; } +pub enum AsOfInequalityType { + Le, + Lt, + Ge, + Gt, +} + +pub struct AsOfDesc { + pub left_idx: usize, + pub right_idx: usize, + pub inequality_type: AsOfInequalityType, +} + +impl AsOfDesc { + pub fn from_protobuf(desc_proto: &AsOfJoinDesc) -> StreamResult { + let typ = match desc_proto.inequality_type() { + AsOfJoinInequalityType::AsOfInequalityTypeLt => AsOfInequalityType::Lt, + AsOfJoinInequalityType::AsOfInequalityTypeLe => AsOfInequalityType::Le, + AsOfJoinInequalityType::AsOfInequalityTypeGt => AsOfInequalityType::Gt, + AsOfJoinInequalityType::AsOfInequalityTypeGe => AsOfInequalityType::Ge, + AsOfJoinInequalityType::AsOfInequalityTypeUnspecified => { + bail!("unspecified AsOf join inequality type") + } + }; + Ok(Self { + left_idx: desc_proto.left_idx as usize, + right_idx: desc_proto.right_idx as usize, + inequality_type: typ, + }) + } +} + pub const fn is_outer_side(join_type: JoinTypePrimitive, side_type: SideTypePrimitive) -> bool { join_type == JoinType::FullOuter || (join_type == JoinType::LeftOuter && side_type == SideType::Left) @@ -106,3 +152,7 @@ pub const fn need_right_degree(join_type: JoinTypePrimitive) -> bool { || join_type == JoinType::RightAnti || join_type == JoinType::RightSemi } + +pub const fn is_as_of_left_outer(join_type: AsOfJoinTypePrimitive) -> bool { + join_type == AsOfJoinType::LeftOuter +} diff --git a/src/stream/src/executor/mod.rs b/src/stream/src/executor/mod.rs index 05688474f5bd9..a053e7dc50213 100644 --- a/src/stream/src/executor/mod.rs +++ b/src/stream/src/executor/mod.rs @@ -57,6 +57,7 @@ pub mod monitor; pub mod agg_common; pub mod aggregation; +pub mod asof_join; mod backfill; mod barrier_recv; mod batch_query; @@ -133,7 +134,7 @@ pub use filter::FilterExecutor; pub use hash_agg::HashAggExecutor; pub use hash_join::*; pub use hop_window::HopWindowExecutor; -pub use join::JoinType; +pub use join::{AsOfDesc, AsOfJoinType, JoinType}; pub use lookup::*; pub use lookup_union::LookupUnionExecutor; pub use merge::MergeExecutor; diff --git a/src/stream/src/executor/mview/materialize.rs b/src/stream/src/executor/mview/materialize.rs index 9762d0ecd91ed..a5dc24d5cd74b 100644 --- a/src/stream/src/executor/mview/materialize.rs +++ b/src/stream/src/executor/mview/materialize.rs @@ -37,6 +37,7 @@ use risingwave_storage::row_serde::value_serde::{ValueRowSerde, ValueRowSerdeNew use crate::cache::ManagedLruCache; use crate::common::metrics::MetricsInfo; use crate::common::table::state_table::{StateTableInner, StateTableOpConsistencyLevel}; +use crate::common::table::test_utils::gen_pbtable; use crate::executor::monitor::MaterializeMetrics; use crate::executor::prelude::*; @@ -364,12 +365,16 @@ impl MaterializeExecutor { Arc::from((0..columns.len()).collect_vec()), Arc::from(columns.clone().into_boxed_slice()), ); - let state_table = 
StateTableInner::new_without_distribution( + let state_table = StateTableInner::from_table_catalog( + &gen_pbtable( + table_id, + columns, + arrange_order_types, + arrange_columns.clone(), + 0, + ), store, - table_id, - columns, - arrange_order_types, - arrange_columns.clone(), + None, ) .await; @@ -2007,12 +2012,16 @@ mod tests { ]; let pk_indices = vec![0]; - let mut table = StateTable::new_without_distribution( + let mut table = StateTable::from_table_catalog( + &gen_pbtable( + TableId::from(1002), + column_descs.clone(), + order_types, + pk_indices, + 0, + ), memory_state_store.clone(), - TableId::from(1002), - column_descs.clone(), - order_types, - pk_indices, + None, ) .await; diff --git a/src/stream/src/executor/mview/test_utils.rs b/src/stream/src/executor/mview/test_utils.rs index d90b32a5add11..c194142c0a49f 100644 --- a/src/stream/src/executor/mview/test_utils.rs +++ b/src/stream/src/executor/mview/test_utils.rs @@ -21,6 +21,7 @@ use risingwave_storage::memory::MemoryStateStore; use risingwave_storage::table::batch_table::storage_table::StorageTable; use crate::common::table::state_table::StateTable; +use crate::common::table::test_utils::gen_pbtable; pub async fn gen_basic_table(row_count: usize) -> StorageTable { let state_store = MemoryStateStore::new(); @@ -33,12 +34,16 @@ pub async fn gen_basic_table(row_count: usize) -> StorageTable ColumnDesc::unnamed(column_ids[2], DataType::Int32), ]; let pk_indices = vec![0_usize, 1_usize]; - let mut state = StateTable::new_without_distribution( + let mut state = StateTable::from_table_catalog( + &gen_pbtable( + TableId::from(0x42), + column_descs.clone(), + order_types, + pk_indices, + 0, + ), state_store.clone(), - TableId::from(0x42), - column_descs.clone(), - order_types, - pk_indices, + None, ) .await; let table = StorageTable::for_test( diff --git a/src/stream/src/executor/now.rs b/src/stream/src/executor/now.rs index 049eee7f8c724..df77c459a8784 100644 --- a/src/stream/src/executor/now.rs +++ b/src/stream/src/executor/now.rs @@ -301,6 +301,7 @@ mod tests { use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use super::*; + use crate::common::table::test_utils::gen_pbtable; use crate::executor::test_utils::StreamExecutorTestExt; #[tokio::test] @@ -638,12 +639,10 @@ mod tests { ) -> (UnboundedSender, BoxedMessageStream) { let table_id = TableId::new(1); let column_descs = vec![ColumnDesc::unnamed(ColumnId::new(0), DataType::Timestamptz)]; - let state_table = StateTable::new_without_distribution( + let state_table = StateTable::from_table_catalog( + &gen_pbtable(table_id, column_descs, vec![], vec![], 0), state_store.clone(), - table_id, - column_descs, - vec![], - vec![], + None, ) .await; diff --git a/src/stream/src/executor/sort.rs b/src/stream/src/executor/sort.rs index c2a01c8915a96..1556ebf419aaf 100644 --- a/src/stream/src/executor/sort.rs +++ b/src/stream/src/executor/sort.rs @@ -149,6 +149,7 @@ mod tests { use risingwave_storage::memory::MemoryStateStore; use super::*; + use crate::common::table::test_utils::gen_pbtable; use crate::executor::test_utils::{MessageSender, MockSource, StreamExecutorTestExt}; async fn create_executor( @@ -170,12 +171,16 @@ mod tests { // note that the sort column is the first table pk column to ensure ordering let table_pk_indices = vec![sort_column_index, 0]; let table_order_types = vec![OrderType::ascending(), OrderType::ascending()]; - let buffer_table = StateTable::new_without_distribution( + let buffer_table = StateTable::from_table_catalog( + &gen_pbtable( + TableId::new(1), + 
table_columns, + table_order_types, + table_pk_indices, + 0, + ), store, - TableId::new(1), - table_columns, - table_order_types, - table_pk_indices, + None, ) .await; diff --git a/src/stream/src/executor/test_utils.rs b/src/stream/src/executor/test_utils.rs index 36534ace12b05..f4e0f40761aab 100644 --- a/src/stream/src/executor/test_utils.rs +++ b/src/stream/src/executor/test_utils.rs @@ -282,6 +282,7 @@ pub mod agg_executor { use risingwave_storage::StateStore; use crate::common::table::state_table::StateTable; + use crate::common::table::test_utils::gen_pbtable; use crate::common::StateTableColumnMapping; use crate::executor::agg_common::{ AggExecutorArgs, HashAggExecutorExtraArgs, SimpleAggExecutorExtraArgs, @@ -362,12 +363,16 @@ pub mod agg_executor { add_column(*idx, input_fields[*idx].data_type(), Some(OrderType::ascending())); } - let state_table = StateTable::new_without_distribution( + let state_table = StateTable::from_table_catalog( + &gen_pbtable( + table_id, + column_descs, + order_types.clone(), + (0..order_types.len()).collect(), + 0, + ), store, - table_id, - column_descs, - order_types.clone(), - (0..order_types.len()).collect(), + None, ).await; AggStateStorage::MaterializedInput { table: state_table, mapping: StateTableColumnMapping::new(upstream_columns, None), order_columns } @@ -418,12 +423,16 @@ pub mod agg_executor { add_column_desc(agg_call.return_type.clone()); }); - StateTable::new_without_distribution_inconsistent_op( + StateTable::from_table_catalog_inconsistent_op( + &gen_pbtable( + table_id, + column_descs, + order_types, + (0..group_key_indices.len()).collect(), + 0, + ), store, - table_id, - column_descs, - order_types, - (0..group_key_indices.len()).collect(), + None, ) .await } @@ -574,6 +583,7 @@ pub mod top_n_executor { use risingwave_storage::memory::MemoryStateStore; use crate::common::table::state_table::StateTable; + use crate::common::table::test_utils::gen_pbtable; pub async fn create_in_memory_state_table( data_types: &[DataType], @@ -600,12 +610,16 @@ pub mod top_n_executor { .enumerate() .map(|(id, data_type)| ColumnDesc::unnamed(ColumnId::new(id as i32), data_type.clone())) .collect_vec(); - StateTable::new_without_distribution( + StateTable::from_table_catalog( + &gen_pbtable( + TableId::new(0), + column_descs, + order_types.to_vec(), + pk_indices.to_vec(), + 0, + ), state_store, - TableId::new(0), - column_descs, - order_types.to_vec(), - pk_indices.to_vec(), + None, ) .await } diff --git a/src/stream/src/from_proto/asof_join.rs b/src/stream/src/from_proto/asof_join.rs new file mode 100644 index 0000000000000..3d74ac884b4f0 --- /dev/null +++ b/src/stream/src/from_proto/asof_join.rs @@ -0,0 +1,192 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use risingwave_common::hash::{HashKey, HashKeyDispatcher}; +use risingwave_common::types::DataType; +use risingwave_pb::plan_common::AsOfJoinType as JoinTypeProto; +use risingwave_pb::stream_plan::AsOfJoinNode; + +use super::*; +use crate::common::table::state_table::StateTable; +use crate::executor::asof_join::*; +use crate::executor::monitor::StreamingMetrics; +use crate::executor::{ActorContextRef, AsOfDesc, AsOfJoinType, JoinType}; +use crate::task::AtomicU64Ref; + +pub struct AsOfJoinExecutorBuilder; + +impl ExecutorBuilder for AsOfJoinExecutorBuilder { + type Node = AsOfJoinNode; + + async fn new_boxed_executor( + params: ExecutorParams, + node: &Self::Node, + store: impl StateStore, + ) -> StreamResult { + // This assert is to make sure AsOf join can use `JoinChunkBuilder` as Hash join. + assert_eq!(AsOfJoinType::Inner, JoinType::Inner); + assert_eq!(AsOfJoinType::LeftOuter, JoinType::LeftOuter); + let vnodes = Arc::new(params.vnode_bitmap.expect("vnodes not set for AsOf join")); + + let [source_l, source_r]: [_; 2] = params.input.try_into().unwrap(); + + let table_l = node.get_left_table()?; + let degree_table_l = node.get_left_degree_table()?; + + let table_r = node.get_right_table()?; + let degree_table_r = node.get_right_degree_table()?; + + let params_l = JoinParams::new( + node.get_left_key() + .iter() + .map(|key| *key as usize) + .collect_vec(), + node.get_left_deduped_input_pk_indices() + .iter() + .map(|key| *key as usize) + .collect_vec(), + ); + let params_r = JoinParams::new( + node.get_right_key() + .iter() + .map(|key| *key as usize) + .collect_vec(), + node.get_right_deduped_input_pk_indices() + .iter() + .map(|key| *key as usize) + .collect_vec(), + ); + let null_safe = node.get_null_safe().to_vec(); + let output_indices = node + .get_output_indices() + .iter() + .map(|&x| x as usize) + .collect_vec(); + + let join_key_data_types = params_l + .join_key_indices + .iter() + .map(|idx| source_l.schema().fields[*idx].data_type()) + .collect_vec(); + + let state_table_l = + StateTable::from_table_catalog(table_l, store.clone(), Some(vnodes.clone())).await; + let degree_state_table_l = + StateTable::from_table_catalog(degree_table_l, store.clone(), Some(vnodes.clone())) + .await; + + let state_table_r = + StateTable::from_table_catalog(table_r, store.clone(), Some(vnodes.clone())).await; + let degree_state_table_r = + StateTable::from_table_catalog(degree_table_r, store, Some(vnodes)).await; + + let join_type_proto = node.get_join_type()?; + let as_of_desc_proto = node.get_asof_desc()?; + let asof_desc = AsOfDesc::from_protobuf(as_of_desc_proto)?; + + let args = AsOfJoinExecutorDispatcherArgs { + ctx: params.actor_context, + info: params.info.clone(), + source_l, + source_r, + params_l, + params_r, + null_safe, + output_indices, + state_table_l, + degree_state_table_l, + state_table_r, + degree_state_table_r, + lru_manager: params.watermark_epoch, + metrics: params.executor_stats, + join_type_proto, + join_key_data_types, + chunk_size: params.env.config().developer.chunk_size, + high_join_amplification_threshold: params + .env + .config() + .developer + .high_join_amplification_threshold, + asof_desc, + }; + + let exec = args.dispatch()?; + Ok((params.info, exec).into()) + } +} + +struct AsOfJoinExecutorDispatcherArgs { + ctx: ActorContextRef, + info: ExecutorInfo, + source_l: Executor, + source_r: Executor, + params_l: JoinParams, + params_r: JoinParams, + null_safe: Vec, + output_indices: Vec, + state_table_l: StateTable, + 
degree_state_table_l: StateTable, + state_table_r: StateTable, + degree_state_table_r: StateTable, + lru_manager: AtomicU64Ref, + metrics: Arc, + join_type_proto: JoinTypeProto, + join_key_data_types: Vec, + chunk_size: usize, + high_join_amplification_threshold: usize, + asof_desc: AsOfDesc, +} + +impl HashKeyDispatcher for AsOfJoinExecutorDispatcherArgs { + type Output = StreamResult>; + + fn dispatch_impl(self) -> Self::Output { + /// This macro helps to fill the const generic type parameter. + macro_rules! build { + ($join_type:ident) => { + Ok(AsOfJoinExecutor::::new( + self.ctx, + self.info, + self.source_l, + self.source_r, + self.params_l, + self.params_r, + self.null_safe, + self.output_indices, + self.state_table_l, + self.degree_state_table_l, + self.state_table_r, + self.degree_state_table_r, + self.lru_manager, + self.metrics, + self.chunk_size, + self.high_join_amplification_threshold, + self.asof_desc, + ) + .boxed()) + }; + } + match self.join_type_proto { + JoinTypeProto::Unspecified => unreachable!(), + JoinTypeProto::Inner => build!(Inner), + JoinTypeProto::LeftOuter => build!(LeftOuter), + } + } + + fn data_types(&self) -> &[DataType] { + &self.join_key_data_types + } +} diff --git a/src/stream/src/from_proto/mod.rs b/src/stream/src/from_proto/mod.rs index 6be25b4bd8046..9a51dd10ddfb7 100644 --- a/src/stream/src/from_proto/mod.rs +++ b/src/stream/src/from_proto/mod.rs @@ -16,6 +16,7 @@ mod agg_common; mod append_only_dedup; +mod asof_join; mod barrier_recv; mod batch_query; mod cdc_filter; diff --git a/src/stream/src/task/barrier_manager.rs b/src/stream/src/task/barrier_manager.rs index 26107c2130586..242f21c17272b 100644 --- a/src/stream/src/task/barrier_manager.rs +++ b/src/stream/src/task/barrier_manager.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
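`dispatch_impl` above turns the runtime join type carried by the plan node into a const generic parameter by expanding one `build!` arm per variant. A condensed sketch of that dispatch pattern with toy types (not the real builder, executor, or protobuf enum):

type AsOfJoinTypePrimitive = u8;

#[allow(non_snake_case, non_upper_case_globals)]
mod AsOfJoinType {
    use super::AsOfJoinTypePrimitive;
    pub const Inner: AsOfJoinTypePrimitive = 0;
    pub const LeftOuter: AsOfJoinTypePrimitive = 1;
}

/// Toy executor whose join type is a const generic, like `AsOfJoinExecutor`.
struct Executor<const T: AsOfJoinTypePrimitive>;

impl<const T: AsOfJoinTypePrimitive> Executor<T> {
    fn describe(&self) -> &'static str {
        if T == AsOfJoinType::LeftOuter {
            "left outer"
        } else {
            "inner"
        }
    }
}

/// Runtime value standing in for the protobuf join type on the plan node.
enum JoinTypeProto {
    Inner,
    LeftOuter,
}

fn dispatch(join_type: JoinTypeProto) -> &'static str {
    // One macro arm per join type, each instantiating the executor with the
    // matching const parameter -- the same shape as `build!` in the builder.
    macro_rules! build {
        ($join_type:ident) => {
            Executor::<{ AsOfJoinType::$join_type }>.describe()
        };
    }
    match join_type {
        JoinTypeProto::Inner => build!(Inner),
        JoinTypeProto::LeftOuter => build!(LeftOuter),
    }
}

fn main() {
    assert_eq!(dispatch(JoinTypeProto::Inner), "inner");
    assert_eq!(dispatch(JoinTypeProto::LeftOuter), "left outer");
}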
-use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap}; use std::fmt::Display; use std::future::pending; use std::sync::Arc; @@ -46,7 +46,6 @@ mod progress; mod tests; pub use progress::CreateMviewProgressReporter; -use risingwave_common::catalog::TableId; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::runtime::BackgroundShutdownRuntime; use risingwave_hummock_sdk::table_stats::to_prost_table_stats_map; @@ -57,7 +56,7 @@ use risingwave_pb::stream_service::streaming_control_stream_response::{ InitResponse, ShutdownResponse, }; use risingwave_pb::stream_service::{ - streaming_control_stream_response, BarrierCompleteResponse, BuildActorInfo, + streaming_control_stream_response, BarrierCompleteResponse, InjectBarrierRequest, StreamingControlStreamRequest, StreamingControlStreamResponse, }; @@ -336,6 +335,7 @@ impl LocalBarrierWorker { LocalActorOperation::NewControlStream { handle, init_request } => { self.control_stream_handle.reset_stream_with_err(Status::internal("control stream has been reset to a new one")); self.reset(HummockVersionId::new(init_request.version_id)).await; + self.state.add_subscriptions(init_request.subscriptions); self.control_stream_handle = handle; self.control_stream_handle.send_response(StreamingControlStreamResponse { response: Some(streaming_control_stream_response::Response::Init(InitResponse {})) @@ -376,17 +376,8 @@ impl LocalBarrierWorker { match request.request.expect("should not be empty") { Request::InjectBarrier(req) => { let barrier = Barrier::from_protobuf(req.get_barrier().unwrap())?; - self.update_actor_info(req.broadcast_info)?; - self.send_barrier( - &barrier, - req.actors_to_build, - req.actor_ids_to_collect.into_iter().collect(), - req.table_ids_to_sync - .into_iter() - .map(TableId::new) - .collect(), - PartialGraphId::new(req.partial_graph_id), - )?; + self.update_actor_info(req.broadcast_info.iter().cloned())?; + self.send_barrier(&barrier, req)?; Ok(()) } Request::RemovePartialGraph(req) => { @@ -536,10 +527,7 @@ impl LocalBarrierWorker { fn send_barrier( &mut self, barrier: &Barrier, - to_build: Vec, - to_collect: HashSet, - table_ids: HashSet, - partial_graph_id: PartialGraphId, + request: InjectBarrierRequest, ) -> StreamResult<()> { if barrier.kind == BarrierKind::Initial { self.actor_manager @@ -550,10 +538,10 @@ impl LocalBarrierWorker { target: "events::stream::barrier::manager::send", "send barrier {:?}, actor_ids_to_collect = {:?}", barrier, - to_collect + request.actor_ids_to_collect ); - for actor_id in &to_collect { + for actor_id in &request.actor_ids_to_collect { if self.failure_actors.contains_key(actor_id) { // The failure actors could exit before the barrier is issued, while their // up-downstream actors could be stuck somehow. 
Return error directly to trigger the @@ -566,13 +554,7 @@ impl LocalBarrierWorker { } } - self.state.transform_to_issued( - barrier, - to_build, - to_collect, - table_ids, - partial_graph_id, - )?; + self.state.transform_to_issued(barrier, request)?; Ok(()) } @@ -931,7 +913,10 @@ pub(crate) mod barrier_test_utils { response_tx, UnboundedReceiverStream::new(request_rx).boxed(), ), - init_request: InitRequest { version_id: 0 }, + init_request: InitRequest { + version_id: 0, + subscriptions: vec![], + }, }); assert_matches!( @@ -968,6 +953,8 @@ pub(crate) mod barrier_test_utils { partial_graph_id: u32::MAX, broadcast_info: vec![], actors_to_build: vec![], + subscriptions_to_add: vec![], + subscriptions_to_remove: vec![], }, )), })) diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index 2206ac420a6ef..4d0e82661fadc 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::assert_matches::assert_matches; +use std::cell::LazyCell; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::{Debug, Display, Formatter}; use std::future::{pending, poll_fn, Future}; @@ -31,7 +32,6 @@ use risingwave_common::must_match; use risingwave_common::util::epoch::EpochPair; use risingwave_hummock_sdk::SyncResult; use risingwave_pb::stream_plan::barrier::BarrierKind; -use risingwave_pb::stream_service::BuildActorInfo; use risingwave_storage::{dispatch_state_store, StateStore, StateStoreImpl}; use thiserror_ext::AsReport; use tokio::sync::mpsc; @@ -141,6 +141,8 @@ mod await_epoch_completed_future { } use await_epoch_completed_future::*; +use risingwave_pb::stream_plan::SubscriptionUpstreamInfo; +use risingwave_pb::stream_service::InjectBarrierRequest; fn sync_epoch( state_store: &S, @@ -423,6 +425,8 @@ pub(crate) struct ManagedBarrierState { pub(super) graph_states: HashMap, + mv_depended_subscriptions: HashMap>, + actor_manager: Arc, current_shared_context: Arc, @@ -437,6 +441,7 @@ impl ManagedBarrierState { Self { actor_states: Default::default(), graph_states: Default::default(), + mv_depended_subscriptions: Default::default(), actor_manager, current_shared_context, } @@ -502,14 +507,65 @@ impl ManagedBarrierState { .register_barrier_sender(tx) } + pub(super) fn add_subscriptions(&mut self, subscriptions: Vec) { + for subscription_to_add in subscriptions { + if !self + .mv_depended_subscriptions + .entry(TableId::new(subscription_to_add.upstream_mv_table_id)) + .or_default() + .insert(subscription_to_add.subscriber_id) + { + if cfg!(debug_assertions) { + panic!("add an existing subscription: {:?}", subscription_to_add); + } + warn!(?subscription_to_add, "add an existing subscription"); + } + } + } + + pub(super) fn remove_subscriptions(&mut self, subscriptions: Vec) { + for subscription_to_remove in subscriptions { + let upstream_table_id = TableId::new(subscription_to_remove.upstream_mv_table_id); + let Some(subscribers) = self.mv_depended_subscriptions.get_mut(&upstream_table_id) + else { + if cfg!(debug_assertions) { + panic!( + "unable to find upstream mv table to remove: {:?}", + subscription_to_remove + ); + } + warn!( + ?subscription_to_remove, + "unable to find upstream mv table to remove" + ); + continue; + }; + if !subscribers.remove(&subscription_to_remove.subscriber_id) { + if cfg!(debug_assertions) { + panic!( + "unable to find subscriber to remove: {:?}", + subscription_to_remove + 
); + } + warn!( + ?subscription_to_remove, + "unable to find subscriber to remove" + ); + } + if subscribers.is_empty() { + self.mv_depended_subscriptions.remove(&upstream_table_id); + } + } + } + pub(super) fn transform_to_issued( &mut self, barrier: &Barrier, - actors_to_build: Vec, - actor_ids_to_collect: HashSet, - table_ids: HashSet, - partial_graph_id: PartialGraphId, + request: InjectBarrierRequest, ) -> StreamResult<()> { + self.add_subscriptions(request.subscriptions_to_add); + self.remove_subscriptions(request.subscriptions_to_remove); + let partial_graph_id = PartialGraphId::new(request.partial_graph_id); let actor_to_stop = barrier.all_stop_actors(); let is_stop_actor = |actor_id| { actor_to_stop @@ -527,17 +583,24 @@ impl ManagedBarrierState { ) }); - graph_state.transform_to_issued(barrier, actor_ids_to_collect.clone(), table_ids); + graph_state.transform_to_issued( + barrier, + request.actor_ids_to_collect.iter().cloned(), + HashSet::from_iter(request.table_ids_to_sync.iter().cloned().map(TableId::new)), + ); let mut new_actors = HashSet::new(); - for actor in actors_to_build { - let actor_id = actor.actor.as_ref().unwrap().actor_id; + let subscriptions = LazyCell::new(|| Arc::new(self.mv_depended_subscriptions.clone())); + for actor in request.actors_to_build { + let actor_id = actor.actor_id; assert!(!is_stop_actor(actor_id)); assert!(new_actors.insert(actor_id)); - assert!(actor_ids_to_collect.contains(&actor_id)); - let (join_handle, monitor_join_handle) = self - .actor_manager - .spawn_actor(actor, self.current_shared_context.clone()); + assert!(request.actor_ids_to_collect.contains(&actor_id)); + let (join_handle, monitor_join_handle) = self.actor_manager.spawn_actor( + actor, + (*subscriptions).clone(), + self.current_shared_context.clone(), + ); assert!(self .actor_states .try_insert( @@ -555,7 +618,7 @@ impl ManagedBarrierState { // Spawn a trivial join handle to be compatible with the unit test if cfg!(test) { - for actor_id in &actor_ids_to_collect { + for actor_id in &request.actor_ids_to_collect { if !self.actor_states.contains_key(actor_id) { let join_handle = self.actor_manager.runtime.spawn(async { pending().await }); assert!(self @@ -578,14 +641,17 @@ impl ManagedBarrierState { // Note: it's important to issue barrier to actor after issuing to graph to ensure that // we call `start_epoch` on the graph before the actors receive the barrier - for actor_id in &actor_ids_to_collect { + for actor_id in &request.actor_ids_to_collect { if new_actors.contains(actor_id) { continue; } self.actor_states .get_mut(actor_id) .unwrap_or_else(|| { - panic!("should exist: {} {:?}", actor_id, actor_ids_to_collect); + panic!( + "should exist: {} {:?}", + actor_id, request.actor_ids_to_collect + ); }) .issue_barrier(partial_graph_id, barrier, is_stop_actor(*actor_id))?; } @@ -770,7 +836,7 @@ impl PartialGraphManagedBarrierState { pub(super) fn transform_to_issued( &mut self, barrier: &Barrier, - actor_ids_to_collect: HashSet, + actor_ids_to_collect: impl IntoIterator, table_ids: HashSet, ) { let timer = self diff --git a/src/stream/src/task/stream_manager.rs b/src/stream/src/task/stream_manager.rs index b853aed3628b9..83a7137d4403b 100644 --- a/src/stream/src/task/stream_manager.rs +++ b/src/stream/src/task/stream_manager.rs @@ -13,7 +13,7 @@ // limitations under the License. 
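// ---------------------------------------------------------------------------
// Editor's aside — not part of the patch above. The managed_state.rs hunks in this
// diff add a per-worker registry of subscriptions keyed by upstream mv table, with
// "panic in debug builds, warn in release" handling for inconsistent updates. The
// sketch below reproduces that bookkeeping pattern in isolation; `SubscriptionRegistry`
// and the plain u32 id aliases are illustrative stand-ins, not names from the
// codebase, and `eprintln!` stands in for the `warn!` macro used in the real code.
use std::collections::{HashMap, HashSet};

type UpstreamTableId = u32;
type SubscriberId = u32;

#[derive(Default)]
struct SubscriptionRegistry {
    // upstream mv table id -> subscribers currently depending on it
    mv_depended_subscriptions: HashMap<UpstreamTableId, HashSet<SubscriberId>>,
}

impl SubscriptionRegistry {
    fn add(&mut self, table: UpstreamTableId, subscriber: SubscriberId) {
        // `insert` returns false if the subscriber was already registered.
        if !self
            .mv_depended_subscriptions
            .entry(table)
            .or_default()
            .insert(subscriber)
        {
            if cfg!(debug_assertions) {
                panic!("add an existing subscription: {table}/{subscriber}");
            }
            eprintln!("warn: add an existing subscription: {table}/{subscriber}");
        }
    }

    fn remove(&mut self, table: UpstreamTableId, subscriber: SubscriberId) {
        let Some(subscribers) = self.mv_depended_subscriptions.get_mut(&table) else {
            if cfg!(debug_assertions) {
                panic!("unable to find upstream mv table to remove: {table}");
            }
            eprintln!("warn: unable to find upstream mv table to remove: {table}");
            return;
        };
        if !subscribers.remove(&subscriber) {
            if cfg!(debug_assertions) {
                panic!("unable to find subscriber to remove: {table}/{subscriber}");
            }
            eprintln!("warn: unable to find subscriber to remove: {table}/{subscriber}");
        }
        // Drop empty entries so later snapshots handed to new actors stay small.
        if subscribers.is_empty() {
            self.mv_depended_subscriptions.remove(&table);
        }
    }
}

fn main() {
    let mut registry = SubscriptionRegistry::default();
    registry.add(1, 100);
    registry.add(1, 101);
    registry.remove(1, 100);
    registry.remove(1, 101); // entry for table 1 is dropped here
}
// ---------------------------------------------------------------------------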
use core::time::Duration; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::atomic::AtomicU64; use std::sync::Arc; @@ -33,10 +33,10 @@ use risingwave_pb::common::ActorInfo; use risingwave_pb::plan_common::StorageTableDesc; use risingwave_pb::stream_plan; use risingwave_pb::stream_plan::stream_node::NodeBody; -use risingwave_pb::stream_plan::{StreamNode, StreamScanNode, StreamScanType}; +use risingwave_pb::stream_plan::{StreamActor, StreamNode, StreamScanNode, StreamScanType}; use risingwave_pb::stream_service::streaming_control_stream_request::InitRequest; use risingwave_pb::stream_service::{ - BuildActorInfo, StreamingControlStreamRequest, StreamingControlStreamResponse, + StreamingControlStreamRequest, StreamingControlStreamResponse, }; use risingwave_storage::monitor::HummockTraceFutureExt; use risingwave_storage::table::batch_table::storage_table::StorageTable; @@ -574,15 +574,11 @@ impl StreamActorManager { async fn create_actor( self: Arc, - actor: BuildActorInfo, + actor: StreamActor, shared_context: Arc, + related_subscriptions: Arc>>, ) -> StreamResult> { { - let BuildActorInfo { - actor, - related_subscriptions, - } = actor; - let actor = actor.unwrap(); let actor_id = actor.actor_id; let streaming_config = self.env.config().clone(); let actor_context = ActorContext::create( @@ -590,15 +586,7 @@ impl StreamActorManager { self.env.total_mem_usage(), self.streaming_metrics.clone(), actor.dispatcher.len(), - related_subscriptions - .into_iter() - .map(|(table_id, subscription_ids)| { - ( - TableId::new(table_id), - HashSet::from_iter(subscription_ids.subscription_ids), - ) - }) - .collect(), + related_subscriptions, self.env.meta_client().clone(), streaming_config, ); @@ -642,19 +630,20 @@ impl StreamActorManager { impl StreamActorManager { pub(super) fn spawn_actor( self: &Arc, - actor: BuildActorInfo, + actor: StreamActor, + related_subscriptions: Arc>>, current_shared_context: Arc, ) -> (JoinHandle<()>, Option>) { { let monitor = tokio_metrics::TaskMonitor::new(); - let stream_actor_ref = actor.actor.as_ref().unwrap(); + let stream_actor_ref = &actor; let actor_id = stream_actor_ref.actor_id; let handle = { let trace_span = format!("Actor {actor_id}: `{}`", stream_actor_ref.mview_definition); let barrier_manager = current_shared_context.local_barrier_manager.clone(); // wrap the future of `create_actor` with `boxed` to avoid stack overflow - let actor = self.clone().create_actor(actor, current_shared_context).boxed().and_then(|actor| actor.run()).map(move |result| { + let actor = self.clone().create_actor(actor, current_shared_context, related_subscriptions).boxed().and_then(|actor| actor.run()).map(move |result| { if let Err(err) = result { // TODO: check error type and panic if it's unexpected. // Intentionally use `?` on the report to also include the backtrace. @@ -732,7 +721,10 @@ impl StreamActorManager { impl LocalBarrierWorker { /// This function could only be called once during the lifecycle of `LocalStreamManager` for /// now. 
- pub fn update_actor_info(&self, new_actor_infos: Vec) -> StreamResult<()> { + pub fn update_actor_info( + &self, + new_actor_infos: impl Iterator, + ) -> StreamResult<()> { let mut actor_infos = self.current_shared_context.actor_infos.write(); for actor in new_actor_infos { if let Some(prev_actor) = actor_infos.get(&actor.get_actor_id()) diff --git a/src/stream/tests/integration_tests/eowc_over_window.rs b/src/stream/tests/integration_tests/eowc_over_window.rs index 8365331ad8876..72c406e798a71 100644 --- a/src/stream/tests/integration_tests/eowc_over_window.rs +++ b/src/stream/tests/integration_tests/eowc_over_window.rs @@ -14,6 +14,7 @@ use risingwave_expr::aggregate::{AggArgs, PbAggKind}; use risingwave_expr::window_function::{Frame, FrameBound, WindowFuncCall, WindowFuncKind}; +use risingwave_stream::common::table::test_utils::gen_pbtable; use risingwave_stream::executor::{EowcOverWindowExecutor, EowcOverWindowExecutorArgs}; use crate::prelude::*; @@ -53,12 +54,16 @@ async fn create_executor( Schema { fields } }; - let state_table = StateTable::new_without_distribution_inconsistent_op( + let state_table = StateTable::from_table_catalog_inconsistent_op( + &gen_pbtable( + TableId::new(1), + table_columns, + table_order_types, + table_pk_indices, + 0, + ), store, - TableId::new(1), - table_columns, - table_order_types, - table_pk_indices, + None, ) .await; diff --git a/src/stream/tests/integration_tests/over_window.rs b/src/stream/tests/integration_tests/over_window.rs index 8f96995a8c0d6..dc6209622af87 100644 --- a/src/stream/tests/integration_tests/over_window.rs +++ b/src/stream/tests/integration_tests/over_window.rs @@ -17,6 +17,7 @@ use risingwave_expr::aggregate::{AggArgs, PbAggKind}; use risingwave_expr::window_function::{ Frame, FrameBound, FrameExclusion, WindowFuncCall, WindowFuncKind, }; +use risingwave_stream::common::table::test_utils::gen_pbtable; use risingwave_stream::executor::monitor::StreamingMetrics; use risingwave_stream::executor::{OverWindowExecutor, OverWindowExecutorArgs}; @@ -64,12 +65,16 @@ async fn create_executor( Schema { fields } }; - let state_table = StateTable::new_without_distribution( + let state_table = StateTable::from_table_catalog( + &gen_pbtable( + TableId::new(1), + table_columns, + table_order_types, + table_pk_indices, + 0, + ), store, - TableId::new(1), - table_columns, - table_order_types, - table_pk_indices, + None, ) .await; diff --git a/src/tests/compaction_test/Cargo.toml b/src/tests/compaction_test/Cargo.toml index 330e8b2a3637f..3bc86649ea3b3 100644 --- a/src/tests/compaction_test/Cargo.toml +++ b/src/tests/compaction_test/Cargo.toml @@ -56,10 +56,5 @@ name = "compaction-test" path = "src/bin/compaction.rs" test = false -[[bin]] -name = "delete-range-test" -path = "src/bin/delete_range.rs" -test = false - [lints] workspace = true diff --git a/src/tests/compaction_test/src/delete_range_runner.rs b/src/tests/compaction_test/src/delete_range_runner.rs deleted file mode 100644 index 13df85bf25d97..0000000000000 --- a/src/tests/compaction_test/src/delete_range_runner.rs +++ /dev/null @@ -1,657 +0,0 @@ -// Copyright 2024 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::future::Future; -use std::ops::{Bound, RangeBounds}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use std::time::{Duration, SystemTime}; - -use bytes::Bytes; -use foyer::{CacheContext, HybridCacheBuilder}; -use rand::rngs::StdRng; -use rand::{RngCore, SeedableRng}; -use risingwave_common::catalog::TableId; -use risingwave_common::config::{ - extract_storage_memory_config, load_config, NoOverride, ObjectStoreConfig, RwConfig, -}; -use risingwave_common::system_param::reader::SystemParamsRead; -use risingwave_common::util::epoch::{test_epoch, EpochExt}; -use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; -use risingwave_hummock_sdk::key::TableKey; -use risingwave_hummock_test::get_notification_client_for_test; -use risingwave_hummock_test::local_state_store_test_utils::LocalStateStoreTestExt; -use risingwave_meta::hummock::compaction::compaction_config::CompactionConfigBuilder; -use risingwave_meta::hummock::test_utils::setup_compute_env_with_config; -use risingwave_meta::hummock::MockHummockMetaClient; -use risingwave_object_store::object::build_remote_object_store; -use risingwave_object_store::object::object_metrics::ObjectStoreMetrics; -use risingwave_pb::catalog::{PbCreateType, PbStreamJobStatus, PbTable}; -use risingwave_pb::hummock::{CompactionConfig, CompactionGroupInfo}; -use risingwave_pb::meta::SystemParams; -use risingwave_rpc_client::HummockMetaClient; -use risingwave_storage::filter_key_extractor::{ - FilterKeyExtractorImpl, FilterKeyExtractorManager, FullKeyFilterKeyExtractor, - RpcFilterKeyExtractorManager, -}; -use risingwave_storage::hummock::compactor::{ - start_compactor, CompactionExecutor, CompactorContext, -}; -use risingwave_storage::hummock::sstable_store::SstableStoreRef; -use risingwave_storage::hummock::utils::cmp_delete_range_left_bounds; -use risingwave_storage::hummock::{ - CachePolicy, HummockStorage, MemoryLimiter, SstableObjectIdManager, SstableStore, - SstableStoreConfig, -}; -use risingwave_storage::monitor::{CompactorMetrics, HummockStateStoreMetrics}; -use risingwave_storage::opts::StorageOpts; -use risingwave_storage::store::{ - LocalStateStore, NewLocalOptions, PrefetchOptions, ReadOptions, SealCurrentEpochOptions, -}; -use risingwave_storage::{StateStore, StateStoreIter}; - -use crate::CompactionTestOpts; -pub fn start_delete_range(opts: CompactionTestOpts) -> Pin + Send>> { - // WARNING: don't change the function signature. Making it `async fn` will cause - // slow compile in release mode. 
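// ---------------------------------------------------------------------------
// Editor's aside on the WARNING directly above (the surrounding file is being
// deleted, so this is context only): the signature it protects is the boxed-future
// pattern — keep the outer fn synchronous and return a Pin<Box<dyn Future>>, so the
// large generated future type is erased instead of flowing through the caller's
// type inference, which is presumably what made release builds slow as an
// `async fn`. A minimal sketch under that assumption, with made-up names
// (`TestOpts`, `run_test`) that are not part of the codebase:
use std::future::Future;
use std::pin::Pin;

struct TestOpts {
    state_store: String,
}

fn run_test(opts: TestOpts) -> Pin<Box<dyn Future<Output = ()> + Send>> {
    // All of the work lives in one type-erased async block.
    Box::pin(async move {
        println!("running against {}", opts.state_store);
    })
}

fn main() {
    // Futures are lazy: a real caller would `.await` this on a runtime.
    let _fut = run_test(TestOpts {
        state_store: "hummock+memory".to_string(),
    });
}
// ---------------------------------------------------------------------------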
- Box::pin(async move { - tracing::info!("Compaction delete-range test start with options {:?}", opts); - let prefix = opts.state_store.strip_prefix("hummock+"); - match prefix { - Some(s) => { - assert!( - s.starts_with("s3://") || s.starts_with("minio://"), - "Only support S3 and MinIO object store" - ); - } - None => { - panic!("Invalid state store"); - } - } - let ret = compaction_test_main(opts).await; - - match ret { - Ok(_) => { - tracing::info!("Compaction delete-range test Success"); - } - Err(e) => { - panic!("Compaction delete-range test Fail: {}", e); - } - } - }) -} -pub async fn compaction_test_main(opts: CompactionTestOpts) -> anyhow::Result<()> { - let config = load_config(&opts.config_path, NoOverride); - let compaction_config = - CompactionConfigBuilder::with_opt(&config.meta.compaction_config).build(); - compaction_test( - compaction_config, - config, - &opts.state_store, - 1000000, - 800, - 1, - ) - .await -} - -async fn compaction_test( - compaction_config: CompactionConfig, - config: RwConfig, - state_store_type: &str, - test_range: u64, - test_count: u64, - test_delete_ratio: u32, -) -> anyhow::Result<()> { - let (env, hummock_manager_ref, _cluster_manager_ref, worker_node) = - setup_compute_env_with_config(8080, compaction_config.clone()).await; - let meta_client = Arc::new(MockHummockMetaClient::new( - hummock_manager_ref.clone(), - worker_node.id, - )); - - let delete_key_table = PbTable { - id: 1, - schema_id: 1, - database_id: 1, - name: "delete-key-table".to_string(), - columns: vec![], - pk: vec![], - dependent_relations: vec![], - distribution_key: vec![], - stream_key: vec![], - owner: 0, - retention_seconds: None, - fragment_id: 0, - dml_fragment_id: None, - initialized_at_epoch: None, - vnode_col_index: None, - value_indices: vec![], - definition: "".to_string(), - handle_pk_conflict_behavior: 0, - version_column_index: None, - read_prefix_len_hint: 0, - optional_associated_source_id: None, - table_type: 0, - append_only: false, - row_id_index: None, - version: None, - watermark_indices: vec![], - dist_key_in_pk: vec![], - cardinality: None, - created_at_epoch: None, - cleaned_by_watermark: false, - stream_job_status: PbStreamJobStatus::Created.into(), - create_type: PbCreateType::Foreground.into(), - description: None, - incoming_sinks: vec![], - initialized_at_cluster_version: None, - created_at_cluster_version: None, - cdc_table_id: None, - }; - let mut delete_range_table = delete_key_table.clone(); - delete_range_table.id = 2; - delete_range_table.name = "delete-range-table".to_string(); - let group1 = CompactionGroupInfo { - id: StaticCompactionGroupId::StateDefault as _, - parent_id: 0, - member_table_ids: vec![1], - compaction_config: Some(compaction_config.clone()), - }; - let group2 = CompactionGroupInfo { - id: StaticCompactionGroupId::MaterializedView as _, - parent_id: 0, - member_table_ids: vec![2], - compaction_config: Some(compaction_config.clone()), - }; - hummock_manager_ref - .init_metadata_for_version_replay( - vec![delete_key_table, delete_range_table], - vec![group1, group2], - ) - .await?; - - let system_params = SystemParams { - sstable_size_mb: Some(128), - parallel_compact_size_mb: Some(512), - block_size_kb: Some(1024), - bloom_false_positive: Some(0.001), - data_directory: Some("hummock_001".to_string()), - backup_storage_url: Some("memory".to_string()), - backup_storage_directory: Some("backup".to_string()), - ..Default::default() - } - .into(); - let storage_memory_config = extract_storage_memory_config(&config); - let 
storage_opts = Arc::new(StorageOpts::from(( - &config, - &system_params, - &storage_memory_config, - ))); - let state_store_metrics = Arc::new(HummockStateStoreMetrics::unused()); - let compactor_metrics = Arc::new(CompactorMetrics::unused()); - let object_store_metrics = Arc::new(ObjectStoreMetrics::unused()); - let remote_object_store = build_remote_object_store( - state_store_type.strip_prefix("hummock+").unwrap(), - object_store_metrics.clone(), - "Hummock", - Arc::new(ObjectStoreConfig::default()), - ) - .await; - let meta_cache = HybridCacheBuilder::new() - .memory(storage_memory_config.meta_cache_capacity_mb * (1 << 20)) - .with_shards(storage_memory_config.meta_cache_shard_num) - .storage() - .build() - .await?; - let block_cache = HybridCacheBuilder::new() - .memory(storage_memory_config.block_cache_capacity_mb * (1 << 20)) - .with_shards(storage_memory_config.block_cache_shard_num) - .storage() - .build() - .await?; - let sstable_store = Arc::new(SstableStore::new(SstableStoreConfig { - store: Arc::new(remote_object_store), - path: system_params.data_directory().to_string(), - prefetch_buffer_capacity: storage_memory_config.prefetch_buffer_capacity_mb * (1 << 20), - max_prefetch_block_number: storage_opts.max_prefetch_block_number, - recent_filter: None, - state_store_metrics: state_store_metrics.clone(), - use_new_object_prefix_strategy: system_params.use_new_object_prefix_strategy(), - meta_cache, - block_cache, - })); - - let store = HummockStorage::new( - storage_opts.clone(), - sstable_store.clone(), - meta_client.clone(), - get_notification_client_for_test(env, hummock_manager_ref.clone(), worker_node), - Arc::new(RpcFilterKeyExtractorManager::default()), - state_store_metrics.clone(), - compactor_metrics.clone(), - None, - ) - .await?; - let sstable_object_id_manager = store.sstable_object_id_manager().clone(); - let filter_key_extractor_manager = match store.filter_key_extractor_manager().clone() { - FilterKeyExtractorManager::RpcFilterKeyExtractorManager( - rpc_filter_key_extractor_manager, - ) => rpc_filter_key_extractor_manager, - FilterKeyExtractorManager::StaticFilterKeyExtractorManager(_) => unreachable!(), - }; - - filter_key_extractor_manager.update( - 1, - Arc::new(FilterKeyExtractorImpl::FullKey( - FullKeyFilterKeyExtractor {}, - )), - ); - filter_key_extractor_manager.update( - 2, - Arc::new(FilterKeyExtractorImpl::FullKey( - FullKeyFilterKeyExtractor {}, - )), - ); - - let (compactor_thrd, compactor_shutdown_tx) = run_compactor_thread( - storage_opts, - sstable_store, - meta_client.clone(), - filter_key_extractor_manager, - sstable_object_id_manager, - compactor_metrics, - ); - run_compare_result( - &store, - meta_client.clone(), - test_range, - test_count, - test_delete_ratio, - ) - .await - .unwrap(); - let version = store.get_pinned_version().version().clone(); - let remote_version = meta_client.get_current_version().await.unwrap(); - println!( - "version-{}, remote version-{}", - version.id, remote_version.id - ); - for (group, levels) in &version.levels { - let l0 = &levels.l0; - println!( - "group-{}: l0 sz: {}, count: {}", - group, - l0.total_file_size, - l0.sub_levels - .iter() - .map(|level| level.table_infos.len()) - .sum::() - ); - } - - compactor_shutdown_tx.send(()).unwrap(); - compactor_thrd.await.unwrap(); - Ok(()) -} - -async fn run_compare_result( - hummock: &HummockStorage, - meta_client: Arc, - test_range: u64, - test_count: u64, - test_delete_ratio: u32, -) -> Result<(), String> { - let init_epoch = 
test_epoch(hummock.get_pinned_version().max_committed_epoch() + 1); - - let mut normal = NormalState::new(hummock, 1, init_epoch).await; - let mut delete_range = DeleteRangeState::new(hummock, 2, init_epoch).await; - const RANGE_BASE: u64 = 4000; - let range_mod = test_range / RANGE_BASE; - - let seed = SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs(); - println!("========== run with seed: {}", seed); - let mut rng = StdRng::seed_from_u64(seed); - let mut overlap_ranges = vec![]; - for epoch_idx in 0..test_count { - let epoch = test_epoch(init_epoch / test_epoch(1) + epoch_idx); - for idx in 0..1000 { - let op = rng.next_u32() % 50; - let key_number = rng.next_u64() % test_range; - if op < test_delete_ratio { - let end_key = key_number + (rng.next_u64() % range_mod) + 1; - overlap_ranges.push((key_number, end_key, epoch, idx)); - let start_key = format!("\0\0{:010}", key_number); - let end_key = format!("\0\0{:010}", end_key); - normal - .delete_range(start_key.as_bytes(), end_key.as_bytes()) - .await; - delete_range - .delete_range(start_key.as_bytes(), end_key.as_bytes()) - .await; - } else if op < test_delete_ratio + 5 { - let key = format!("\0\0{:010}", key_number); - let a = normal.get(key.as_bytes()).await; - let b = delete_range.get(key.as_bytes()).await; - assert!( - a.eq(&b), - "query {} {:?} vs {:?} in epoch-{}", - key_number, - a.map(|raw| String::from_utf8(raw.to_vec()).unwrap()), - b.map(|raw| String::from_utf8(raw.to_vec()).unwrap()), - epoch, - ); - } else if op < test_delete_ratio + 10 { - let end_key = key_number + (rng.next_u64() % range_mod) + 1; - let start_key = format!("\0\0{:010}", key_number); - let end_key = format!("\0\0{:010}", end_key); - let ret1 = normal.scan(start_key.as_bytes(), end_key.as_bytes()).await; - let ret2 = delete_range - .scan(start_key.as_bytes(), end_key.as_bytes()) - .await; - assert_eq!(ret1, ret2); - } else { - let overlap = overlap_ranges - .iter() - .any(|(left, right, _, _)| *left <= key_number && key_number < *right); - if overlap { - continue; - } - let key = format!("\0\0{:010}", key_number); - let val = format!("val-{:010}-{:016}-{:016}", idx, key_number, epoch); - normal.insert(key.as_bytes(), val.as_bytes()); - delete_range.insert(key.as_bytes(), val.as_bytes()); - } - } - let next_epoch = epoch.next_epoch(); - normal.commit(next_epoch).await?; - delete_range.commit(next_epoch).await?; - // let checkpoint = epoch % 10 == 0; - let ret = hummock.seal_and_sync_epoch(epoch).await.unwrap(); - meta_client - .commit_epoch(epoch, ret) - .await - .map_err(|e| format!("{:?}", e))?; - if (epoch / test_epoch(1)) % 200 == 0 { - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - Ok(()) -} - -struct NormalState { - storage: ::Local, - table_id: TableId, -} - -struct DeleteRangeState { - inner: NormalState, - delete_ranges: Vec<(Bound, Bound)>, -} - -impl DeleteRangeState { - async fn new(hummock: &HummockStorage, table_id: u32, epoch: u64) -> Self { - Self { - inner: NormalState::new(hummock, table_id, epoch).await, - delete_ranges: vec![], - } - } -} - -#[async_trait::async_trait] -trait CheckState { - async fn delete_range(&mut self, left: &[u8], right: &[u8]); - async fn get(&self, key: &[u8]) -> Option; - async fn scan(&self, left: &[u8], right: &[u8]) -> Vec<(Bytes, Bytes)>; - fn insert(&mut self, key: &[u8], val: &[u8]); - async fn commit(&mut self, epoch: u64) -> Result<(), String>; -} - -impl NormalState { - async fn new(hummock: &HummockStorage, table_id: u32, epoch: u64) -> Self { - let 
table_id = TableId::new(table_id); - let mut storage = hummock.new_local(NewLocalOptions::for_test(table_id)).await; - storage.init_for_test(epoch).await.unwrap(); - Self { storage, table_id } - } - - async fn commit_impl( - &mut self, - _delete_ranges: Vec<(Bound, Bound)>, - next_epoch: u64, - ) -> Result<(), String> { - // self.storage - // .flush(delete_ranges) - // .await - // .map_err(|e| format!("{:?}", e))?; - self.storage.flush().await.map_err(|e| format!("{:?}", e))?; - self.storage - .seal_current_epoch(next_epoch, SealCurrentEpochOptions::for_test()); - Ok(()) - } - - async fn get_impl(&self, key: &[u8], ignore_range_tombstone: bool) -> Option { - self.storage - .get( - TableKey(Bytes::copy_from_slice(key)), - ReadOptions { - ignore_range_tombstone, - table_id: self.table_id, - cache_policy: CachePolicy::Fill(CacheContext::Default), - ..Default::default() - }, - ) - .await - .unwrap() - } - - async fn scan_impl( - &self, - left: &[u8], - right: &[u8], - ignore_range_tombstone: bool, - ) -> Vec<(Bytes, Bytes)> { - let mut iter = pin!(self - .storage - .iter( - ( - Bound::Included(TableKey(Bytes::copy_from_slice(left))), - Bound::Excluded(TableKey(Bytes::copy_from_slice(right))), - ), - ReadOptions { - ignore_range_tombstone, - table_id: self.table_id, - read_version_from_backup: false, - prefetch_options: PrefetchOptions::default(), - cache_policy: CachePolicy::Fill(CacheContext::Default), - ..Default::default() - }, - ) - .await - .unwrap(),); - let mut ret = vec![]; - while let Some(item) = iter.try_next().await.unwrap() { - let (full_key, val) = item; - let tkey = Bytes::copy_from_slice(full_key.user_key.table_key.0); - ret.push((tkey, Bytes::copy_from_slice(val))); - } - ret - } -} - -#[async_trait::async_trait] -impl CheckState for NormalState { - async fn delete_range(&mut self, left: &[u8], right: &[u8]) { - let mut iter = self - .storage - .iter( - ( - Bound::Included(Bytes::copy_from_slice(left)).map(TableKey), - Bound::Excluded(Bytes::copy_from_slice(right)).map(TableKey), - ), - ReadOptions { - ignore_range_tombstone: true, - table_id: self.table_id, - read_version_from_backup: false, - prefetch_options: PrefetchOptions::default(), - cache_policy: CachePolicy::Fill(CacheContext::Default), - ..Default::default() - }, - ) - .await - .unwrap(); - let mut delete_item = Vec::new(); - while let Some(item) = iter.try_next().await.unwrap() { - let (full_key, value) = item; - delete_item.push(( - full_key.user_key.table_key.copy_into(), - Bytes::copy_from_slice(value), - )); - } - drop(iter); - for (key, value) in delete_item { - self.storage.delete(key, value).unwrap(); - } - } - - fn insert(&mut self, key: &[u8], val: &[u8]) { - self.storage - .insert( - TableKey(Bytes::from(key.to_vec())), - Bytes::copy_from_slice(val), - None, - ) - .unwrap(); - } - - async fn get(&self, key: &[u8]) -> Option { - self.get_impl(key, true).await - } - - async fn scan(&self, left: &[u8], right: &[u8]) -> Vec<(Bytes, Bytes)> { - self.scan_impl(left, right, true).await - } - - async fn commit(&mut self, next_epoch: u64) -> Result<(), String> { - self.commit_impl(vec![], next_epoch).await - } -} - -#[async_trait::async_trait] -impl CheckState for DeleteRangeState { - async fn delete_range(&mut self, left: &[u8], right: &[u8]) { - self.delete_ranges.push(( - Bound::Included(Bytes::copy_from_slice(left)), - Bound::Excluded(Bytes::copy_from_slice(right)), - )); - } - - async fn get(&self, key: &[u8]) -> Option { - for delete_range in &self.delete_ranges { - if delete_range.contains(key) { - return 
None; - } - } - self.inner.get_impl(key, false).await - } - - async fn scan(&self, left: &[u8], right: &[u8]) -> Vec<(Bytes, Bytes)> { - let mut ret = self.inner.scan_impl(left, right, false).await; - ret.retain(|(key, _)| { - for delete_range in &self.delete_ranges { - if delete_range.contains(key) { - return false; - } - } - true - }); - ret - } - - fn insert(&mut self, key: &[u8], val: &[u8]) { - self.inner.insert(key, val); - } - - async fn commit(&mut self, next_epoch: u64) -> Result<(), String> { - let mut delete_ranges = std::mem::take(&mut self.delete_ranges); - delete_ranges.sort_by(|a, b| cmp_delete_range_left_bounds(a.0.as_ref(), b.0.as_ref())); - self.inner.commit_impl(delete_ranges, next_epoch).await - } -} - -fn run_compactor_thread( - storage_opts: Arc, - sstable_store: SstableStoreRef, - meta_client: Arc, - filter_key_extractor_manager: Arc, - sstable_object_id_manager: Arc, - compactor_metrics: Arc, -) -> ( - tokio::task::JoinHandle<()>, - tokio::sync::oneshot::Sender<()>, -) { - let filter_key_extractor_manager = - FilterKeyExtractorManager::RpcFilterKeyExtractorManager(filter_key_extractor_manager); - let compactor_context = CompactorContext { - storage_opts, - sstable_store, - compactor_metrics, - is_share_buffer_compact: false, - compaction_executor: Arc::new(CompactionExecutor::new(None)), - memory_limiter: MemoryLimiter::unlimit(), - task_progress_manager: Default::default(), - await_tree_reg: None, - }; - - start_compactor( - compactor_context, - meta_client, - sstable_object_id_manager, - filter_key_extractor_manager, - ) -} - -#[cfg(test)] -mod tests { - - use risingwave_common::config::RwConfig; - use risingwave_meta::hummock::compaction::compaction_config::CompactionConfigBuilder; - - use super::compaction_test; - - #[ignore] - // TODO: may modify the test to use per vnode table watermark - #[tokio::test(flavor = "multi_thread", worker_threads = 3)] - async fn test_small_data() { - let config = RwConfig::default(); - let mut compaction_config = CompactionConfigBuilder::new().build(); - compaction_config.max_sub_compaction = 1; - compaction_config.level0_tier_compact_file_number = 2; - compaction_config.max_bytes_for_level_base = 512 * 1024; - compaction_config.sub_level_max_compaction_bytes = 256 * 1024; - compaction_test( - compaction_config.clone(), - config.clone(), - "hummock+memory", - 1000000, - 60, - 10, - ) - .await - .unwrap(); - } -} diff --git a/src/tests/compaction_test/src/lib.rs b/src/tests/compaction_test/src/lib.rs index e5fd10b10b176..70f6e20b62adc 100644 --- a/src/tests/compaction_test/src/lib.rs +++ b/src/tests/compaction_test/src/lib.rs @@ -27,10 +27,8 @@ #![allow(rw::format_error)] // test code mod compaction_test_runner; -mod delete_range_runner; use clap::Parser; -pub use delete_range_runner::start_delete_range; use crate::compaction_test_runner::compaction_test_main; diff --git a/src/tests/simulation/src/slt.rs b/src/tests/simulation/src/slt.rs index 799602a00aa3f..7ac5a7b27d70b 100644 --- a/src/tests/simulation/src/slt.rs +++ b/src/tests/simulation/src/slt.rs @@ -497,8 +497,6 @@ fn hack_kafka_test(path: &Path) -> tempfile::NamedTempFile { let complex_avsc_full_path = std::fs::canonicalize("src/connector/src/test_data/complex-schema.avsc") .expect("failed to get schema path"); - let proto_full_path = std::fs::canonicalize("src/connector/src/test_data/complex-schema") - .expect("failed to get schema path"); let json_schema_full_path = std::fs::canonicalize("src/connector/src/test_data/complex-schema.json") .expect("failed to get schema 
path"); @@ -513,10 +511,6 @@ fn hack_kafka_test(path: &Path) -> tempfile::NamedTempFile { "/risingwave/avro-complex-schema.avsc", complex_avsc_full_path.to_str().unwrap(), ) - .replace( - "/risingwave/proto-complex-schema", - proto_full_path.to_str().unwrap(), - ) .replace( "/risingwave/json-complex-schema", json_schema_full_path.to_str().unwrap(),
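// ---------------------------------------------------------------------------
// Editor's aside on the managed_state.rs / stream_manager.rs hunks earlier in this
// diff: `transform_to_issued` now snapshots `mv_depended_subscriptions` behind a
// `std::cell::LazyCell<Arc<_>>`, so the map is cloned at most once per barrier and
// only when the request actually carries actors to build; every spawned actor then
// shares the same immutable `Arc` snapshot. The sketch below shows just that
// pattern; `spawn`, `build_actors`, and the u32 ids are illustrative names, not the
// real APIs.
use std::cell::LazyCell;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;

type Subscriptions = HashMap<u32, HashSet<u32>>;

fn spawn(actor_id: u32, subscriptions: Arc<Subscriptions>) {
    println!(
        "actor {actor_id} sees {} upstream table(s)",
        subscriptions.len()
    );
}

fn build_actors(actors_to_build: &[u32], current: &Subscriptions) {
    // The closure runs on first deref only; if there is nothing to build, no clone
    // of the (potentially large) map ever happens.
    let snapshot = LazyCell::new(|| Arc::new(current.clone()));
    for &actor_id in actors_to_build {
        // Cloning the `Arc` is cheap; the underlying map is shared, not re-cloned.
        spawn(actor_id, (*snapshot).clone());
    }
}

fn main() {
    let mut current: Subscriptions = HashMap::new();
    current.entry(1).or_default().insert(100);
    build_actors(&[7, 8], &current); // one clone of `current`, two Arc handles
    build_actors(&[], &current); // no clone at all
}
// ---------------------------------------------------------------------------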