diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 0523e534c5d8..3ee3bea8bbab 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -39,6 +39,7 @@ updates:
     google-cloud:
       patterns:
         - "google-cloud*"
+
 # Don't update these directories
 - package-ecosystem: cargo
   directory: /integration_tests/feature-store
@@ -46,3 +47,21 @@ updates:
     interval: "daily"
   ignore:
     - dependency-name: "*"
+
+- package-ecosystem: maven
+  directory: /java
+  schedule:
+    interval: "weekly"
+  open-pull-requests-limit: 5
+  # Disable auto rebase to reduce cost. Use `@dependabot rebase` manually instead.
+  rebase-strategy: "disabled"
+  ignore:
+    # Do not bump Debezium because we have hacked its source code e.g. #18760
+    - dependency-name: "io.debezium:*"
+      update-types:
+        ["version-update:semver-minor", "version-update:semver-major"]
+  groups:
+    # Group all dependencies together because Java libraries are quite stable
+    all:
+      patterns:
+        - "*"
diff --git a/.github/label-commenter-config.yml b/.github/label-commenter-config.yml
new file mode 100644
index 000000000000..6b887f27429a
--- /dev/null
+++ b/.github/label-commenter-config.yml
@@ -0,0 +1,22 @@
+comment:
+  header: Hi, there.
+  footer: "\
+    ---\n\n\
+    > This is an automated comment created by the [peaceiris/actions-label-commenter]. \
+    Responding to the bot or mentioning it won't have any effect.\n\n\
+    [peaceiris/actions-label-commenter]: https://github.com/peaceiris/actions-label-commenter"
+
+labels:
+  - name: 'user-facing-changes'
+    labeled:
+      pr:
+        body: |
+          :memo: **Telemetry Reminder**:
+          If you're implementing this feature, please consider adding telemetry metrics to track its usage. This helps us understand how the feature is being used and improve it further.
+          You can find the `report_event` function for telemetry reporting in the following files. Feel free to ask questions if you need any guidance!
+          * `src/frontend/src/telemetry.rs`
+          * `src/meta/src/telemetry.rs`
+          * `src/stream/src/telemetry.rs`
+          * `src/storage/compactor/src/telemetry.rs`
+          Or call `report_event_common` (`src/common/telemetry_event/src/lib.rs`) if you find it hard to implement there.
+          :sparkles: Thank you for your contribution to RisingWave!
:sparkles: diff --git a/.github/workflows/label-triggered.yml b/.github/workflows/label-triggered.yml new file mode 100644 index 000000000000..83a26251e78e --- /dev/null +++ b/.github/workflows/label-triggered.yml @@ -0,0 +1,23 @@ +name: Label Triggered Comment + +on: + issues: + types: [labeled, unlabeled] + pull_request: + types: [labeled, unlabeled] + +permissions: + contents: read + issues: write + pull-requests: write + +jobs: + comment: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - name: Label Commenter + uses: peaceiris/actions-label-commenter@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + config_file: .github/label-commenter-config.yml diff --git a/Cargo.lock b/Cargo.lock index 259c225fed3f..c8be3e154ddd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -224,7 +224,7 @@ dependencies = [ "snap", "strum 0.25.0", "strum_macros 0.25.3", - "thiserror", + "thiserror 1.0.63", "typed-builder 0.16.2", "uuid", "xz2", @@ -250,7 +250,7 @@ dependencies = [ "serde_json", "strum 0.26.3", "strum_macros 0.26.4", - "thiserror", + "thiserror 1.0.63", "typed-builder 0.19.1", "uuid", ] @@ -273,7 +273,7 @@ dependencies = [ "serde_json", "strum 0.26.3", "strum_macros 0.26.4", - "thiserror", + "thiserror 1.0.63", "typed-builder 0.18.2", "uuid", ] @@ -287,7 +287,7 @@ dependencies = [ "proc-macro2", "quote", "serde_json", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -906,7 +906,7 @@ dependencies = [ "arrow-schema 53.2.0", "arrow-select 53.2.0", "futures-util", - "thiserror", + "thiserror 1.0.63", "tokio", "tonic", "tracing", @@ -1079,9 +1079,9 @@ dependencies = [ [[package]] name = "async-nats" -version = "0.37.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3bdd6ea595b2ea504500a3566071beb81125fc15d40a6f6bffa43575f64152" +checksum = "76433c4de73442daedb3a59e991d94e85c14ebfc33db53dfcd347a21cd6ef4f8" dependencies = [ "base64 0.22.0", "bytes", @@ -1090,6 +1090,7 @@ dependencies = [ "nkeys", "nuid", "once_cell", + "pin-project", "portable-atomic", "rand", "regex", @@ -1101,11 +1102,12 @@ dependencies = [ "serde_json", "serde_nanos", "serde_repr", - "thiserror", + "thiserror 1.0.63", "time", "tokio", "tokio-rustls 0.26.0", "tokio-util", + "tokio-websockets", "tracing", "tryhard", "url", @@ -1119,7 +1121,7 @@ checksum = "30c5ef0ede93efbf733c1a727f3b6b5a1060bbedd5600183e66f6e4be4af0ec5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -1168,7 +1170,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -1185,7 +1187,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -1240,7 +1242,7 @@ dependencies = [ "derive_utils", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -1251,7 +1253,7 @@ checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -1376,7 +1378,7 @@ dependencies = [ "base64 0.22.0", "chrono", "futures", - "thiserror", + "thiserror 1.0.63", "url", ] @@ -1994,7 +1996,7 @@ dependencies = [ "byteorder", "libc", "socket2 0.4.9", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -2087,7 +2089,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.79", + "syn 2.0.87", "which", ] @@ -2222,7 +2224,7 @@ 
dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "syn_derive", ] @@ -2511,7 +2513,7 @@ checksum = "bc7cb2538d4ecc42b6c3b57a83094d8c69894e74468d18cd045a09fdea807358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -2660,7 +2662,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -2685,7 +2687,7 @@ dependencies = [ "sealed 0.4.0", "serde", "static_assertions", - "thiserror", + "thiserror 1.0.63", "time", "tokio", "url", @@ -2857,7 +2859,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -3115,7 +3117,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -3366,7 +3368,7 @@ checksum = "83fdaf97f4804dcebfa5862639bc9ce4121e82140bec2a987ac5140294865b5b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -3438,7 +3440,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -3471,7 +3473,7 @@ checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" dependencies = [ "darling_core 0.20.9", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -3960,7 +3962,7 @@ dependencies = [ "serde", "serde_json", "strum 0.26.3", - "thiserror", + "thiserror 1.0.63", "tracing", "url", "uuid", @@ -3976,7 +3978,7 @@ checksum = "ec5c4fb5b59b1bd55ed8ebcf941f27a327d600c19a4a4103546846c358be93ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4011,7 +4013,7 @@ dependencies = [ "maplit", "object_store", "regex", - "thiserror", + "thiserror 1.0.63", "tokio", "tracing", "url", @@ -4075,7 +4077,7 @@ dependencies = [ "serde", "serde_json", "sqlparser 0.51.0", - "thiserror", + "thiserror 1.0.63", "tokio", "tracing", "url", @@ -4097,7 +4099,7 @@ dependencies = [ "lazy_static", "object_store", "regex", - "thiserror", + "thiserror 1.0.63", "tokio", "tracing", "url", @@ -4184,7 +4186,7 @@ dependencies = [ "darling 0.20.9", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4204,7 +4206,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core 0.20.0", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4228,7 +4230,7 @@ checksum = "61bb5a1014ce6dfc2a378578509abe775a5aa06bff584a547555d9efdb81b926" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4240,7 +4242,7 @@ dependencies = [ "console", "shell-words", "tempfile", - "thiserror", + "thiserror 1.0.63", "zeroize", ] @@ -4386,7 +4388,7 @@ dependencies = [ "chrono", "rust_decimal", "serde", - "thiserror", + "thiserror 1.0.63", "time", "winnow 0.6.11", ] @@ -4463,7 +4465,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4585,7 +4587,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4625,7 +4627,7 @@ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4645,7 +4647,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] 
[[package]] @@ -4665,7 +4667,7 @@ checksum = "f95e2801cd355d4a1a3e3953ce6ee5ae9603a5c833455343a8bfe3f44d418246" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -4830,7 +4832,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -5170,7 +5172,7 @@ dependencies = [ "pin-project", "rand", "serde", - "thiserror", + "thiserror 1.0.63", "tracing", "twox-hash", "zstd 0.13.0", @@ -5207,7 +5209,7 @@ checksum = "b0fa992f1656e1707946bbba340ad244f0814009ef8c0118eb7b658395f19a2e" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -5219,7 +5221,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -5231,16 +5233,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.79", -] - -[[package]] -name = "fs-err" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88a41f105fe1d5b6b34b2055e3dc59bb79b46b48b2040b9e6c7b4b5de097aa41" -dependencies = [ - "autocfg", + "syn 2.0.87", ] [[package]] @@ -5353,7 +5346,7 @@ checksum = "5ac45ed0bddbd110eb68862768a194f88700f5b91c39931d2f432fab67a16d08" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -5421,7 +5414,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -5508,7 +5501,7 @@ dependencies = [ "reqwest 0.11.20", "serde", "serde_json", - "thiserror", + "thiserror 1.0.63", "time", "tokio", "tokio-stream 0.1.16", @@ -5599,7 +5592,7 @@ dependencies = [ "anyhow", "async-trait", "http 1.1.0", - "thiserror", + "thiserror 1.0.63", "tokio", "tonic", "tower 0.4.13", @@ -5640,7 +5633,7 @@ dependencies = [ "reqwest 0.12.4", "serde", "serde_json", - "thiserror", + "thiserror 1.0.63", "time", "tokio", "tracing", @@ -5668,7 +5661,7 @@ dependencies = [ "reqwest-middleware", "serde", "serde_json", - "thiserror", + "thiserror 1.0.63", "time", "tokio", "tracing", @@ -5682,7 +5675,7 @@ checksum = "9c3eaaad103912825594d674a4b1e556ccbb05a13a6cac17dcfd871997fb760a" dependencies = [ "google-cloud-token", "http 1.1.0", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-retry", "tonic", @@ -5708,7 +5701,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04f945a208886a13d07636f38fb978da371d0abc3e34bad338124b9f8c135a8f" dependencies = [ "reqwest 0.12.4", - "thiserror", + "thiserror 1.0.63", "tokio", ] @@ -5725,7 +5718,7 @@ dependencies = [ "google-cloud-googleapis", "google-cloud-token", "prost-types 0.13.1", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-util", "tracing", @@ -6336,7 +6329,7 @@ dependencies = [ [[package]] name = "icelake" version = "0.3.141592654" -source = "git+https://github.com/risingwavelabs/icelake.git?rev=1783f8f106958d6d0ce0249c1c708934a15c2a47#1783f8f106958d6d0ce0249c1c708934a15c2a47" +source = "git+https://github.com/risingwavelabs/icelake.git?rev=0ec44fa826c91139c9cf459b005741df990ae9da#0ec44fa826c91139c9cf459b005741df990ae9da" dependencies = [ "anyhow", "apache-avro 0.17.0 (git+https://github.com/apache/avro.git)", @@ -6496,7 +6489,7 @@ checksum = "ce243b1bfa62ffc028f1cc3b6034ec63d649f3031bc8a4fbbb004e1ac17d1f68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -6711,7 +6704,7 @@ dependencies = [ "jni-sys", "libloading", "log", - "thiserror", + 
"thiserror 1.0.63", "walkdir", "windows-sys 0.45.0", ] @@ -7067,18 +7060,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "libtest-mimic" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc0bda45ed5b3a2904262c1bb91e526127aa70e7ef3758aba2ef93cf896b9b58" -dependencies = [ - "clap", - "escape8259", - "termcolor", - "threadpool", -] - [[package]] name = "libtest-mimic" version = "0.8.1" @@ -7126,7 +7107,7 @@ checksum = "04e542a18c94a9b6fcc7adb090fa3ba6b79ee220a16404f325672729f32a66ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -7188,7 +7169,7 @@ dependencies = [ "proc-macro2", "quote", "regex-syntax 0.8.5", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -7354,7 +7335,7 @@ dependencies = [ "serde_json", "slab", "spin 0.9.8", - "thiserror", + "thiserror 1.0.63", "tokio", "tracing", ] @@ -7396,7 +7377,7 @@ dependencies = [ "proc-macro2", "prost-build 0.13.1", "quote", - "syn 2.0.79", + "syn 2.0.87", "tonic-build", ] @@ -7470,7 +7451,7 @@ dependencies = [ "bytes", "rust_decimal", "serde", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -7523,7 +7504,7 @@ dependencies = [ "prometheus", "sealed 0.5.0", "smallvec", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -7547,7 +7528,7 @@ checksum = "4edc8853320c2a0dab800fbda86253c8938f6ea88510dc92c5f1ed20e794afc1" dependencies = [ "cfg-if", "miette-derive", - "thiserror", + "thiserror 1.0.63", "unicode-width", ] @@ -7559,7 +7540,7 @@ checksum = "dcf09caffaac8068c346b6df2a7fc27a177fd20b39421a39ce0a211bde679a6c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -7680,7 +7661,7 @@ dependencies = [ "rustc_version 0.4.0", "smallvec", "tagptr", - "thiserror", + "thiserror 1.0.63", "triomphe", "uuid", ] @@ -7721,7 +7702,7 @@ dependencies = [ "stringprep", "strsim 0.10.0", "take_mut", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-rustls 0.24.1", "tokio-util", @@ -7772,9 +7753,9 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "termcolor", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -7803,7 +7784,7 @@ dependencies = [ "serde", "serde_json", "socket2 0.5.6", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-native-tls", "tokio-util", @@ -7845,7 +7826,7 @@ dependencies = [ "sha2", "smallvec", "subprocess", - "thiserror", + "thiserror 1.0.63", "time", "uuid", "zstd 0.13.0", @@ -8207,7 +8188,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.63", "url", ] @@ -8371,7 +8352,7 @@ dependencies = [ "serde_with 3.8.1", "sha2", "subtle", - "thiserror", + "thiserror 1.0.63", "url", ] @@ -8418,7 +8399,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -8460,7 +8441,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -8476,7 +8457,7 @@ dependencies = [ "opentelemetry-proto", "opentelemetry_sdk", "prost 0.13.1", - "thiserror", + "thiserror 1.0.63", "tokio", "tonic", ] @@ -8515,7 +8496,7 @@ dependencies = [ "percent-encoding", "rand", "serde_json", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-stream 0.1.16", ] @@ -8618,7 +8599,7 @@ dependencies = [ "proc-macro2", "proc-macro2-diagnostics", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -8830,7 +8811,7 @@ dependencies = [ "regex", "regex-syntax 0.8.5", 
"structmeta", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -8989,7 +8970,7 @@ dependencies = [ "serde_json", "socket2 0.5.6", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-openssl", "tokio-postgres", @@ -9053,7 +9034,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9215,7 +9196,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9289,7 +9270,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -9373,7 +9354,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" dependencies = [ "proc-macro2", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9477,7 +9458,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "version_check", "yansi 1.0.1", ] @@ -9531,7 +9512,7 @@ dependencies = [ "parking_lot 0.12.1", "procfs 0.14.2", "protobuf", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -9617,7 +9598,7 @@ dependencies = [ "prost 0.13.1", "prost-types 0.13.1", "regex", - "syn 2.0.79", + "syn 2.0.87", "tempfile", ] @@ -9644,7 +9625,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9657,7 +9638,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9666,7 +9647,7 @@ version = "0.1.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9721,7 +9702,7 @@ dependencies = [ "prost-reflect", "prost-types 0.13.1", "protox-parse", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -9733,7 +9714,7 @@ dependencies = [ "logos", "miette", "prost-types 0.13.1", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -9850,7 +9831,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -9863,7 +9844,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -10098,7 +10079,7 @@ checksum = "a18479200779601e498ada4e8c1e1f50e3ee19deb0259c25825a98b5603b2cb4" dependencies = [ "getrandom", "libredox 0.0.1", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -10118,7 +10099,7 @@ checksum = "7f7473c2cfcf90008193dd0e3e16599455cb601a9fce322b5bb55de799664925" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -10330,7 +10311,7 @@ dependencies = [ "http 1.1.0", "reqwest 0.12.4", "serde", - "thiserror", + "thiserror 1.0.63", "tower-service", ] @@ -10417,7 +10398,7 @@ dependencies = [ "chrono", "clap", "console", - "fs-err 3.0.0", + "fs-err", "glob", "google-cloud-pubsub", "indicatif", @@ -10448,7 +10429,7 @@ dependencies = [ "console", "dialoguer", "enum-iterator", - "fs-err 3.0.0", + "fs-err", "itertools 0.13.0", ] @@ -10461,7 +10442,7 @@ dependencies = [ "prettyplease 0.2.15", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -10482,7 +10463,7 @@ dependencies = [ "risingwave_pb", "serde", "serde_json", - "thiserror", + "thiserror 1.0.63", "twox-hash", ] @@ -10532,7 +10513,7 @@ dependencies = [ "scopeguard", "serde_json", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tikv-jemallocator", 
"tokio-metrics", @@ -10739,7 +10720,7 @@ dependencies = [ "strum_macros 0.26.4", "sysinfo", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tinyvec", "tokio-retry", @@ -10781,7 +10762,7 @@ dependencies = [ "madsim-tokio", "parking_lot 0.12.1", "risingwave_common", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tikv-jemalloc-ctl", "tracing", @@ -10848,7 +10829,7 @@ dependencies = [ "prost 0.13.1", "risingwave_pb", "serde", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", ] @@ -10869,7 +10850,7 @@ dependencies = [ "risingwave_pb", "risingwave_rpc_client", "serde", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tower 0.5.0", "tower-http", @@ -11015,7 +10996,7 @@ dependencies = [ "elasticsearch", "enum-as-inner 0.6.0", "expect-test", - "fs-err 3.0.0", + "fs-err", "futures", "futures-async-stream", "gcp-bigquery-client", @@ -11087,9 +11068,9 @@ dependencies = [ "sqlx", "strum 0.26.3", "strum_macros 0.26.4", - "syn 2.0.79", + "syn 2.0.87", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tiberius", "time", @@ -11119,7 +11100,7 @@ dependencies = [ "chrono", "easy-ext", "expect-test", - "fs-err 3.0.0", + "fs-err", "hex", "itertools 0.13.0", "jsonbb", @@ -11135,7 +11116,7 @@ dependencies = [ "risingwave_pb", "rust_decimal", "serde_json", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "time", "tracing", @@ -11202,7 +11183,7 @@ dependencies = [ "risingwave_pb", "rw_futures_util", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", "workspace-hack", @@ -11234,7 +11215,7 @@ dependencies = [ "madsim-tonic", "serde", "serde-error", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", ] @@ -11272,7 +11253,7 @@ dependencies = [ "risingwave_pb", "smallvec", "static_assertions", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", "workspace-hack", @@ -11325,7 +11306,7 @@ dependencies = [ "sha2", "smallvec", "sql-json-path", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tonic", "tracing", @@ -11340,7 +11321,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -11420,7 +11401,7 @@ dependencies = [ "smallvec", "speedate", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-postgres", "tokio-stream 0.1.15", @@ -11436,7 +11417,7 @@ version = "2.2.0-alpha" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -11510,7 +11491,7 @@ dependencies = [ "risingwave_common", "risingwave_hummock_sdk", "risingwave_pb", - "thiserror", + "thiserror 1.0.63", "tracing", ] @@ -11549,7 +11530,7 @@ dependencies = [ "chrono", "expect-test", "foyer", - "fs-err 3.0.0", + "fs-err", "futures", "itertools 0.13.0", "jni", @@ -11563,7 +11544,7 @@ dependencies = [ "rw_futures_util", "serde", "serde_json", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", ] @@ -11578,7 +11559,7 @@ dependencies = [ "risingwave_pb", "risingwave_telemetry_event", "serde", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", ] @@ -11667,7 +11648,7 @@ dependencies = [ "strum 0.26.3", "sync-point", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-retry", "tokio-stream 0.1.15", @@ -11819,7 +11800,7 @@ dependencies = [ "risingwave_common", "risingwave_jni_core", "spin 0.9.8", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-retry", "tracing", @@ -11830,7 +11811,7 @@ name = "risingwave_pb" version = "2.2.0-alpha" dependencies = [ "enum-as-inner 0.6.0", - 
"fs-err 3.0.0", + "fs-err", "madsim-tonic", "madsim-tonic-build", "pbjson", @@ -11841,7 +11822,7 @@ dependencies = [ "risingwave_error", "serde", "strum 0.26.3", - "thiserror", + "thiserror 1.0.63", "walkdir", "workspace-hack", ] @@ -11853,7 +11834,7 @@ dependencies = [ "anyhow", "expect-test", "itertools 0.13.0", - "libtest-mimic 0.8.1", + "libtest-mimic", "madsim-tokio", "paste", "risingwave_expr_impl", @@ -11906,7 +11887,7 @@ dependencies = [ "risingwave_pb", "rw_futures_util", "static_assertions", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-retry", "tokio-stream 0.1.15", @@ -12009,13 +11990,13 @@ dependencies = [ "anyhow", "console", "itertools 0.13.0", - "libtest-mimic 0.8.1", + "libtest-mimic", "madsim-tokio", "matches", "serde", "serde_with 3.8.1", "serde_yaml", - "thiserror", + "thiserror 1.0.63", "tracing", "tracing-subscriber", "walkdir", @@ -12032,7 +12013,7 @@ dependencies = [ "clap", "expect-test", "itertools 0.13.0", - "libtest-mimic 0.8.1", + "libtest-mimic", "madsim-tokio", "rand", "rand_chacha", @@ -12132,7 +12113,7 @@ dependencies = [ "spin 0.9.8", "sync-point", "tempfile", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-retry", "tracing", @@ -12202,7 +12183,7 @@ dependencies = [ "static_assertions", "strum 0.26.3", "strum_macros 0.26.4", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tokio-metrics", "tokio-retry", @@ -12221,7 +12202,7 @@ dependencies = [ "prost 0.13.1", "reqwest 0.12.4", "risingwave_pb", - "thiserror", + "thiserror 1.0.63", "thiserror-ext", "tracing", "uuid", @@ -12365,7 +12346,7 @@ dependencies = [ "rustls-native-certs 0.7.3", "rustls-pemfile 2.2.0", "rustls-webpki 0.102.2", - "thiserror", + "thiserror 1.0.63", "tokio", "tokio-rustls 0.25.0", "url", @@ -12392,7 +12373,7 @@ dependencies = [ "quote", "rust-embed-utils", "shellexpand 3.1.0", - "syn 2.0.79", + "syn 2.0.87", "walkdir", ] @@ -12692,7 +12673,7 @@ dependencies = [ name = "rw_resource_util" version = "0.0.0" dependencies = [ - "fs-err 3.0.0", + "fs-err", "sysinfo", "tempfile", "thiserror-ext", @@ -12814,7 +12795,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -12838,7 +12819,7 @@ dependencies = [ "serde_json", "sqlx", "strum 0.26.3", - "thiserror", + "thiserror 1.0.63", "time", "tracing", "url", @@ -12872,7 +12853,7 @@ dependencies = [ "proc-macro2", "quote", "sea-bae", - "syn 2.0.79", + "syn 2.0.87", "unicode-ident", ] @@ -12936,8 +12917,8 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", - "thiserror", + "syn 2.0.87", + "thiserror 1.0.63", ] [[package]] @@ -12962,7 +12943,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -12992,7 +12973,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13125,7 +13106,7 @@ dependencies = [ "darling 0.20.9", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13136,7 +13117,7 @@ checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13211,7 +13192,7 @@ checksum = "8725e1dfadb3a50f7e5ce0b1a540466f6ed3fe7a0fca2ac2b8b831d31316bd00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13284,7 +13265,7 @@ dependencies = [ "darling 0.20.9", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13322,7 +13303,7 @@ checksum = 
"5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13510,7 +13491,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.63", "time", ] @@ -13693,7 +13674,7 @@ dependencies = [ "nom", "regex", "serde_json", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -13709,25 +13690,25 @@ dependencies = [ [[package]] name = "sqllogictest" -version = "0.22.0" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b6b8f606d3c4cdcaf2031c4320b79d7584e454b79562ba3d675f49701c160e" +checksum = "ec31dce96f489e2247a165837f49bbce4912b0cbcf127b79b4fdd87503022ae9" dependencies = [ "async-trait", "educe", - "fs-err 2.11.0", + "fs-err", "futures", "glob", "humantime", "itertools 0.13.0", - "libtest-mimic 0.7.3", + "libtest-mimic", "md-5", "owo-colors", "regex", "similar", "subst", "tempfile", - "thiserror", + "thiserror 2.0.3", "tracing", ] @@ -13758,7 +13739,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13809,7 +13790,7 @@ dependencies = [ "sha2", "smallvec", "sqlformat", - "thiserror", + "thiserror 1.0.63", "time", "tokio-stream 0.1.16", "tracing", @@ -13826,7 +13807,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -13849,7 +13830,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.79", + "syn 2.0.87", "tempfile", "url", ] @@ -13893,7 +13874,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.63", "time", "tracing", "uuid", @@ -13936,7 +13917,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.63", "time", "tracing", "uuid", @@ -14031,7 +14012,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14042,7 +14023,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14070,7 +14051,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14083,7 +14064,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14148,9 +14129,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.79" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -14177,7 +14158,7 @@ dependencies = [ "proc-macro-error 1.0.4", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14187,7 +14168,7 @@ dependencies = [ "futures-util", "madsim-tokio", "spin 0.9.8", - "thiserror", + "thiserror 1.0.63", ] [[package]] @@ -14304,7 +14285,16 @@ version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.63", +] + +[[package]] +name = "thiserror" +version = "2.0.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +dependencies = [ + "thiserror-impl 2.0.3", ] [[package]] @@ -14313,7 +14303,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7c19760dc47062bca5c1b3699b032111c93802d51ac47660db11b08afc6bad2" dependencies = [ - "thiserror", + "thiserror 1.0.63", "thiserror-ext-derive", ] @@ -14326,7 +14316,7 @@ dependencies = [ "either", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14337,7 +14327,18 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", ] [[package]] @@ -14361,15 +14362,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "threadpool" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" -dependencies = [ - "num_cpus", -] - [[package]] name = "thrift" version = "0.17.0" @@ -14403,7 +14395,7 @@ dependencies = [ "rust_decimal", "rustls-native-certs 0.6.3", "rustls-pemfile 1.0.4", - "thiserror", + "thiserror 1.0.63", "time", "tokio", "tokio-rustls 0.24.1", @@ -14535,7 +14527,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14675,6 +14667,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "tokio-websockets" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d" +dependencies = [ + "base64 0.22.0", + "bytes", + "futures-core", + "futures-sink", + "http 1.1.0", + "httparse", + "rand", + "ring 0.17.5", + "rustls-native-certs 0.8.0", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.0", + "tokio-util", +] + [[package]] name = "toml" version = "0.7.8" @@ -14789,7 +14802,7 @@ dependencies = [ "proc-macro2", "prost-build 0.13.1", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -14885,7 +14898,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15043,7 +15056,7 @@ dependencies = [ "log", "rand", "smallvec", - "thiserror", + "thiserror 1.0.63", "tinyvec", "tokio", "url", @@ -15067,7 +15080,7 @@ dependencies = [ "once_cell", "rand", "smallvec", - "thiserror", + "thiserror 1.0.63", "tinyvec", "tokio", "tracing", @@ -15089,7 +15102,7 @@ dependencies = [ "parking_lot 0.12.1", "resolv-conf", "smallvec", - "thiserror", + "thiserror 1.0.63", "tokio", "trust-dns-proto 0.21.2", ] @@ -15109,7 +15122,7 @@ dependencies = [ "rand", "resolv-conf", "smallvec", - "thiserror", + "thiserror 1.0.63", "tokio", "tracing", "trust-dns-proto 0.23.2", @@ -15198,7 +15211,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15209,7 +15222,7 @@ checksum = "1f718dfaf347dcb5b983bfc87608144b0bad87970aebcbea5ce44d2a30c08e63" dependencies = [ "proc-macro2", 
"quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15220,7 +15233,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15231,7 +15244,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15434,7 +15447,7 @@ checksum = "d674d135b4a8c1d7e813e2f8d1c9a58308aee4a680323066025e53132218bd91" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15502,7 +15515,7 @@ dependencies = [ "once_cell", "rustix 0.38.37", "system-interface", - "thiserror", + "thiserror 1.0.63", "tracing", "wasmtime", "wiggle", @@ -15536,7 +15549,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "wasm-bindgen-shared", ] @@ -15570,7 +15583,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -15730,7 +15743,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "wasmtime-component-util", "wasmtime-wit-bindgen", "wit-parser", @@ -15760,7 +15773,7 @@ dependencies = [ "log", "object 0.36.4", "target-lexicon", - "thiserror", + "thiserror 1.0.63", "wasmparser", "wasmtime-environ", "wasmtime-versioned-export-macros", @@ -15857,7 +15870,7 @@ checksum = "455fc30062a08ba6a9c2ccc6e8c76ea2759d01324d3548324f5d38257d0e8d96" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -15999,7 +16012,7 @@ dependencies = [ "anyhow", "async-trait", "bitflags 2.6.0", - "thiserror", + "thiserror 1.0.63", "tracing", "wasmtime", "wiggle-macro", @@ -16016,7 +16029,7 @@ dependencies = [ "proc-macro2", "quote", "shellexpand 2.1.2", - "syn 2.0.79", + "syn 2.0.87", "witx", ] @@ -16028,7 +16041,7 @@ checksum = "cc26129a8aea20b62c961d1b9ab4a3c3b56b10042ed85d004f8678af0f21ba6e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", "wiggle-generate", ] @@ -16138,7 +16151,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -16149,7 +16162,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -16447,7 +16460,7 @@ version = "2.2.0-alpha" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] @@ -16458,7 +16471,7 @@ checksum = "e366f27a5cabcddb2706a78296a40b8fcc451e1a6aba2fc1d94b4a01bdaaef4b" dependencies = [ "anyhow", "log", - "thiserror", + "thiserror 1.0.63", "wast 35.0.2", ] @@ -16590,7 +16603,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.87", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index ab4db275b086..c260bf8c5293 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -138,7 +138,7 @@ otlp-embedded = { git = "https://github.com/risingwavelabs/otlp-embedded", rev = prost = { version = "0.13" } prost-build = { version = "0.13" } # branch rw_patch -icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "1783f8f106958d6d0ce0249c1c708934a15c2a47", features = [ +icelake = { git = 
"https://github.com/risingwavelabs/icelake.git", rev = "0ec44fa826c91139c9cf459b005741df990ae9da", features = [ "prometheus", ] } # branch dev-rebase-main-20241030 diff --git a/ci/scripts/run-e2e-test.sh b/ci/scripts/run-e2e-test.sh index a8601fbb0ebe..e84ead4a81df 100755 --- a/ci/scripts/run-e2e-test.sh +++ b/ci/scripts/run-e2e-test.sh @@ -109,6 +109,10 @@ sqllogictest -p 4566 -d dev './e2e_test/ttl/ttl.slt' sqllogictest -p 4566 -d dev './e2e_test/database/prepare.slt' sqllogictest -p 4566 -d test './e2e_test/database/test.slt' +echo "--- e2e, $mode, python_client" +python3 -m pip install --break-system-packages psycopg +python3 ./e2e_test/python_client/main.py + echo "--- e2e, $mode, subscription" python3 -m pip install --break-system-packages psycopg2-binary sqllogictest -p 4566 -d dev './e2e_test/subscription/check_sql_statement.slt' diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index e8a0fa32f101..df022c8fb927 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -367,6 +367,28 @@ steps: timeout_in_minutes: 120 retry: *auto-retry + - label: "end-to-end test (madsim, random vnode count)" + key: "e2e-test-deterministic-random-vnode-count" + command: "TEST_NUM=32 RW_SIM_RANDOM_VNODE_COUNT=true timeout 120m ci/scripts/deterministic-e2e-test.sh" + if: | + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-e2e-test-deterministic-simulation" + || build.env("CI_STEPS") =~ /(^|,)e2e-tests?-deterministic-simulation(,|$$)/ + depends_on: "build-simulation" + plugins: + - seek-oss/aws-sm#v2.3.1: + env: + GITHUB_TOKEN: github-token + - docker-compose#v5.1.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + environment: + - GITHUB_TOKEN + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 120 + retry: *auto-retry + - label: "recovery test (madsim)" key: "recovery-test-deterministic" command: "TEST_NUM=12 KILL_RATE=1.0 BACKGROUND_DDL_RATE=0.0 timeout 65m ci/scripts/deterministic-recovery-test.sh" @@ -388,7 +410,7 @@ steps: # Ddl statements will randomly run with background_ddl. - label: "background_ddl, arrangement_backfill recovery test (madsim)" key: "background-ddl-arrangement-backfill-recovery-test-deterministic" - command: "TEST_NUM=12 KILL_RATE=1.0 BACKGROUND_DDL_RATE=0.8 USE_ARRANGEMENT_BACKFILL=true timeout 65m ci/scripts/deterministic-recovery-test.sh" + command: "TEST_NUM=12 KILL_RATE=1.0 BACKGROUND_DDL_RATE=0.8 USE_ARRANGEMENT_BACKFILL=true timeout 70m ci/scripts/deterministic-recovery-test.sh" if: | !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null || build.pull_request.labels includes "ci/run-recovery-test-deterministic-simulation" diff --git a/e2e_test/batch/basic/dml.slt.part b/e2e_test/batch/basic/dml_basic.slt.part similarity index 100% rename from e2e_test/batch/basic/dml.slt.part rename to e2e_test/batch/basic/dml_basic.slt.part diff --git a/e2e_test/batch/basic/dml_update.slt.part b/e2e_test/batch/basic/dml_update.slt.part new file mode 100644 index 000000000000..fcc3bbdfce9a --- /dev/null +++ b/e2e_test/batch/basic/dml_update.slt.part @@ -0,0 +1,132 @@ +# Extension to `dml_basic.slt.part` for testing advanced `UPDATE` statements. + +statement ok +SET RW_IMPLICIT_FLUSH TO true; + +statement ok +create table t (v1 int default 1919, v2 int default 810); + +statement ok +insert into t values (114, 514); + + +# Single assignment, to subquery. 
+statement ok +update t set v1 = (select 666); + +query II +select * from t; +---- +666 514 + +# Single assignment, to runtime-cardinality subquery returning 1 row. +statement ok +update t set v1 = (select generate_series(888, 888)); + +query II +select * from t; +---- +888 514 + +# Single assignment, to runtime-cardinality subquery returning 0 rows (set to NULL). +statement ok +update t set v1 = (select generate_series(1, 0)); + +query II +select * from t; +---- +NULL 514 + +# Single assignment, to runtime-cardinality subquery returning multiple rows. +statement error Scalar subquery produced more than one row +update t set v1 = (select generate_series(1, 2)); + +# Single assignment, to correlated subquery. +statement ok +update t set v1 = (select count(*) from t as source where source.v2 = t.v2); + +query II +select * from t; +---- +1 514 + +# Single assignment, to subquery with mismatched column count. +statement error must return only one column +update t set v1 = (select 666, 888); + + +# Multiple assignment clauses. +statement ok +update t set v1 = 1919, v2 = 810; + +query II +select * from t; +---- +1919 810 + +# Multiple assignments to the same column. +statement error multiple assignments to the same column +update t set v1 = 1, v1 = 2; + +statement error multiple assignments to the same column +update t set (v1, v1) = (1, 2); + +statement error multiple assignments to the same column +update t set (v1, v2) = (1, 2), v2 = 2; + +# Multiple assignments, to subquery. +statement ok +update t set (v1, v2) = (select 666, 888); + +query II +select * from t; +---- +666 888 + +# Multiple assignments, to subquery with cast. +statement ok +update t set (v1, v2) = (select 888.88, 999); + +query II +select * from t; +---- +889 999 + +# Multiple assignments, to subquery with cast failure. +# TODO: this currently shows `cannot cast type "record" to "record"` because we wrap the subquery result +# into a struct, which is not quite clear. +statement error cannot cast type +update t set (v1, v2) = (select '888.88', 999); + +# Multiple assignments, to subquery with mismatched column count. +statement error number of columns does not match number of values +update t set (v1, v2) = (select 666); + +# Multiple assignments, to scalar expression. +statement error source for a multiple-column UPDATE item must be a sub-SELECT or ROW\(\) expression +update t set (v1, v2) = v1 + 1; + + +# Assignment to system columns. +statement error update modifying column `_rw_timestamp` is unsupported +update t set _rw_timestamp = _rw_timestamp + interval '1 second'; + + +# https://github.com/risingwavelabs/risingwave/pull/19402#pullrequestreview-2444427475 +# https://github.com/risingwavelabs/risingwave/pull/19452 +statement ok +create table y (v1 int, v2 int); + +statement ok +insert into y values (11, 11), (22, 22); + +statement error Scalar subquery produced more than one row +update t set (v1, v2) = (select y.v1, y.v2 from y); + +statement ok +drop table y; + + +# Cleanup. 
+statement ok +drop table t; diff --git a/e2e_test/batch/distribution_mode.slt b/e2e_test/batch/distribution_mode.slt index b680796277c1..6f46a69a4352 100644 --- a/e2e_test/batch/distribution_mode.slt +++ b/e2e_test/batch/distribution_mode.slt @@ -10,9 +10,10 @@ include ./order/*.slt.part include ./join/*.slt.part include ./join/*/*.slt.part include ./aggregate/*.slt.part -include ./types/*.slt.part +include ./types/**/*.slt.part include ./functions/*.slt.part include ./over_window/main.slt.part +include ./subquery/**/*.slt.part statement ok SET QUERY_MODE TO auto; diff --git a/e2e_test/batch/local_mode.slt b/e2e_test/batch/local_mode.slt index 68df9f0d9195..9194ddfb1a83 100644 --- a/e2e_test/batch/local_mode.slt +++ b/e2e_test/batch/local_mode.slt @@ -10,10 +10,11 @@ include ./order/*.slt.part include ./join/*.slt.part include ./join/*/*.slt.part include ./aggregate/*.slt.part -include ./types/*.slt.part +include ./types/**/*.slt.part include ./catalog/*.slt.part include ./functions/*.slt.part include ./over_window/main.slt.part +include ./subquery/**/*.slt.part statement ok SET QUERY_MODE TO auto; diff --git a/e2e_test/batch/subquery/scalar_subquery.slt.part b/e2e_test/batch/subquery/scalar_subquery.slt.part index a0676e98f245..5bb08201b7a7 100644 --- a/e2e_test/batch/subquery/scalar_subquery.slt.part +++ b/e2e_test/batch/subquery/scalar_subquery.slt.part @@ -21,8 +21,10 @@ select (select x from t) x, 1 one; statement error Scalar subquery might produce more than one row create materialized view mv as select (select x from t) x, 1 one; +# Use a random value here to occasionally make it not in the same shard as `114514`, +# demonstrating that `BatchMaxOneRow` correctly handles distributed subqueries. statement ok -insert into t values (1919810); +insert into t values (extract(epoch from now())); # Cannot query as the cardinality of the subquery is now 2 query error Scalar subquery produced more than one row diff --git a/e2e_test/batch/types/list/list_case.slt.part b/e2e_test/batch/types/list/list_case.slt.part index dab3ece05cdc..20c87b8d0624 100644 --- a/e2e_test/batch/types/list/list_case.slt.part +++ b/e2e_test/batch/types/list/list_case.slt.part @@ -20,7 +20,7 @@ SELECT case when i%2=0 then ARRAY[i] else ARRAY[-i] end from (select generate_se {-3} {4} -query I +query T SELECT case when i%2=0 then NULL else ARRAY[i] end from (select generate_series as i from generate_series(0,9,1)) as t; ---- NULL @@ -34,7 +34,7 @@ NULL NULL {9} -query I +query T with a as ( SELECT (case when i%2=0 then NULL else ARRAY[i] end) as i from (select generate_series as i from generate_series(0,9,1)) as t ) diff --git a/e2e_test/batch/types/list/list_cast.slt.part b/e2e_test/batch/types/list/list_cast.slt.part index f968011cff7e..842a1349f7a3 100644 --- a/e2e_test/batch/types/list/list_cast.slt.part +++ b/e2e_test/batch/types/list/list_cast.slt.part @@ -1,45 +1,45 @@ statement ok SET RW_IMPLICIT_FLUSH TO true; -query I -select {1,2,3}::double[]; +query T +select array[1,2,3]::double[]; ---- {1,2,3} -query I -select {1.4,2.5,3.6}::int[]; +query T +select array[1.4,2.5,3.6]::int[]; ---- {1,3,4} -query I -select {'1','2','3'}::int[]; +query T +select array['1','2','3']::int[]; ---- {1,2,3} -statement error -select {'1','2','a'}::int[]; +statement error invalid digit +select array['1','2','a']::int[]; -query I -select {{1,2.4},{3,4.7},null,{null}::int[]}::int[][]; +query T +select array[array[1,2.4],array[3,4.7],null,array[null]::int[]]::int[][]; ---- {{1,2},{3,5},NULL,{NULL}} statement ok create table t (a 
double[]); -statement error -insert into t values ({null}); +statement error cannot cast +insert into t values (array[null]); statement ok -insert into t values ({null::double}); +insert into t values (array[null::double]); statement ok -insert into t values ({null}::double[]); +insert into t values (array[null]::double[]); statement ok insert into t values (null); -query I +query T select * from t order by 1; ---- {NULL} @@ -47,9 +47,9 @@ select * from t order by 1; NULL statement ok -insert into t values ({3.4, 4.3}); +insert into t values (array[3.4, 4.3]); -query I +query T select a::int[] from t order by 1; ---- {3,4} diff --git a/e2e_test/batch/types/list/list_storage.slt.part b/e2e_test/batch/types/list/list_storage.slt.part index 2c993e1c79bc..70fb9bc1fe4b 100644 --- a/e2e_test/batch/types/list/list_storage.slt.part +++ b/e2e_test/batch/types/list/list_storage.slt.part @@ -10,11 +10,11 @@ CREATE TABLE a(b INTEGER[]); statement ok INSERT INTO a VALUES (ARRAY[1, 2]), (NULL), (ARRAY[3, 4, 5, 6]), (ARRAY[NULL, 7]); -query I rowsort +query T rowsort SELECT * FROM a ---- -{1,2} NULL +{1,2} {3,4,5,6} {NULL,7} @@ -24,13 +24,13 @@ CREATE TABLE c(b VARCHAR[]); statement ok INSERT INTO c VALUES (ARRAY['hello', 'world']), (NULL), (ARRAY['fejwfoaejwfoijwafew', 'b', 'c']), (ARRAY[NULL, 'XXXXXXXXXXXXXXXXXXXXXXXX']); -query I rowsort +query T rowsort SELECT * FROM c ---- -{hello,world} NULL {NULL,XXXXXXXXXXXXXXXXXXXXXXXX} {fejwfoaejwfoijwafew,b,c} +{hello,world} statement ok drop table a; diff --git a/e2e_test/batch/types/list/multi-dimentional_list_cast.slt.part b/e2e_test/batch/types/list/multi-dimentional_list_cast.slt.part index 8a67840a6c20..9345e30dec65 100644 --- a/e2e_test/batch/types/list/multi-dimentional_list_cast.slt.part +++ b/e2e_test/batch/types/list/multi-dimentional_list_cast.slt.part @@ -1,25 +1,12 @@ -query I +query T select array[array[1, 2], array[3, 4]]; ---- {{1,2},{3,4}} -query I +query T select array[[1, 2], [3, 4]]; ---- {{1,2},{3,4}} -query I +query error sql parser error select array[[array[1, 2]], [[3, 4]]]; ----- -{{{1,2}},{{3,4}}} - -query I -select array[[[1, 2]], [array[3, 4]]]; ----- -{{{1,2}},{{3,4}}} - -statement error syntax error at or near -select array[array[1, 2], [3, 4]]; - -statement error syntax error at or near -select array[[1, 2], array[3, 4]]; \ No newline at end of file diff --git a/e2e_test/ddl/alter_session_params.slt b/e2e_test/ddl/alter_session_params.slt index ecaf7f09ebad..49968ded8324 100644 --- a/e2e_test/ddl/alter_session_params.slt +++ b/e2e_test/ddl/alter_session_params.slt @@ -26,9 +26,6 @@ set STREAMING_ENABLE_DELTA_JOIN to false; statement ok set RW_STREAMING_ENABLE_DELTA_JOIN to true; -statement error session param query_mode cannot be altered system wide -alter system set query_mode to auto; - connection other1 query T show RW_STREAMING_ENABLE_DELTA_JOIN; diff --git a/e2e_test/extended_mode/1dim_list.slt.part b/e2e_test/extended_mode/1dim_list.slt.part new file mode 100644 index 000000000000..1e067bd6bfeb --- /dev/null +++ b/e2e_test/extended_mode/1dim_list.slt.part @@ -0,0 +1,57 @@ +# Test binary format of 1-dimension lists (arrays) + +query T +select ARRAY['foo', 'bar', null]; +---- +{foo,bar,NULL} + +query T +select ARRAY[1,2+3,4*5+1]; +---- +{1,5,21} + +query T +select ARRAY[null]; +---- +{NULL} + +statement error +select ARRAY[]; + +query T +select ARRAY[]::int[]; +---- +{} + +statement ok +create table t (v1 int); + +statement ok +insert into t values (1), (2), (3); + +query T rowsort +select ARRAY[1, v1*2] from t; +---- +{1,2} 
+{1,4} +{1,6} + +query T +select min(ARRAY[1, v1*2]) from t; +---- +{1,2} + +query T +select max(ARRAY[1, v1*2]) from t; +---- +{1,6} + +query T +select array[false, false] from t; +---- +{f,f} +{f,f} +{f,f} + +statement ok +drop table t; \ No newline at end of file diff --git a/e2e_test/extended_mode/README.md b/e2e_test/extended_mode/README.md new file mode 100644 index 000000000000..2c96d7e0c232 --- /dev/null +++ b/e2e_test/extended_mode/README.md @@ -0,0 +1,5 @@ +# How to run + +```shell +sqllogictest -p 4566 -d dev -e postgres-extended './e2e_test/extended_mode/**/*.slt' +``` diff --git a/e2e_test/extended_mode/type.slt b/e2e_test/extended_mode/type.slt index b172fcf389ab..c6e5c51b5c04 100644 --- a/e2e_test/extended_mode/type.slt +++ b/e2e_test/extended_mode/type.slt @@ -3,10 +3,10 @@ statement ok SET RW_IMPLICIT_FLUSH TO true; -# RisingWave can't support list and struct now so we skip them. -# include ../batch/types/array.slt.part -# include ../batch/types/struct.slt.part -# include ../batch/types/list.slt.part +include 1dim_list.slt.part + +# RisingWave can't support struct now so we skip it. +# include ../batch/types/struct/*.slt.part # Sqllogitest can't support binary format bytea type so we skip it. # include ../batch/types/bytea.slt.part diff --git a/e2e_test/python_client/main.py b/e2e_test/python_client/main.py new file mode 100644 index 000000000000..bb41ba6c38f3 --- /dev/null +++ b/e2e_test/python_client/main.py @@ -0,0 +1,19 @@ +import psycopg + +def test_psycopg_extended_mode(): + conn = psycopg.connect(host='localhost', port='4566', dbname='dev', user='root') + with conn.cursor() as cur: + cur.execute("select Array[1::bigint, 2::bigint, 3::bigint]", binary=True) + assert cur.fetchone() == ([1, 2, 3],) + + cur.execute("select Array['foo', null, 'bar']", binary=True) + assert cur.fetchone() == (['foo', None, 'bar'],) + + cur.execute("select ROW('123 Main St', 'New York', '10001')", binary=True) + assert cur.fetchone() == (('123 Main St', 'New York', '10001'),) + + cur.execute("select array[ROW('123 Main St', 'New York', '10001'), ROW('234 Main St', null, '10001')]", binary=True) + assert cur.fetchone() == ([('123 Main St', 'New York', '10001'), ('234 Main St', None, '10001')],) + +if __name__ == '__main__': + test_psycopg_extended_mode() diff --git a/e2e_test/source_inline/cdc/mysql/mysql_create_drop.slt.serial b/e2e_test/source_inline/cdc/mysql/mysql_create_drop.slt.serial index fde008079dc6..2766f37fefe1 100644 --- a/e2e_test/source_inline/cdc/mysql/mysql_create_drop.slt.serial +++ b/e2e_test/source_inline/cdc/mysql/mysql_create_drop.slt.serial @@ -49,12 +49,6 @@ create source s with ( sleep 2s -# At the beginning, the source is paused. It will resume after a downstream is created. 
-system ok -internal_table.mjs --name s --type '' --count ----- -count: 0 - statement ok create table tt1_shared (v1 int, diff --git a/e2e_test/source_inline/fs/posix_fs.slt b/e2e_test/source_inline/fs/posix_fs.slt index 96fb18e97e66..5408daf28321 100644 --- a/e2e_test/source_inline/fs/posix_fs.slt +++ b/e2e_test/source_inline/fs/posix_fs.slt @@ -14,17 +14,55 @@ CREATE TABLE diamonds ( source_rate_limit = 0 ) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); +statement ok +CREATE SOURCE diamonds_source ( + carat FLOAT, + cut TEXT, + color TEXT, + depth FLOAT, +) WITH ( + connector = 'posix_fs', + match_pattern = 'data*.csv', + posix_fs.root = 'e2e_test/source_inline/fs/data', + source_rate_limit = 0 +) FORMAT PLAIN ENCODE CSV ( without_header = 'false', delimiter = ','); + +statement ok +create materialized view diamonds_mv as select * from diamonds_source; + sleep 1s # no output due to rate limit -query TTTT rowsort +statement count 0 select * from diamonds; + + +statement count 0 +select * from diamonds_mv; + + +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name, node_name; ---- +diamonds FS_FETCH {FS_FETCH} 0 +diamonds SOURCE {SOURCE} 0 +diamonds_mv FS_FETCH {MVIEW,FS_FETCH} 0 +diamonds_mv SOURCE {SOURCE} 0 statement ok ALTER TABLE diamonds SET source_rate_limit TO DEFAULT; -sleep 10s + +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name, node_name; +---- +diamonds_mv FS_FETCH {MVIEW,FS_FETCH} 0 +diamonds_mv SOURCE {SOURCE} 0 + + +sleep 3s query TTTT rowsort select * from diamonds; @@ -40,5 +78,42 @@ select * from diamonds; 1.28 Good J 63.1 1.3 Fair E 64.7 + +statement count 0 +select * from diamonds_mv; + + + +statement ok +ALTER SOURCE diamonds_source SET source_rate_limit TO DEFAULT; + +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name, node_name; +---- + + +sleep 3s + +query TTTT rowsort +select * from diamonds_mv; +---- +0.22 Premium I 62 +0.23 Very Good H 57.5 +0.25 Ideal E 61.4 +0.28 Good J 63.1 +0.3 Fair E 64.7 +1.22 Premium I 62 +1.23 Very Good H 57.5 +1.25 Ideal E 61.4 +1.28 Good J 63.1 +1.3 Fair E 64.7 + statement ok DROP TABLE diamonds; + +statement ok +drop materialized view diamonds_mv; + +statement ok +drop source diamonds_source; diff --git a/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka.slt b/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka.slt.serial similarity index 80% rename from e2e_test/source_inline/kafka/alter/rate_limit_source_kafka.slt rename to e2e_test/source_inline/kafka/alter/rate_limit_source_kafka.slt.serial index 96fd016c5812..8353166b5a87 100644 --- a/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka.slt +++ b/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka.slt.serial @@ -80,16 +80,38 @@ select * from rl_mv3; ---- 0 +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name; +---- +rl_mv1 SOURCE {SOURCE} 0 +rl_mv2 SOURCE {SOURCE} 0 +rl_mv3 SOURCE {SOURCE} 0 + ############## Alter Source (rate_limit = 0 --> rate_limit = 1000) skipif in-memory -query I +statement count 0 alter source kafka_source set source_rate_limit to 1000; +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name; +---- +rl_mv1 SOURCE {SOURCE} 1000 +rl_mv2 SOURCE 
{SOURCE} 1000 +rl_mv3 SOURCE {SOURCE} 1000 + skipif in-memory -query I +statement count 0 alter source kafka_source set source_rate_limit to default; +# rate limit becomes None +query T +select count(*) from rw_rate_limit; +---- +0 + skipif in-memory sleep 3s diff --git a/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka_shared.slt.serial b/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka_shared.slt.serial new file mode 100644 index 000000000000..a9a730930b1b --- /dev/null +++ b/e2e_test/source_inline/kafka/alter/rate_limit_source_kafka_shared.slt.serial @@ -0,0 +1,166 @@ +control substitution on + +############## Create kafka seed data + +statement ok +create table kafka_seed_data (v1 int); + +statement ok +insert into kafka_seed_data select * from generate_series(1, 1000); + +############## Sink into kafka + +statement ok +create sink kafka_sink +from + kafka_seed_data with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test_rate_limit_shared', + type = 'append-only', + force_append_only='true' +); + +############## Source from kafka (rate_limit = 0) + +# Wait for the topic to create +skipif in-memory +sleep 5s + +statement ok +create source kafka_source (v1 int) with ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'test_rate_limit_shared', + source_rate_limit = 0, +) FORMAT PLAIN ENCODE JSON + +statement ok +flush; + +############## Check data + +skipif in-memory +sleep 3s + +############## Create MV on source + +statement ok +create materialized view rl_mv1 as select count(*) from kafka_source; + +############## Although source is rate limited, the MV's SourceBackfill is not. + +statement ok +flush; + +query I +select * from rl_mv1; +---- +1000 + +############## Insert more data. They will not go into the MV. + +statement ok +insert into kafka_seed_data select * from generate_series(1, 1000); + +sleep 3s + +query I +select * from rl_mv1; +---- +1000 + +statement ok +SET BACKGROUND_DDL=true; + +statement ok +SET BACKFILL_RATE_LIMIT=0; + +statement ok +create materialized view rl_mv2 as select count(*) from kafka_source; + +sleep 1s + +query T +SELECT progress from rw_ddl_progress; +---- +0 rows consumed + +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name; +---- +kafka_source SOURCE {SOURCE} 0 +rl_mv2 SOURCE_BACKFILL {SOURCE_SCAN} 0 + + +############## Alter Source (rate_limit = 0 --> rate_limit = 1000) + +statement ok +alter source kafka_source set source_rate_limit to 1000; + +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name; +---- +kafka_source SOURCE {SOURCE} 1000 +rl_mv2 SOURCE_BACKFILL {SOURCE_SCAN} 0 + +sleep 3s + +query I +select * from rl_mv1; +---- +2000 + +query T +SELECT progress from rw_ddl_progress; +---- +0 rows consumed + + + +statement error +alter materialized view rl_mv2 set source_rate_limit = 1000; +---- +db error: ERROR: Failed to run the query + +Caused by: + sql parser error: expected SCHEMA/PARALLELISM/BACKFILL_RATE_LIMIT after SET, found: source_rate_limit +LINE 1: alter materialized view rl_mv2 set source_rate_limit = 1000; + ^ + + +query T +select name, node_name, fragment_type, rate_limit from rw_rate_limit join rw_relations on table_id=id +order by name; +---- +kafka_source SOURCE {SOURCE} 1000 +rl_mv2 SOURCE_BACKFILL {SOURCE_SCAN} 0 + + +statement ok +alter materialized view rl_mv2 set backfill_rate_limit = 2000; + + +query T +select name, node_name, fragment_type, rate_limit 
from rw_rate_limit join rw_relations on table_id=id +order by name; +---- +kafka_source SOURCE {SOURCE} 1000 +rl_mv2 SOURCE_BACKFILL {SOURCE_SCAN} 2000 + +sleep 3s + +query T +select * from rl_mv2; +---- +2000 + + + +############## Cleanup + +statement ok +drop source kafka_source cascade; + +statement ok +drop table kafka_seed_data cascade; diff --git a/e2e_test/source_inline/kafka/alter/rate_limit_table_kafka.slt b/e2e_test/source_inline/kafka/alter/rate_limit_table_kafka.slt.serial similarity index 99% rename from e2e_test/source_inline/kafka/alter/rate_limit_table_kafka.slt rename to e2e_test/source_inline/kafka/alter/rate_limit_table_kafka.slt.serial index ac2a665fd10c..5d22fc85dea4 100644 --- a/e2e_test/source_inline/kafka/alter/rate_limit_table_kafka.slt +++ b/e2e_test/source_inline/kafka/alter/rate_limit_table_kafka.slt.serial @@ -63,7 +63,7 @@ select count(*) from kafka_source; ############## Alter source (rate_limit = 0 --> rate_limit = 1000) skipif in-memory -query I +statement ok alter table kafka_source set source_rate_limit to 1000; skipif in-memory diff --git a/e2e_test/source_inline/kafka/avro/alter_table.slt b/e2e_test/source_inline/kafka/avro/alter_table.slt index 330cdc490cdb..08a98c2cca4c 100644 --- a/e2e_test/source_inline/kafka/avro/alter_table.slt +++ b/e2e_test/source_inline/kafka/avro/alter_table.slt @@ -78,3 +78,20 @@ ABC statement ok drop table t; + +statement ok +create table t (primary key (kafka_key)) +INCLUDE key as kafka_key +WITH ( + ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, + topic = 'avro_alter_table_test' +) +FORMAT UPSERT ENCODE AVRO ( + schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}' +); + +statement ok +ALTER TABLE t REFRESH SCHEMA; + +statement ok +drop table t; diff --git a/e2e_test/source_inline/kafka/shared_source.slt.serial b/e2e_test/source_inline/kafka/shared_source.slt.serial index 3397f90f081d..af6b371d21c4 100644 --- a/e2e_test/source_inline/kafka/shared_source.slt.serial +++ b/e2e_test/source_inline/kafka/shared_source.slt.serial @@ -59,11 +59,17 @@ select count(*) from rw_internal_tables where name like '%s0%'; sleep 1s -# SourceExecutor's ingestion does not start (state table is empty), even after sleep +statement ok +flush; + +# SourceExecutor's starts from latest. system ok internal_table.mjs --name s0 --type source ---- -(empty) +0,"{""split_info"": {""partition"": 0, ""start_offset"": 0, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" +1,"{""split_info"": {""partition"": 1, ""start_offset"": 0, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" +2,"{""split_info"": {""partition"": 2, ""start_offset"": 1, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" +3,"{""split_info"": {""partition"": 3, ""start_offset"": 2, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" statement ok @@ -72,12 +78,6 @@ create materialized view mv_1 as select * from s0; # Wait enough time to ensure SourceExecutor consumes all Kafka data. sleep 2s -# SourceExecutor's ingestion started, but it only starts from latest (offset 1). -system ok -internal_table.mjs --name s0 --type source ----- -(empty) - # SourceBackfill starts from offset 0, with backfill_info: HasDataToBackfill { latest_offset: "0" } (decided by kafka high watermark). # (meaning upstream already consumed offset 0, so we only need to backfill to offset 0) @@ -144,7 +144,7 @@ EOF sleep 2s -# SourceExecutor's finally got new data now. +# SourceExecutor's got new data. 
system ok internal_table.mjs --name s0 --type source ---- @@ -185,16 +185,6 @@ select v1, v2 from mv_1; 4 dd -# start_offset changed to 1 -system ok -internal_table.mjs --name s0 --type source ----- -0,"{""split_info"": {""partition"": 0, ""start_offset"": 1, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" -1,"{""split_info"": {""partition"": 1, ""start_offset"": 1, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" -2,"{""split_info"": {""partition"": 2, ""start_offset"": 2, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" -3,"{""split_info"": {""partition"": 3, ""start_offset"": 3, ""stop_offset"": null, ""topic"": ""shared_source""}, ""split_type"": ""kafka""}" - - # Transition from SourceCachingUp to Finished after consuming one upstream message. system ok internal_table.mjs --name mv_1 --type sourcebackfill @@ -334,6 +324,47 @@ internal_table.mjs --name s0 --type source # # risedev psql -c "select name, flags, parallelism from rw_fragments JOIN rw_relations ON rw_fragments.table_id = rw_relations.id order by name;" +# Test: rate limit and resume won't lose data + +statement ok +alter source s0 set source_rate_limit to 0; + + +system ok +cat < with_properties = 6; catalog.StreamSourceInfo info = 7; string source_name = 8; - // Streaming rate limit + // Source rate limit optional uint32 rate_limit = 9; map secret_refs = 10; } @@ -205,7 +205,7 @@ message StreamFsFetch { map with_properties = 6; catalog.StreamSourceInfo info = 7; string source_name = 8; - // Streaming rate limit + // Source rate limit optional uint32 rate_limit = 9; map secret_refs = 10; } @@ -231,7 +231,7 @@ message SourceBackfillNode { catalog.StreamSourceInfo info = 4; string source_name = 5; map with_properties = 6; - // Streaming rate limit + // Backfill rate limit optional uint32 rate_limit = 7; // fields above are the same as StreamSource @@ -609,7 +609,7 @@ message StreamScanNode { // Used iff `ChainType::Backfill`. plan_common.StorageTableDesc table_desc = 7; - // The rate limit for the stream scan node. + // The backfill rate limit for the stream scan node. optional uint32 rate_limit = 8; // Snapshot read every N barriers @@ -646,7 +646,7 @@ message StreamCdcScanNode { // The external table that will be backfilled for CDC. plan_common.ExternalTableDesc cdc_table_desc = 5; - // The rate limit for the stream cdc scan node. + // The backfill rate limit for the stream cdc scan node. optional uint32 rate_limit = 6; // Whether skip the backfill and only consume from upstream. @@ -985,6 +985,8 @@ enum FragmentTypeFlag { FRAGMENT_TYPE_FLAG_CDC_FILTER = 256; FRAGMENT_TYPE_FLAG_SOURCE_SCAN = 1024; FRAGMENT_TYPE_FLAG_SNAPSHOT_BACKFILL_STREAM_SCAN = 2048; + // Note: this flag is not available in old fragments, so only suitable for debugging purpose. 
+ FRAGMENT_TYPE_FLAG_FS_FETCH = 4096; } // The streaming context associated with a stream plan diff --git a/src/batch/src/executor/join/distributed_lookup_join.rs b/src/batch/src/executor/join/distributed_lookup_join.rs index 139717e06af3..0a328e2f985f 100644 --- a/src/batch/src/executor/join/distributed_lookup_join.rs +++ b/src/batch/src/executor/join/distributed_lookup_join.rs @@ -355,7 +355,7 @@ impl LookupExecutorBuilder for InnerSideExecutorBuilder { let pk_prefix = OwnedRow::new(scan_range.eq_conds); - if self.lookup_prefix_len == self.table.pk_indices().len() && !self.table.has_epoch_idx() { + if self.lookup_prefix_len == self.table.pk_indices().len() { let row = self.table.get_row(&pk_prefix, self.epoch.into()).await?; if let Some(row) = row { diff --git a/src/batch/src/executor/row_seq_scan.rs b/src/batch/src/executor/row_seq_scan.rs index 17ac6268dce8..b65f4bf8939b 100644 --- a/src/batch/src/executor/row_seq_scan.rs +++ b/src/batch/src/executor/row_seq_scan.rs @@ -396,40 +396,17 @@ impl RowSeqScanExecutor { ) -> Result> { let pk_prefix = scan_range.pk_prefix; assert!(pk_prefix.len() == table.pk_indices().len()); + let timer = histogram.as_ref().map(|histogram| histogram.start_timer()); - let res = if table.has_epoch_idx() { - // has epoch_idx means we need to select `_rw_timestamp` column which is unsupported by `get_row` interface, so use iterator interface instead. - let range_bounds = (Bound::::Unbounded, Bound::Unbounded); - let iter = table - .batch_chunk_iter_with_pk_bounds( - epoch.into(), - &pk_prefix, - range_bounds, - false, - 1, - PrefetchOptions::new(false, false), - ) - .await?; - pin_mut!(iter); - let chunk = iter.next().await.transpose().map_err(BatchError::from)?; - if let Some(chunk) = chunk { - let row = chunk.row_at(0).0.to_owned_row(); - Ok(Some(row)) - } else { - Ok(None) - } - } else { - // Point Get. - let row = table.get_row(&pk_prefix, epoch.into()).await?; - Ok(row) - }; + // Point Get. + let row = table.get_row(&pk_prefix, epoch.into()).await?; if let Some(timer) = timer { timer.observe_duration() } - res + Ok(row) } #[try_stream(ok = DataChunk, error = BatchError)] diff --git a/src/batch/src/executor/s3_file_scan.rs b/src/batch/src/executor/s3_file_scan.rs index a7b0d1bacd79..38907c63f841 100644 --- a/src/batch/src/executor/s3_file_scan.rs +++ b/src/batch/src/executor/s3_file_scan.rs @@ -12,13 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use anyhow::anyhow; use futures_async_stream::try_stream; use futures_util::stream::StreamExt; -use parquet::arrow::ProjectionMask; -use risingwave_common::array::arrow::IcebergArrowConvert; use risingwave_common::catalog::{Field, Schema}; -use risingwave_connector::source::iceberg::parquet_file_reader::create_parquet_stream_builder; +use risingwave_connector::source::iceberg::{new_s3_operator, read_parquet_file}; use risingwave_pb::batch_plan::file_scan_node; use risingwave_pb::batch_plan::file_scan_node::StorageType; use risingwave_pb::batch_plan::plan_node::NodeBody; @@ -85,34 +82,18 @@ impl S3FileScanExecutor { async fn do_execute(self: Box) { assert_eq!(self.file_format, FileFormat::Parquet); for file in self.file_location { - let mut batch_stream_builder = create_parquet_stream_builder( + let op = new_s3_operator( self.s3_region.clone(), self.s3_access_key.clone(), self.s3_secret_key.clone(), - file, - ) - .await?; - - let arrow_schema = batch_stream_builder.schema(); - assert_eq!(arrow_schema.fields.len(), self.schema.fields.len()); - for (field, arrow_field) in self.schema.fields.iter().zip(arrow_schema.fields.iter()) { - assert_eq!(*field.name, *arrow_field.name()); - } - - batch_stream_builder = batch_stream_builder.with_projection(ProjectionMask::all()); - - batch_stream_builder = batch_stream_builder.with_batch_size(self.batch_size); - - let record_batch_stream = batch_stream_builder - .build() - .map_err(|e| anyhow!(e).context("fail to build arrow stream builder"))?; - + file.clone(), + )?; + let chunk_stream = read_parquet_file(op, file, None, None, self.batch_size, 0).await?; #[for_await] - for record_batch in record_batch_stream { - let record_batch = record_batch?; - let chunk = IcebergArrowConvert.chunk_from_record_batch(&record_batch)?; - debug_assert_eq!(chunk.data_types(), self.schema.data_types()); - yield chunk; + for stream_chunk in chunk_stream { + let stream_chunk = stream_chunk?; + let (data_chunk, _) = stream_chunk.into_parts(); + yield data_chunk; } } } diff --git a/src/batch/src/executor/source.rs b/src/batch/src/executor/source.rs index 7a37be918389..1ba28404a4d8 100644 --- a/src/batch/src/executor/source.rs +++ b/src/batch/src/executor/source.rs @@ -157,9 +157,9 @@ impl SourceExecutor { ConnectorProperties::default(), None, )); - let stream = self + let (stream, _) = self .source - .build_stream(Some(self.split_list), self.column_ids, source_ctx) + .build_stream(Some(self.split_list), self.column_ids, source_ctx, false) .await?; #[for_await] diff --git a/src/batch/src/executor/update.rs b/src/batch/src/executor/update.rs index a753aef840f5..95f1963cf582 100644 --- a/src/batch/src/executor/update.rs +++ b/src/batch/src/executor/update.rs @@ -42,13 +42,13 @@ pub struct UpdateExecutor { table_version_id: TableVersionId, dml_manager: DmlManagerRef, child: BoxedExecutor, - exprs: Vec, + old_exprs: Vec, + new_exprs: Vec, chunk_size: usize, schema: Schema, identity: String, returning: bool, txn_id: TxnId, - update_column_indices: Vec, session_id: u32, } @@ -59,11 +59,11 @@ impl UpdateExecutor { table_version_id: TableVersionId, dml_manager: DmlManagerRef, child: BoxedExecutor, - exprs: Vec, + old_exprs: Vec, + new_exprs: Vec, chunk_size: usize, identity: String, returning: bool, - update_column_indices: Vec, session_id: u32, ) -> Self { let chunk_size = chunk_size.next_multiple_of(2); @@ -75,7 +75,8 @@ impl UpdateExecutor { table_version_id, dml_manager, child, - exprs, + old_exprs, + new_exprs, chunk_size, schema: if returning { table_schema @@ -87,7 +88,6 @@ 
impl UpdateExecutor { identity, returning, txn_id, - update_column_indices, session_id, } } @@ -109,7 +109,7 @@ impl Executor for UpdateExecutor { impl UpdateExecutor { #[try_stream(boxed, ok = DataChunk, error = BatchError)] - async fn do_execute(mut self: Box) { + async fn do_execute(self: Box) { let table_dml_handle = self .dml_manager .table_dml_handle(self.table_id, self.table_version_id)?; @@ -122,15 +122,12 @@ impl UpdateExecutor { assert_eq!( data_types, - self.exprs.iter().map(|e| e.return_type()).collect_vec(), + self.new_exprs.iter().map(|e| e.return_type()).collect_vec(), "bad update schema" ); assert_eq!( data_types, - self.update_column_indices - .iter() - .map(|i: &usize| self.child.schema()[*i].data_type.clone()) - .collect_vec(), + self.old_exprs.iter().map(|e| e.return_type()).collect_vec(), "bad update schema" ); @@ -159,27 +156,35 @@ impl UpdateExecutor { let mut rows_updated = 0; #[for_await] - for data_chunk in self.child.execute() { - let data_chunk = data_chunk?; + for input in self.child.execute() { + let input = input?; + + let old_data_chunk = { + let mut columns = Vec::with_capacity(self.old_exprs.len()); + for expr in &self.old_exprs { + let column = expr.eval(&input).await?; + columns.push(column); + } + + DataChunk::new(columns, input.visibility().clone()) + }; let updated_data_chunk = { - let mut columns = Vec::with_capacity(self.exprs.len()); - for expr in &mut self.exprs { - let column = expr.eval(&data_chunk).await?; + let mut columns = Vec::with_capacity(self.new_exprs.len()); + for expr in &self.new_exprs { + let column = expr.eval(&input).await?; columns.push(column); } - DataChunk::new(columns, data_chunk.visibility().clone()) + DataChunk::new(columns, input.visibility().clone()) }; if self.returning { yield updated_data_chunk.clone(); } - for (row_delete, row_insert) in data_chunk - .project(&self.update_column_indices) - .rows() - .zip_eq_debug(updated_data_chunk.rows()) + for (row_delete, row_insert) in + (old_data_chunk.rows()).zip_eq_debug(updated_data_chunk.rows()) { rows_updated += 1; // If row_delete == row_insert, we don't need to do a actual update @@ -227,34 +232,35 @@ impl BoxedExecutorBuilder for UpdateExecutor { let table_id = TableId::new(update_node.table_id); - let exprs: Vec<_> = update_node - .get_exprs() + let old_exprs: Vec<_> = update_node + .get_old_exprs() .iter() .map(build_from_prost) .try_collect()?; - let update_column_indices = update_node - .update_column_indices + let new_exprs: Vec<_> = update_node + .get_new_exprs() .iter() - .map(|x| *x as usize) - .collect_vec(); + .map(build_from_prost) + .try_collect()?; Ok(Box::new(Self::new( table_id, update_node.table_version_id, source.context().dml_manager(), child, - exprs, + old_exprs, + new_exprs, source.context.get_config().developer.chunk_size, source.plan_node().get_identity().clone(), update_node.returning, - update_column_indices, update_node.session_id, ))) } } #[cfg(test)] +#[cfg(any())] mod tests { use std::sync::Arc; diff --git a/src/common/src/config.rs b/src/common/src/config.rs index 64d7675903ec..1f67057801c4 100644 --- a/src/common/src/config.rs +++ b/src/common/src/config.rs @@ -1100,6 +1100,9 @@ pub struct StreamingDeveloperConfig { #[serde(default = "default::developer::memory_controller_eviction_factor_stable")] pub memory_controller_eviction_factor_stable: f64, + #[serde(default = "default::developer::memory_controller_update_interval_ms")] + pub memory_controller_update_interval_ms: usize, + #[serde(default = 
"default::developer::memory_controller_sequence_tls_step")] pub memory_controller_sequence_tls_step: u64, @@ -1203,11 +1206,11 @@ pub struct ObjectStoreConfig { #[serde(default)] pub s3: S3ObjectStoreConfig, - // TODO: the following field will be deprecated after opendal is stablized + // TODO: the following field will be deprecated after opendal is stabilized #[serde(default = "default::object_store_config::opendal_upload_concurrency")] pub opendal_upload_concurrency: usize, - // TODO: the following field will be deprecated after opendal is stablized + // TODO: the following field will be deprecated after opendal is stabilized #[serde(default)] pub opendal_writer_abort_on_err: bool, @@ -1819,7 +1822,7 @@ pub mod default { } pub fn time_travel_version_cache_capacity() -> u64 { - 32 + 2 } } @@ -2056,6 +2059,10 @@ pub mod default { 1.0 } + pub fn memory_controller_update_interval_ms() -> usize { + 100 + } + pub fn memory_controller_sequence_tls_step() -> u64 { 128 } diff --git a/src/common/src/session_config/mod.rs b/src/common/src/session_config/mod.rs index 3b9451bf5511..d452c51ad353 100644 --- a/src/common/src/session_config/mod.rs +++ b/src/common/src/session_config/mod.rs @@ -77,7 +77,7 @@ pub struct SessionConfig { /// The default value is auto which means let the system decide to run batch queries in local /// or distributed mode automatically. #[serde_as(as = "DisplayFromStr")] - #[parameter(default = QueryMode::default(), flags = "NO_ALTER_SYS")] + #[parameter(default = QueryMode::default())] query_mode: QueryMode, /// Sets the number of digits displayed for floating-point values. diff --git a/src/common/src/types/jsonb.rs b/src/common/src/types/jsonb.rs index 9e6fdf8641cb..4b25741fbe96 100644 --- a/src/common/src/types/jsonb.rs +++ b/src/common/src/types/jsonb.rs @@ -133,8 +133,8 @@ impl crate::types::to_binary::ToBinary for JsonbRef<'_> { fn to_binary_with_type( &self, _ty: &crate::types::DataType, - ) -> super::to_binary::Result> { - Ok(Some(self.value_serialize().into())) + ) -> super::to_binary::Result { + Ok(self.value_serialize().into()) } } diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index 44be87116643..ad516eab101c 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -987,7 +987,7 @@ pub fn hash_datum(datum: impl ToDatumRef, state: &mut impl std::hash::Hasher) { impl ScalarRefImpl<'_> { pub fn binary_format(&self, data_type: &DataType) -> to_binary::Result { use self::to_binary::ToBinary; - self.to_binary_with_type(data_type).transpose().unwrap() + self.to_binary_with_type(data_type) } pub fn text_format(&self, data_type: &DataType) -> String { diff --git a/src/common/src/types/num256.rs b/src/common/src/types/num256.rs index 6c96b3ddbbec..eccb0a9741ea 100644 --- a/src/common/src/types/num256.rs +++ b/src/common/src/types/num256.rs @@ -165,14 +165,11 @@ macro_rules! 
impl_common_for_num256 { } impl ToBinary for $scalar_ref<'_> { - fn to_binary_with_type( - &self, - _ty: &DataType, - ) -> super::to_binary::Result> { + fn to_binary_with_type(&self, _ty: &DataType) -> super::to_binary::Result { let mut output = bytes::BytesMut::new(); let buffer = self.to_be_bytes(); output.put_slice(&buffer); - Ok(Some(output.freeze())) + Ok(output.freeze()) } } diff --git a/src/common/src/types/postgres_type.rs b/src/common/src/types/postgres_type.rs index d85f08ed59cc..c84f3e19f309 100644 --- a/src/common/src/types/postgres_type.rs +++ b/src/common/src/types/postgres_type.rs @@ -116,7 +116,7 @@ impl DataType { )* DataType::Int256 => 1302, DataType::Serial => 1016, - DataType::Struct(_) => -1, + DataType::Struct(_) => 2287, // pseudo-type of array[struct] (see `pg_type.dat`) DataType::List { .. } => unreachable!("Never reach here!"), DataType::Map(_) => 1304, } @@ -125,8 +125,7 @@ impl DataType { DataType::Int256 => 1301, DataType::Map(_) => 1303, // TODO: Support to give a new oid for custom struct type. #9434 - // 1043 is varchar - DataType::Struct(_) => 1043, + DataType::Struct(_) => 2249, // pseudo-type of struct (see `pg_type.dat`) } } } diff --git a/src/common/src/types/to_binary.rs b/src/common/src/types/to_binary.rs index da7f75f0a2a3..294f96bc7045 100644 --- a/src/common/src/types/to_binary.rs +++ b/src/common/src/types/to_binary.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use bytes::{Bytes, BytesMut}; +use bytes::{BufMut, Bytes, BytesMut}; use postgres_types::{ToSql, Type}; +use rw_iter_util::ZipEqFast; use super::{ DataType, Date, Decimal, Interval, ScalarRefImpl, Serial, Time, Timestamp, Timestamptz, F32, F64, }; +use crate::array::{ListRef, StructRef}; use crate::error::NotImplemented; /// Error type for [`ToBinary`] trait. @@ -38,19 +40,19 @@ pub type Result = std::result::Result; /// [`postgres_types::ToSql`] has similar functionality, and most of our types implement /// that trait and forward `ToBinary` to it directly. pub trait ToBinary { - fn to_binary_with_type(&self, ty: &DataType) -> Result>; + fn to_binary_with_type(&self, ty: &DataType) -> Result; } macro_rules! implement_using_to_sql { ($({ $scalar_type:ty, $data_type:ident, $accessor:expr } ),* $(,)?) => { $( impl ToBinary for $scalar_type { - fn to_binary_with_type(&self, ty: &DataType) -> Result> { + fn to_binary_with_type(&self, ty: &DataType) -> Result { match ty { DataType::$data_type => { let mut output = BytesMut::new(); #[allow(clippy::redundant_closure_call)] $accessor(self).to_sql(&Type::ANY, &mut output).map_err(ToBinaryError::ToSql)?; - Ok(Some(output.freeze())) + Ok(output.freeze()) }, _ => unreachable!(), } @@ -78,8 +80,68 @@ implement_using_to_sql! { { Timestamptz, Timestamptz, |x: &Timestamptz| x.to_datetime_utc() } } +impl ToBinary for ListRef<'_> { + fn to_binary_with_type(&self, ty: &DataType) -> Result { + // Reference: Postgres code `src/backend/utils/adt/arrayfuncs.c` + // https://github.com/postgres/postgres/blob/c1c09007e219ae68d1f8428a54baf68ccc1f8683/src/backend/utils/adt/arrayfuncs.c#L1548 + use crate::row::Row; + let element_ty = match ty { + DataType::List(ty) => ty.as_ref(), + _ => unreachable!(), + }; + if matches!(element_ty, DataType::List(_)) { + bail_not_implemented!( + issue = 7949, + "list with 2 or more dimensions is not supported" + ) + } + let mut buf = BytesMut::new(); + buf.put_i32(1); // Number of dimensions (must be 1) + buf.put_i32(1); // Has nulls? 
+ buf.put_i32(element_ty.to_oid()); // Element type + buf.put_i32(self.len() as i32); // Length of 1st dimension + buf.put_i32(0); // Offset of 1st dimension + for element in self.iter() { + match element { + None => { + buf.put_i32(-1); // -1 length means a NULL + } + Some(value) => { + let data = value.to_binary_with_type(element_ty)?; + buf.put_i32(data.len() as i32); // Length of element + buf.put(data); + } + } + } + Ok(buf.into()) + } +} + +impl ToBinary for StructRef<'_> { + fn to_binary_with_type(&self, ty: &DataType) -> Result { + // Reference: Postgres code `src/backend/utils/adt/rowtypes.c` + // https://github.com/postgres/postgres/blob/a3699daea2026de324ed7cc7115c36d3499010d3/src/backend/utils/adt/rowtypes.c#L687 + let mut buf = BytesMut::new(); + buf.put_i32(ty.as_struct().len() as i32); // number of columns + for (datum, field_ty) in self.iter_fields_ref().zip_eq_fast(ty.as_struct().types()) { + buf.put_i32(field_ty.to_oid()); // column type + match datum { + None => { + buf.put_i32(-1); // -1 length means a NULL + } + Some(value) => { + let data = value.to_binary_with_type(field_ty)?; + buf.put_i32(data.len() as i32); // Length of element + buf.put(data); + } + } + } + Ok(buf.into()) + } +} + impl ToBinary for ScalarRefImpl<'_> { - fn to_binary_with_type(&self, ty: &DataType) -> Result> { + fn to_binary_with_type(&self, ty: &DataType) -> Result { match self { ScalarRefImpl::Int16(v) => v.to_binary_with_type(ty), ScalarRefImpl::Int32(v) => v.to_binary_with_type(ty), @@ -98,11 +160,14 @@ impl ToBinary for ScalarRefImpl<'_> { ScalarRefImpl::Time(v) => v.to_binary_with_type(ty), ScalarRefImpl::Bytea(v) => v.to_binary_with_type(ty), ScalarRefImpl::Jsonb(v) => v.to_binary_with_type(ty), - ScalarRefImpl::Struct(_) | ScalarRefImpl::List(_) => bail_not_implemented!( - issue = 7949, - "the pgwire extended-mode encoding for {ty} is unsupported" - ), - ScalarRefImpl::Map(_) => todo!(), + ScalarRefImpl::List(v) => v.to_binary_with_type(ty), + ScalarRefImpl::Struct(v) => v.to_binary_with_type(ty), + ScalarRefImpl::Map(_) => { + bail_not_implemented!( + issue = 7949, + "the pgwire extended-mode encoding for {ty} is unsupported" + ) + } } } } diff --git a/src/compute/src/memory/manager.rs b/src/compute/src/memory/manager.rs index b90624193c70..235ab5802fbf 100644 --- a/src/compute/src/memory/manager.rs +++ b/src/compute/src/memory/manager.rs @@ -17,8 +17,6 @@ use std::sync::{Arc, Mutex}; use std::time::Duration; use risingwave_common::sequence::AtomicSequence; -use risingwave_common::system_param::local_manager::SystemParamsReaderRef; -use risingwave_common::system_param::reader::SystemParamsRead; use risingwave_stream::executor::monitor::StreamingMetrics; use super::controller::LruWatermarkController; @@ -50,7 +48,7 @@ pub struct MemoryManager { impl MemoryManager { // Arbitrarily set a minimal barrier interval in case it is too small, // especially when it's 0. 
- const MIN_TICK_INTERVAL_MS: u32 = 10; + const MIN_INTERVAL: Duration = Duration::from_millis(10); pub fn new(config: MemoryManagerConfig) -> Arc { let controller = Mutex::new(LruWatermarkController::new(&config)); @@ -67,42 +65,23 @@ impl MemoryManager { self.watermark_sequence.clone() } - pub async fn run( - self: Arc, - initial_interval_ms: u32, - mut system_params_change_rx: tokio::sync::watch::Receiver, - ) { + pub async fn run(self: Arc, interval: Duration) { // Loop interval of running control policy - let mut interval_ms = std::cmp::max(initial_interval_ms, Self::MIN_TICK_INTERVAL_MS); - tracing::info!( - "start running MemoryManager with interval {}ms", - interval_ms - ); + let interval = std::cmp::max(interval, Self::MIN_INTERVAL); + tracing::info!("start running MemoryManager with interval {interval:?}",); // Keep same interval with the barrier interval - let mut tick_interval = tokio::time::interval(Duration::from_millis(interval_ms as u64)); + let mut tick_interval = tokio::time::interval(interval); loop { - // Wait for a while to check if need eviction. - tokio::select! { - Ok(_) = system_params_change_rx.changed() => { - let params = system_params_change_rx.borrow().load(); - let new_interval_ms = std::cmp::max(params.barrier_interval_ms(), Self::MIN_TICK_INTERVAL_MS); - if new_interval_ms != interval_ms { - interval_ms = new_interval_ms; - tick_interval = tokio::time::interval(Duration::from_millis(interval_ms as u64)); - tracing::info!("updated MemoryManager interval to {}ms", interval_ms); - } - } - - _ = tick_interval.tick() => { - let new_watermark_sequence = self.controller.lock().unwrap().tick(); - - self.watermark_sequence.store(new_watermark_sequence, Ordering::Relaxed); - - self.metrics.lru_runtime_loop_count.inc(); - } - } + tick_interval.tick().await; + + let new_watermark_sequence = self.controller.lock().unwrap().tick(); + + self.watermark_sequence + .store(new_watermark_sequence, Ordering::Relaxed); + + self.metrics.lru_runtime_loop_count.inc(); } } } diff --git a/src/compute/src/server.rs b/src/compute/src/server.rs index da3328b0b2ce..eda0ee9c159d 100644 --- a/src/compute/src/server.rs +++ b/src/compute/src/server.rs @@ -325,10 +325,14 @@ pub async fn compute_node_serve( }); // Run a background memory manager - tokio::spawn(memory_mgr.clone().run( - system_params.barrier_interval_ms(), - system_params_manager.watch_params(), - )); + tokio::spawn( + memory_mgr.clone().run(Duration::from_millis( + config + .streaming + .developer + .memory_controller_update_interval_ms as _, + )), + ); let heap_profiler = HeapProfiler::new( opts.total_memory_bytes, diff --git a/src/config/docs.md b/src/config/docs.md index bfe6a2fb5429..74afda83f143 100644 --- a/src/config/docs.md +++ b/src/config/docs.md @@ -155,7 +155,7 @@ This page is automatically generated by `./risedev generate-example-config` | shared_buffer_min_batch_flush_size_mb | The minimum total flush size of shared buffer spill. When a shared buffer spilled is trigger, the total flush size across multiple epochs should be at least higher than this size. | 800 | | sstable_id_remote_fetch_number | Number of SST ids fetched from meta per RPC | 10 | | table_info_statistic_history_times | Deprecated: The window size of table info statistic history. 
| 240 | -| time_travel_version_cache_capacity | | 32 | +| time_travel_version_cache_capacity | | 2 | | write_conflict_detection_enabled | Whether to enable write conflict detection | true | ## storage.cache diff --git a/src/config/example.toml b/src/config/example.toml index 7a01ff5254e7..dca7fe505730 100644 --- a/src/config/example.toml +++ b/src/config/example.toml @@ -136,6 +136,7 @@ stream_memory_controller_threshold_stable = 0.72 stream_memory_controller_eviction_factor_aggressive = 2.0 stream_memory_controller_eviction_factor_graceful = 1.5 stream_memory_controller_eviction_factor_stable = 1.0 +stream_memory_controller_update_interval_ms = 100 stream_memory_controller_sequence_tls_step = 128 stream_memory_controller_sequence_tls_lag = 32 stream_enable_arrangement_backfill = true @@ -177,7 +178,7 @@ table_info_statistic_history_times = 240 mem_table_spill_threshold = 4194304 compactor_max_overlap_sst_count = 64 compactor_max_preload_meta_file_count = 32 -time_travel_version_cache_capacity = 32 +time_travel_version_cache_capacity = 2 [storage.cache.block_cache_eviction] algorithm = "Lru" diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml index 4a7493dd3dd9..1fefe486d27f 100644 --- a/src/connector/Cargo.toml +++ b/src/connector/Cargo.toml @@ -18,7 +18,7 @@ anyhow = "1" apache-avro = { workspace = true } assert_matches = "1" async-compression = { version = "0.4.5", features = ["gzip", "tokio"] } -async-nats = "0.37" +async-nats = "0.38" async-trait = "0.1" auto_enums = { workspace = true } auto_impl = "1" diff --git a/src/connector/src/connector_common/iceberg/mod.rs b/src/connector/src/connector_common/iceberg/mod.rs index d10a9eefb68a..ac0000128a0a 100644 --- a/src/connector/src/connector_common/iceberg/mod.rs +++ b/src/connector/src/connector_common/iceberg/mod.rs @@ -62,6 +62,22 @@ pub struct IcebergCommon { /// Full name of table, must include schema name. #[serde(rename = "table.name")] pub table_name: String, + /// Credential for accessing iceberg catalog, only applicable in rest catalog. + /// A credential to exchange for a token in the OAuth2 client credentials flow. + #[serde(rename = "catalog.credential")] + pub credential: Option, + /// token for accessing iceberg catalog, only applicable in rest catalog. + /// A Bearer token which will be used for interaction with the server. + #[serde(rename = "catalog.token")] + pub token: Option, + /// `oauth2-server-uri` for accessing iceberg catalog, only applicable in rest catalog. + /// Token endpoint URI to fetch token from if the Rest Catalog is not the authorization server. + #[serde(rename = "catalog.oauth2-server-uri")] + pub oauth2_server_uri: Option, + /// scope for accessing iceberg catalog, only applicable in rest catalog. + /// Additional scope for OAuth2. + #[serde(rename = "catalog.scope")] + pub scope: Option, #[serde( rename = "s3.path.style.access", @@ -145,20 +161,32 @@ impl IcebergCommon { match &self.warehouse_path { Some(warehouse_path) => { let (bucket, _) = { - let url = Url::parse(warehouse_path).with_context(|| { - format!("Invalid warehouse path: {}", warehouse_path) - })?; - let bucket = url - .host_str() - .with_context(|| { - format!("Invalid s3 path: {}, bucket is missing", warehouse_path) - })? 
- .to_string(); - let root = url.path().trim_start_matches('/').to_string(); - (bucket, root) + let url = Url::parse(warehouse_path); + if url.is_err() && catalog_type == "rest" { + // If the warehouse path is not a valid URL, it could be a warehouse name in rest catalog + // so we allow it to pass here. + (None, None) + } else { + let url = url.with_context(|| { + format!("Invalid warehouse path: {}", warehouse_path) + })?; + let bucket = url + .host_str() + .with_context(|| { + format!( + "Invalid s3 path: {}, bucket is missing", + warehouse_path + ) + })? + .to_string(); + let root = url.path().trim_start_matches('/').to_string(); + (Some(bucket), Some(root)) + } }; - iceberg_configs.insert("iceberg.table.io.bucket".to_string(), bucket); + if let Some(bucket) = bucket { + iceberg_configs.insert("iceberg.table.io.bucket".to_string(), bucket); + } } None => { if catalog_type != "rest" { @@ -219,29 +247,48 @@ impl IcebergCommon { path_style_access.to_string(), ); } - if matches!(self.catalog_type.as_deref(), Some("glue")) { - java_catalog_configs.insert( - "client.credentials-provider".to_string(), - "com.risingwave.connector.catalog.GlueCredentialProvider".to_string(), - ); - // Use S3 ak/sk and region as glue ak/sk and region by default. - // TODO: use different ak/sk and region for s3 and glue. - java_catalog_configs.insert( - "client.credentials-provider.glue.access-key-id".to_string(), - self.access_key.clone().to_string(), - ); - java_catalog_configs.insert( - "client.credentials-provider.glue.secret-access-key".to_string(), - self.secret_key.clone().to_string(), - ); - if let Some(region) = &self.region { - java_catalog_configs - .insert("client.region".to_string(), region.clone().to_string()); + + match self.catalog_type.as_deref() { + Some("rest") => { + if let Some(credential) = &self.credential { + java_catalog_configs.insert("credential".to_string(), credential.clone()); + } + if let Some(token) = &self.token { + java_catalog_configs.insert("token".to_string(), token.clone()); + } + if let Some(oauth2_server_uri) = &self.oauth2_server_uri { + java_catalog_configs + .insert("oauth2-server-uri".to_string(), oauth2_server_uri.clone()); + } + if let Some(scope) = &self.scope { + java_catalog_configs.insert("scope".to_string(), scope.clone()); + } + } + Some("glue") => { + java_catalog_configs.insert( + "client.credentials-provider".to_string(), + "com.risingwave.connector.catalog.GlueCredentialProvider".to_string(), + ); + // Use S3 ak/sk and region as glue ak/sk and region by default. + // TODO: use different ak/sk and region for s3 and glue. 
+ java_catalog_configs.insert( + "client.credentials-provider.glue.access-key-id".to_string(), + self.access_key.clone().to_string(), + ); java_catalog_configs.insert( - "glue.endpoint".to_string(), - format!("https://glue.{}.amazonaws.com", region), + "client.credentials-provider.glue.secret-access-key".to_string(), + self.secret_key.clone().to_string(), ); + if let Some(region) = &self.region { + java_catalog_configs + .insert("client.region".to_string(), region.clone().to_string()); + java_catalog_configs.insert( + "glue.endpoint".to_string(), + format!("https://glue.{}.amazonaws.com", region), + ); + } } + _ => {} } } @@ -492,6 +539,20 @@ mod v2 { S3_SECRET_ACCESS_KEY.to_string(), self.secret_key.clone().to_string(), ); + if let Some(credential) = &self.credential { + iceberg_configs.insert("credential".to_string(), credential.clone()); + } + if let Some(token) = &self.token { + iceberg_configs.insert("token".to_string(), token.clone()); + } + if let Some(oauth2_server_uri) = &self.oauth2_server_uri { + iceberg_configs + .insert("oauth2-server-uri".to_string(), oauth2_server_uri.clone()); + } + if let Some(scope) = &self.scope { + iceberg_configs.insert("scope".to_string(), scope.clone()); + } + let config_builder = iceberg_catalog_rest::RestCatalogConfig::builder() .uri(self.catalog_uri.clone().with_context(|| { "`catalog.uri` must be set in rest catalog".to_string() diff --git a/src/connector/src/sink/encoder/json.rs b/src/connector/src/sink/encoder/json.rs index 7691b3de5f44..b3c0580a5a78 100644 --- a/src/connector/src/sink/encoder/json.rs +++ b/src/connector/src/sink/encoder/json.rs @@ -204,7 +204,7 @@ fn datum_to_json_object( let data_type = field.data_type(); - tracing::debug!("datum_to_json_object: {:?}, {:?}", data_type, scalar_ref); + tracing::trace!("datum_to_json_object: {:?}, {:?}", data_type, scalar_ref); let value = match (data_type, scalar_ref) { (DataType::Boolean, ScalarRefImpl::Bool(v)) => { diff --git a/src/connector/src/sink/iceberg/mod.rs b/src/connector/src/sink/iceberg/mod.rs index 0c878ae1ba6d..54699b9599fe 100644 --- a/src/connector/src/sink/iceberg/mod.rs +++ b/src/connector/src/sink/iceberg/mod.rs @@ -49,6 +49,7 @@ use risingwave_pb::connector_service::SinkMetadata; use serde_derive::Deserialize; use serde_with::{serde_as, DisplayFromStr}; use thiserror_ext::AsReport; +use url::Url; use with_options::WithOptions; use self::prometheus::monitored_base_file_writer::MonitoredBaseFileWriterBuilder; @@ -283,7 +284,18 @@ impl IcebergSink { names.push(self.config.common.table_name.to_string()); match &self.config.common.warehouse_path { Some(warehouse_path) => { - if warehouse_path.ends_with('/') { + let url = Url::parse(warehouse_path); + if url.is_err() { + // For rest catalog, the warehouse_path could be a warehouse name. + // In this case, we should specify the location when creating a table. 
+ if self.config.common.catalog_type() == "rest" + || self.config.common.catalog_type() == "rest_rust" + { + None + } else { + bail!(format!("Invalid warehouse path: {}", warehouse_path)) + } + } else if warehouse_path.ends_with('/') { Some(format!("{}{}", warehouse_path, names.join("/"))) } else { Some(format!("{}/{}", warehouse_path, names.join("/"))) @@ -1017,6 +1029,10 @@ mod test { database_name: Some("demo_db".to_string()), table_name: "demo_table".to_string(), path_style_access: Some(true), + credential: None, + oauth2_server_uri: None, + scope: None, + token: None, }, r#type: "upsert".to_string(), force_append_only: false, diff --git a/src/connector/src/sink/starrocks.rs b/src/connector/src/sink/starrocks.rs index 35d1033d8ee1..b5b6bf90e025 100644 --- a/src/connector/src/sink/starrocks.rs +++ b/src/connector/src/sink/starrocks.rs @@ -33,7 +33,6 @@ use serde_derive::Serialize; use serde_json::Value; use serde_with::{serde_as, DisplayFromStr}; use thiserror_ext::AsReport; -use tokio::task::JoinHandle; use url::form_urlencoded; use with_options::WithOptions; @@ -898,16 +897,12 @@ impl SinkCommitCoordinator for StarrocksSinkCommitter { tracing::debug!(?epoch, ?txn_labels, "commit transaction"); if !txn_labels.is_empty() { - let join_handles = txn_labels - .into_iter() - .map(|txn_label| { - let client = self.client.clone(); - tokio::spawn(async move { client.commit(txn_label).await }) - }) - .collect::>>>(); - futures::future::try_join_all(join_handles) - .await - .map_err(|err| SinkError::DorisStarrocksConnect(anyhow!(err)))?; + futures::future::try_join_all( + txn_labels + .into_iter() + .map(|txn_label| self.client.commit(txn_label)), + ) + .await?; } Ok(()) } diff --git a/src/connector/src/source/base.rs b/src/connector/src/source/base.rs index 59e3585431a6..e031a85a34d6 100644 --- a/src/connector/src/source/base.rs +++ b/src/connector/src/source/base.rs @@ -69,7 +69,7 @@ pub trait TryFromBTreeMap: Sized + UnknownFields { /// Represents `WITH` options for sources. /// /// Each instance should add a `#[derive(with_options::WithOptions)]` marker. -pub trait SourceProperties: TryFromBTreeMap + Clone + WithOptions { +pub trait SourceProperties: TryFromBTreeMap + Clone + WithOptions + std::fmt::Debug { const SOURCE_NAME: &'static str; type Split: SplitMetaData + TryFrom @@ -108,7 +108,7 @@ impl TryFromBTreeMap for P { } } -pub async fn create_split_reader( +pub async fn create_split_reader( prop: P, splits: Vec, parser_config: ParserConfig, @@ -375,6 +375,10 @@ pub trait SplitReader: Sized + Send { fn backfill_info(&self) -> HashMap { HashMap::new() } + + async fn seek_to_latest(&mut self) -> Result> { + Err(anyhow!("seek_to_latest is not supported for this connector").into()) + } } /// Information used to determine whether we should start and finish source backfill. 
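
Editor's note on the `src/connector/src/source/base.rs` hunk above: it adds a provided `seek_to_latest` method to the `SplitReader` trait whose default body returns an error, so existing connectors are untouched, and only readers that can jump to the head of their stream (the shared Kafka source tests earlier in this patch expect exactly that "start from latest" behaviour) are expected to override it and hand back their updated split states. Below is a minimal, self-contained sketch of what such an override could look like; `MySplitReader`, the simplified `SplitImpl` struct, and `fetch_high_watermark` are hypothetical stand-ins for illustration only (the real trait returns the connector crate's own `Result`/`SplitImpl` types), and the `anyhow`, `async-trait`, and `tokio` dependencies are assumed to be available as they are elsewhere in this workspace.

```rust
use anyhow::{anyhow, Result};
use async_trait::async_trait;

/// Simplified stand-in for the connector split state carried in the patch.
#[derive(Debug, Clone)]
struct SplitImpl {
    partition: i32,
    start_offset: i64,
}

#[async_trait]
trait SplitReader {
    /// Mirrors the new provided method: connectors that cannot seek report an error.
    async fn seek_to_latest(&mut self) -> Result<Vec<SplitImpl>> {
        Err(anyhow!("seek_to_latest is not supported for this connector"))
    }
}

/// Hypothetical reader that knows how to look up the latest offset per partition.
struct MySplitReader {
    splits: Vec<SplitImpl>,
}

/// Placeholder: a real connector would query its broker or storage service here.
async fn fetch_high_watermark(_partition: i32) -> Result<i64> {
    Ok(42)
}

#[async_trait]
impl SplitReader for MySplitReader {
    async fn seek_to_latest(&mut self) -> Result<Vec<SplitImpl>> {
        // Move every split's start offset to the latest position and return the
        // updated states so the caller can persist them (e.g. into the source state table).
        let mut updated = Vec::with_capacity(self.splits.len());
        for split in &self.splits {
            let latest = fetch_high_watermark(split.partition).await?;
            updated.push(SplitImpl {
                partition: split.partition,
                start_offset: latest,
            });
        }
        self.splits = updated.clone();
        Ok(updated)
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let mut reader = MySplitReader {
        splits: vec![
            SplitImpl { partition: 0, start_offset: 0 },
            SplitImpl { partition: 1, start_offset: 0 },
        ],
    };
    let resumed = reader.seek_to_latest().await?;
    println!("resumed splits: {resumed:?}");
    Ok(())
}
```

The batch `SourceExecutor` change earlier in this patch now destructures `build_stream(..., false)` into `(stream, _)`; the extra boolean plausibly opts in or out of this seek-to-latest behaviour, but that reading is an inference from the diff, not something this hunk confirms.
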
diff --git a/src/connector/src/source/cdc/mod.rs b/src/connector/src/source/cdc/mod.rs index 3f3626449153..9e99c7db9e5e 100644 --- a/src/connector/src/source/cdc/mod.rs +++ b/src/connector/src/source/cdc/mod.rs @@ -58,7 +58,7 @@ pub fn build_cdc_table_id(source_id: u32, external_table_name: &str) -> String { format!("{}.{}", source_id, external_table_name) } -pub trait CdcSourceTypeTrait: Send + Sync + Clone + 'static { +pub trait CdcSourceTypeTrait: Send + Sync + Clone + std::fmt::Debug + 'static { const CDC_CONNECTOR_NAME: &'static str; fn source_type() -> CdcSourceType; } diff --git a/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs b/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs index 14258d892465..ca8ee1ae486b 100644 --- a/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs +++ b/src/connector/src/source/filesystem/opendal_source/opendal_reader.rs @@ -19,24 +19,19 @@ use async_compression::tokio::bufread::GzipDecoder; use async_trait::async_trait; use futures::TryStreamExt; use futures_async_stream::try_stream; -use itertools::Itertools; use opendal::Operator; -use parquet::arrow::async_reader::AsyncFileReader; -use parquet::arrow::{parquet_to_arrow_schema, ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::file::metadata::FileMetaData; -use risingwave_common::array::arrow::IcebergArrowConvert; use risingwave_common::array::StreamChunk; -use risingwave_common::util::tokio_util::compat::FuturesAsyncReadCompatExt; use tokio::io::{AsyncRead, BufReader}; use tokio_util::io::{ReaderStream, StreamReader}; use super::opendal_enumerator::OpendalEnumerator; use super::OpendalSource; use crate::error::ConnectorResult; -use crate::parser::{ByteStreamSourceParserImpl, EncodingProperties, ParquetParser, ParserConfig}; +use crate::parser::{ByteStreamSourceParserImpl, EncodingProperties, ParserConfig}; use crate::source::filesystem::file_common::CompressionFormat; use crate::source::filesystem::nd_streaming::need_nd_streaming; use crate::source::filesystem::{nd_streaming, OpendalFsSplit}; +use crate::source::iceberg::read_parquet_file; use crate::source::{ BoxChunkSourceStream, Column, SourceContextRef, SourceMessage, SourceMeta, SplitMetaData, SplitReader, @@ -91,38 +86,15 @@ impl OpendalReader { let msg_stream; if let EncodingProperties::Parquet = &self.parser_config.specific.encoding_config { - // // If the format is "parquet", use `ParquetParser` to convert `record_batch` into stream chunk. - let mut reader: tokio_util::compat::Compat = self - .connector - .op - .reader_with(&object_name) - .into_future() // Unlike `rustc`, `try_stream` seems require manual `into_future`. - .await? - .into_futures_async_read(..) - .await? - .compat(); - let parquet_metadata = reader.get_metadata().await.map_err(anyhow::Error::from)?; - - let file_metadata = parquet_metadata.file_metadata(); - let column_indices = - extract_valid_column_indices(self.columns.clone(), file_metadata)?; - let projection_mask = - ProjectionMask::leaves(file_metadata.schema_descr(), column_indices); - // For the Parquet format, we directly convert from a record batch to a stream chunk. - // Therefore, the offset of the Parquet file represents the current position in terms of the number of rows read from the file. - let record_batch_stream = ParquetRecordBatchStreamBuilder::new(reader) - .await? 
- .with_batch_size(self.source_ctx.source_ctrl_opts.chunk_size) - .with_projection(projection_mask) - .with_offset(split.offset) - .build()?; - - let parquet_parser = ParquetParser::new( - self.parser_config.common.rw_columns.clone(), + msg_stream = read_parquet_file( + self.connector.op.clone(), object_name, + self.columns.clone(), + Some(self.parser_config.common.rw_columns.clone()), + self.source_ctx.source_ctrl_opts.chunk_size, split.offset, - )?; - msg_stream = parquet_parser.into_stream(record_batch_stream); + ) + .await?; } else { let data_stream = Self::stream_read_object( self.connector.op.clone(), @@ -229,61 +201,3 @@ impl OpendalReader { } } } - -/// Extracts valid column indices from a Parquet file schema based on the user's requested schema. -/// -/// This function is used for column pruning of Parquet files. It calculates the intersection -/// between the columns in the currently read Parquet file and the schema provided by the user. -/// This is useful for reading a `RecordBatch` with the appropriate `ProjectionMask`, ensuring that -/// only the necessary columns are read. -/// -/// # Parameters -/// - `columns`: A vector of `Column` representing the user's requested schema. -/// - `metadata`: A reference to `FileMetaData` containing the schema and metadata of the Parquet file. -/// -/// # Returns -/// - A `ConnectorResult>`, which contains the indices of the valid columns in the -/// Parquet file schema that match the requested schema. If an error occurs during processing, -/// it returns an appropriate error. -pub fn extract_valid_column_indices( - columns: Option>, - metadata: &FileMetaData, -) -> ConnectorResult> { - match columns { - Some(rw_columns) => { - let parquet_column_names = metadata - .schema_descr() - .columns() - .iter() - .map(|c| c.name()) - .collect_vec(); - - let converted_arrow_schema = - parquet_to_arrow_schema(metadata.schema_descr(), metadata.key_value_metadata()) - .map_err(anyhow::Error::from)?; - - let valid_column_indices: Vec = rw_columns - .iter() - .filter_map(|column| { - parquet_column_names - .iter() - .position(|&name| name == column.name) - .and_then(|pos| { - // We should convert Arrow field to the rw data type instead of converting the rw data type to the Arrow data type for comparison. - // The reason is that for the timestamp type, the different time units in Arrow need to match with the timestamp and timestamptz in rw. - let arrow_filed_to_rw_data_type = IcebergArrowConvert - .type_from_field(converted_arrow_schema.field(pos)) - .ok()?; - if arrow_filed_to_rw_data_type == column.data_type { - Some(pos) - } else { - None - } - }) - }) - .collect(); - Ok(valid_column_indices) - } - None => Ok(vec![]), - } -} diff --git a/src/connector/src/source/iceberg/mod.rs b/src/connector/src/source/iceberg/mod.rs index 1dbce9c73d82..ef15d0335881 100644 --- a/src/connector/src/source/iceberg/mod.rs +++ b/src/connector/src/source/iceberg/mod.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-pub mod parquet_file_reader; +pub mod parquet_file_handler; use std::collections::HashMap; @@ -24,7 +24,7 @@ use iceberg::scan::FileScanTask; use iceberg::spec::TableMetadata; use iceberg::table::Table; use itertools::Itertools; -pub use parquet_file_reader::*; +pub use parquet_file_handler::*; use risingwave_common::bail; use risingwave_common::catalog::{Schema, ICEBERG_SEQUENCE_NUM_COLUMN_NAME}; use risingwave_common::types::JsonbVal; diff --git a/src/connector/src/source/iceberg/parquet_file_handler.rs b/src/connector/src/source/iceberg/parquet_file_handler.rs new file mode 100644 index 000000000000..146348545fbc --- /dev/null +++ b/src/connector/src/source/iceberg/parquet_file_handler.rs @@ -0,0 +1,320 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::future::IntoFuture; +use std::ops::Range; +use std::pin::Pin; +use std::sync::Arc; + +use anyhow::anyhow; +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, Stream, TryFutureExt}; +use iceberg::io::{ + FileIOBuilder, FileMetadata, FileRead, S3_ACCESS_KEY_ID, S3_REGION, S3_SECRET_ACCESS_KEY, +}; +use iceberg::{Error, ErrorKind}; +use itertools::Itertools; +use opendal::layers::{LoggingLayer, RetryLayer}; +use opendal::services::S3; +use opendal::Operator; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::{parquet_to_arrow_schema, ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataReader}; +use risingwave_common::array::arrow::IcebergArrowConvert; +use risingwave_common::array::StreamChunk; +use risingwave_common::catalog::ColumnId; +use risingwave_common::util::tokio_util::compat::FuturesAsyncReadCompatExt; +use url::Url; + +use crate::error::ConnectorResult; +use crate::parser::ParquetParser; +use crate::source::{Column, SourceColumnDesc}; + +pub struct ParquetFileReader { + meta: FileMetadata, + r: R, +} + +impl ParquetFileReader { + pub fn new(meta: FileMetadata, r: R) -> Self { + Self { meta, r } + } +} + +impl AsyncFileReader for ParquetFileReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { + Box::pin( + self.r + .read(range.start as _..range.end as _) + .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), + ) + } + + fn get_metadata(&mut self) -> BoxFuture<'_, parquet::errors::Result>> { + async move { + let reader = ParquetMetaDataReader::new(); + let size = self.meta.size as usize; + let meta = reader.load_and_finish(self, size).await?; + + Ok(Arc::new(meta)) + } + .boxed() + } +} + +pub async fn create_parquet_stream_builder( + s3_region: String, + s3_access_key: String, + s3_secret_key: String, + location: String, +) -> Result>, anyhow::Error> { + let mut props = HashMap::new(); + props.insert(S3_REGION, s3_region.clone()); + props.insert(S3_ACCESS_KEY_ID, s3_access_key.clone()); + props.insert(S3_SECRET_ACCESS_KEY, s3_secret_key.clone()); + + let 
file_io_builder = FileIOBuilder::new("s3"); + let file_io = file_io_builder + .with_props(props.into_iter()) + .build() + .map_err(|e| anyhow!(e))?; + let parquet_file = file_io.new_input(&location).map_err(|e| anyhow!(e))?; + + let parquet_metadata = parquet_file.metadata().await.map_err(|e| anyhow!(e))?; + let parquet_reader = parquet_file.reader().await.map_err(|e| anyhow!(e))?; + let parquet_file_reader = ParquetFileReader::new(parquet_metadata, parquet_reader); + + ParquetRecordBatchStreamBuilder::new(parquet_file_reader) + .await + .map_err(|e| anyhow!(e)) +} + +pub fn new_s3_operator( + s3_region: String, + s3_access_key: String, + s3_secret_key: String, + location: String, +) -> ConnectorResult { + // Create s3 builder. + let bucket = extract_bucket(&location); + let mut builder = S3::default().bucket(&bucket).region(&s3_region); + builder = builder.secret_access_key(&s3_access_key); + builder = builder.secret_access_key(&s3_secret_key); + builder = builder.endpoint(&format!( + "https://{}.s3.{}.amazonaws.com", + bucket, s3_region + )); + + builder = builder.disable_config_load(); + + let op: Operator = Operator::new(builder)? + .layer(LoggingLayer::default()) + .layer(RetryLayer::default()) + .finish(); + + Ok(op) +} + +fn extract_bucket(location: &str) -> String { + let prefix = "s3://"; + let start = prefix.len(); + let end = location[start..] + .find('/') + .unwrap_or(location.len() - start); + location[start..start + end].to_string() +} + +pub async fn list_s3_directory( + s3_region: String, + s3_access_key: String, + s3_secret_key: String, + dir: String, +) -> Result, anyhow::Error> { + let url = Url::parse(&dir)?; + let bucket = url.host_str().ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Invalid s3 url: {}, missing bucket", dir), + ) + })?; + + let prefix = format!("s3://{}/", bucket); + if dir.starts_with(&prefix) { + let mut builder = S3::default(); + builder = builder + .region(&s3_region) + .access_key_id(&s3_access_key) + .secret_access_key(&s3_secret_key) + .bucket(bucket); + let op = Operator::new(builder)? + .layer(RetryLayer::default()) + .finish(); + + op.list(&dir[prefix.len()..]) + .await + .map_err(|e| anyhow!(e)) + .map(|list| { + list.into_iter() + .map(|entry| prefix.to_string() + entry.path()) + .collect() + }) + } else { + Err(Error::new( + ErrorKind::DataInvalid, + format!("Invalid s3 url: {}, should start with {}", dir, prefix), + ))? + } +} + +/// Extracts valid column indices from a Parquet file schema based on the user's requested schema. +/// +/// This function is used for column pruning of Parquet files. It calculates the intersection +/// between the columns in the currently read Parquet file and the schema provided by the user. +/// This is useful for reading a `RecordBatch` with the appropriate `ProjectionMask`, ensuring that +/// only the necessary columns are read. +/// +/// # Parameters +/// - `columns`: A vector of `Column` representing the user's requested schema. +/// - `metadata`: A reference to `FileMetaData` containing the schema and metadata of the Parquet file. +/// +/// # Returns +/// - A `ConnectorResult>`, which contains the indices of the valid columns in the +/// Parquet file schema that match the requested schema. If an error occurs during processing, +/// it returns an appropriate error. 
+pub fn extract_valid_column_indices( + columns: Option>, + metadata: &FileMetaData, +) -> ConnectorResult> { + match columns { + Some(rw_columns) => { + let parquet_column_names = metadata + .schema_descr() + .columns() + .iter() + .map(|c| c.name()) + .collect_vec(); + + let converted_arrow_schema = + parquet_to_arrow_schema(metadata.schema_descr(), metadata.key_value_metadata()) + .map_err(anyhow::Error::from)?; + + let valid_column_indices: Vec = rw_columns + .iter() + .filter_map(|column| { + parquet_column_names + .iter() + .position(|&name| name == column.name) + .and_then(|pos| { + let arrow_field = IcebergArrowConvert + .to_arrow_field(&column.name, &column.data_type) + .ok()?; + if &arrow_field == converted_arrow_schema.field(pos) { + Some(pos) + } else { + None + } + }) + }) + .collect(); + Ok(valid_column_indices) + } + None => Ok(vec![]), + } +} + +/// Reads a specified Parquet file and converts its content into a stream of chunks. +pub async fn read_parquet_file( + op: Operator, + file_name: String, + rw_columns: Option>, + parser_columns: Option>, + batch_size: usize, + offset: usize, +) -> ConnectorResult< + Pin> + Send>>, +> { + let mut reader: tokio_util::compat::Compat = op + .reader_with(&file_name) + .into_future() // Unlike `rustc`, `try_stream` seems require manual `into_future`. + .await? + .into_futures_async_read(..) + .await? + .compat(); + let parquet_metadata = reader.get_metadata().await.map_err(anyhow::Error::from)?; + + let file_metadata = parquet_metadata.file_metadata(); + let column_indices = extract_valid_column_indices(rw_columns, file_metadata)?; + let projection_mask = ProjectionMask::leaves(file_metadata.schema_descr(), column_indices); + // For the Parquet format, we directly convert from a record batch to a stream chunk. + // Therefore, the offset of the Parquet file represents the current position in terms of the number of rows read from the file. + let record_batch_stream = ParquetRecordBatchStreamBuilder::new(reader) + .await? + .with_batch_size(batch_size) + .with_projection(projection_mask) + .with_offset(offset) + .build()?; + let converted_arrow_schema = parquet_to_arrow_schema( + file_metadata.schema_descr(), + file_metadata.key_value_metadata(), + ) + .map_err(anyhow::Error::from)?; + let columns = match parser_columns { + Some(columns) => columns, + None => converted_arrow_schema + .fields + .iter() + .enumerate() + .map(|(index, field_ref)| { + let data_type = IcebergArrowConvert.type_from_field(field_ref).unwrap(); + SourceColumnDesc::simple( + field_ref.name().clone(), + data_type, + ColumnId::new(index as i32), + ) + }) + .collect(), + }; + + let parquet_parser = ParquetParser::new(columns, file_name, offset)?; + let msg_stream: Pin< + Box> + Send>, + > = parquet_parser.into_stream(record_batch_stream); + Ok(msg_stream) +} + +pub async fn get_parquet_fields( + op: Operator, + file_name: String, +) -> ConnectorResult { + let mut reader: tokio_util::compat::Compat = op + .reader_with(&file_name) + .into_future() // Unlike `rustc`, `try_stream` seems require manual `into_future`. + .await? + .into_futures_async_read(..) + .await? 
+ .compat(); + let parquet_metadata = reader.get_metadata().await.map_err(anyhow::Error::from)?; + + let file_metadata = parquet_metadata.file_metadata(); + let converted_arrow_schema = parquet_to_arrow_schema( + file_metadata.schema_descr(), + file_metadata.key_value_metadata(), + ) + .map_err(anyhow::Error::from)?; + let fields: risingwave_common::array::arrow::arrow_schema_udf::Fields = + converted_arrow_schema.fields; + Ok(fields) +} diff --git a/src/connector/src/source/iceberg/parquet_file_reader.rs b/src/connector/src/source/iceberg/parquet_file_reader.rs deleted file mode 100644 index 5a01f2b0ed84..000000000000 --- a/src/connector/src/source/iceberg/parquet_file_reader.rs +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2024 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::ops::Range; -use std::sync::Arc; - -use anyhow::anyhow; -use bytes::Bytes; -use futures::future::BoxFuture; -use futures::{FutureExt, TryFutureExt}; -use iceberg::io::{ - FileIOBuilder, FileMetadata, FileRead, S3_ACCESS_KEY_ID, S3_REGION, S3_SECRET_ACCESS_KEY, -}; -use iceberg::{Error, ErrorKind}; -use opendal::layers::RetryLayer; -use opendal::services::S3; -use opendal::Operator; -use parquet::arrow::async_reader::AsyncFileReader; -use parquet::arrow::ParquetRecordBatchStreamBuilder; -use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; -use url::Url; - -pub struct ParquetFileReader { - meta: FileMetadata, - r: R, -} - -impl ParquetFileReader { - pub fn new(meta: FileMetadata, r: R) -> Self { - Self { meta, r } - } -} - -impl AsyncFileReader for ParquetFileReader { - fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { - Box::pin( - self.r - .read(range.start as _..range.end as _) - .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), - ) - } - - fn get_metadata(&mut self) -> BoxFuture<'_, parquet::errors::Result>> { - async move { - let reader = ParquetMetaDataReader::new(); - let size = self.meta.size as usize; - let meta = reader.load_and_finish(self, size).await?; - - Ok(Arc::new(meta)) - } - .boxed() - } -} - -pub async fn create_parquet_stream_builder( - s3_region: String, - s3_access_key: String, - s3_secret_key: String, - location: String, -) -> Result>, anyhow::Error> { - let mut props = HashMap::new(); - props.insert(S3_REGION, s3_region.clone()); - props.insert(S3_ACCESS_KEY_ID, s3_access_key.clone()); - props.insert(S3_SECRET_ACCESS_KEY, s3_secret_key.clone()); - - let file_io_builder = FileIOBuilder::new("s3"); - let file_io = file_io_builder - .with_props(props.into_iter()) - .build() - .map_err(|e| anyhow!(e))?; - let parquet_file = file_io.new_input(&location).map_err(|e| anyhow!(e))?; - - let parquet_metadata = parquet_file.metadata().await.map_err(|e| anyhow!(e))?; - let parquet_reader = parquet_file.reader().await.map_err(|e| anyhow!(e))?; - let parquet_file_reader = ParquetFileReader::new(parquet_metadata, parquet_reader); - - 
ParquetRecordBatchStreamBuilder::new(parquet_file_reader) - .await - .map_err(|e| anyhow!(e)) -} - -pub async fn list_s3_directory( - s3_region: String, - s3_access_key: String, - s3_secret_key: String, - dir: String, -) -> Result, anyhow::Error> { - let url = Url::parse(&dir)?; - let bucket = url.host_str().ok_or_else(|| { - Error::new( - ErrorKind::DataInvalid, - format!("Invalid s3 url: {}, missing bucket", dir), - ) - })?; - - let prefix = format!("s3://{}/", bucket); - if dir.starts_with(&prefix) { - let mut builder = S3::default(); - builder = builder - .region(&s3_region) - .access_key_id(&s3_access_key) - .secret_access_key(&s3_secret_key) - .bucket(bucket); - let op = Operator::new(builder)? - .layer(RetryLayer::default()) - .finish(); - - op.list(&dir[prefix.len()..]) - .await - .map_err(|e| anyhow!(e)) - .map(|list| { - list.into_iter() - .map(|entry| prefix.to_string() + entry.path()) - .collect() - }) - } else { - Err(Error::new( - ErrorKind::DataInvalid, - format!("Invalid s3 url: {}, should start with {}", dir, prefix), - ))? - } -} diff --git a/src/connector/src/source/kafka/enumerator/client.rs b/src/connector/src/source/kafka/enumerator/client.rs index 1d7525bc7a61..541c3757c27c 100644 --- a/src/connector/src/source/kafka/enumerator/client.rs +++ b/src/connector/src/source/kafka/enumerator/client.rs @@ -191,7 +191,6 @@ impl SplitEnumerator for KafkaSplitEnumerator { partition, start_offset: start_offsets.remove(&partition).unwrap(), stop_offset: stop_offsets.remove(&partition).unwrap(), - hack_seek_to_latest: false, }) .collect(); @@ -299,7 +298,6 @@ impl KafkaSplitEnumerator { partition: *partition, start_offset: Some(start_offset), stop_offset: Some(stop_offset), - hack_seek_to_latest:false } }) .collect::>()) @@ -419,7 +417,7 @@ impl KafkaSplitEnumerator { pub async fn check_reachability(&self) -> bool { self.client - .fetch_metadata(None, self.sync_call_timeout) + .fetch_metadata(Some(self.topic.as_str()), self.sync_call_timeout) .await .is_ok() } diff --git a/src/connector/src/source/kafka/source/reader.rs b/src/connector/src/source/kafka/source/reader.rs index b9523eca98b5..20fde897ceb4 100644 --- a/src/connector/src/source/kafka/source/reader.rs +++ b/src/connector/src/source/kafka/source/reader.rs @@ -37,13 +37,15 @@ use crate::source::kafka::{ }; use crate::source::{ into_chunk_stream, BackfillInfo, BoxChunkSourceStream, Column, SourceContextRef, SplitId, - SplitMetaData, SplitReader, + SplitImpl, SplitMetaData, SplitReader, }; pub struct KafkaSplitReader { consumer: StreamConsumer, offsets: HashMap, Option)>, backfill_info: HashMap, + splits: Vec, + sync_call_timeout: Duration, bytes_per_second: usize, max_num_messages: usize, parser_config: ParserConfig, @@ -110,12 +112,10 @@ impl SplitReader for KafkaSplitReader { let mut offsets = HashMap::new(); let mut backfill_info = HashMap::new(); - for split in splits { + for split in splits.clone() { offsets.insert(split.id(), (split.start_offset, split.stop_offset)); - if split.hack_seek_to_latest { - tpl.add_partition_offset(split.topic.as_str(), split.partition, Offset::End)?; - } else if let Some(offset) = split.start_offset { + if let Some(offset) = split.start_offset { tpl.add_partition_offset( split.topic.as_str(), split.partition, @@ -168,8 +168,10 @@ impl SplitReader for KafkaSplitReader { Ok(Self { consumer, offsets, + splits, backfill_info, bytes_per_second, + sync_call_timeout: properties.common.sync_call_timeout, max_num_messages, parser_config, source_ctx, @@ -185,6 +187,28 @@ impl SplitReader for 
KafkaSplitReader { fn backfill_info(&self) -> HashMap { self.backfill_info.clone() } + + async fn seek_to_latest(&mut self) -> Result> { + let mut latest_splits: Vec = Vec::new(); + let mut tpl = TopicPartitionList::with_capacity(self.splits.len()); + for mut split in self.splits.clone() { + // we can't get latest offset if we use Offset::End, so we just fetch watermark here. + let (_low, high) = self + .consumer + .fetch_watermarks( + split.topic.as_str(), + split.partition, + self.sync_call_timeout, + ) + .await?; + tpl.add_partition_offset(split.topic.as_str(), split.partition, Offset::Offset(high))?; + split.start_offset = Some(high - 1); + latest_splits.push(split.into()); + } + // replace the previous assignment + self.consumer.assign(&tpl)?; + Ok(latest_splits) + } } impl KafkaSplitReader { diff --git a/src/connector/src/source/kafka/split.rs b/src/connector/src/source/kafka/split.rs index 791836ac2c85..fa969bb37111 100644 --- a/src/connector/src/source/kafka/split.rs +++ b/src/connector/src/source/kafka/split.rs @@ -32,12 +32,6 @@ pub struct KafkaSplit { /// A better approach would be to make it **inclusive**. pub(crate) start_offset: Option, pub(crate) stop_offset: Option, - #[serde(skip)] - /// Used by shared source to hackily seek to the latest offset without fetching start offset first. - /// XXX: But why do we fetch low watermark for latest start offset..? - /// - /// When this is `true`, `start_offset` will be ignored. - pub(crate) hack_seek_to_latest: bool, } impl SplitMetaData for KafkaSplit { @@ -72,16 +66,10 @@ impl KafkaSplit { partition, start_offset, stop_offset, - hack_seek_to_latest: false, } } pub fn get_topic_and_partition(&self) -> (String, i32) { (self.topic.clone(), self.partition) } - - /// This should only be used for a fresh split, not persisted in state table yet. - pub fn seek_to_latest_offset(&mut self) { - self.hack_seek_to_latest = true; - } } diff --git a/src/connector/src/source/reader/reader.rs b/src/connector/src/source/reader/reader.rs index 89335f8f0d80..f849e7ba21aa 100644 --- a/src/connector/src/source/reader/reader.rs +++ b/src/connector/src/source/reader/reader.rs @@ -37,8 +37,8 @@ use crate::source::filesystem::opendal_source::{ use crate::source::filesystem::{FsPageItem, OpendalFsSplit}; use crate::source::{ create_split_reader, BackfillInfo, BoxChunkSourceStream, BoxTryStream, Column, - ConnectorProperties, ConnectorState, SourceColumnDesc, SourceContext, SplitId, SplitReader, - WaitCheckpointTask, + ConnectorProperties, ConnectorState, SourceColumnDesc, SourceContext, SplitId, SplitImpl, + SplitReader, WaitCheckpointTask, }; use crate::{dispatch_source_prop, WithOptionsSecResolved}; @@ -211,14 +211,17 @@ impl SourceReader { } /// Build `SplitReader`s and then `BoxChunkSourceStream` from the given `ConnectorState` (`SplitImpl`s). + /// + /// If `seek_to_latest` is true, will also return the latest splits after seek. 
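
To make that contract concrete, here is a hypothetical wrapper around the updated call. This is a sketch only: the `open_stream_at_latest` name and the import paths are assumptions for illustration rather than a verified public API, though the types themselves all appear in the diff above.

```rust
use std::sync::Arc;

// Assumed paths: the connector crate re-exports these per the `use` lists in
// the diff, but the exact public module layout is not verified here.
use risingwave_common::catalog::ColumnId;
use risingwave_connector::error::ConnectorResult;
use risingwave_connector::source::reader::reader::SourceReader;
use risingwave_connector::source::{BoxChunkSourceStream, SourceContext, SplitImpl};

async fn open_stream_at_latest(
    reader: &SourceReader,
    splits: Vec<SplitImpl>,
    column_ids: Vec<ColumnId>,
    source_ctx: Arc<SourceContext>,
) -> ConnectorResult<(BoxChunkSourceStream, Option<Vec<SplitImpl>>)> {
    // Passing `seek_to_latest = true` makes every split reader jump to the
    // newest offset first (for Kafka this fetches the per-partition high
    // watermark) and report the splits it actually positioned itself at,
    // so the caller can persist that refreshed state instead of the
    // originally assigned offsets.
    reader
        .build_stream(Some(splits), column_ids, source_ctx, true)
        .await
}
```

This is also what replaces the removed `hack_seek_to_latest` flag on `KafkaSplit`: the seek now happens explicitly through `seek_to_latest` and its returned splits.
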
pub async fn build_stream( &self, state: ConnectorState, column_ids: Vec, source_ctx: Arc, - ) -> ConnectorResult { + seek_to_latest: bool, + ) -> ConnectorResult<(BoxChunkSourceStream, Option>)> { let Some(splits) = state else { - return Ok(pending().boxed()); + return Ok((pending().boxed(), None)); }; let config = self.config.clone(); let columns = self.get_target_columns(column_ids)?; @@ -243,7 +246,7 @@ impl SourceReader { let support_multiple_splits = config.support_multiple_splits(); dispatch_source_prop!(config, prop, { - let readers = if support_multiple_splits { + let mut readers = if support_multiple_splits { tracing::debug!( "spawning connector split reader for multiple splits {:?}", splits @@ -268,7 +271,20 @@ impl SourceReader { .await? }; - Ok(select_all(readers.into_iter().map(|r| r.into_stream())).boxed()) + let latest_splits = if seek_to_latest { + let mut latest_splits = Vec::new(); + for reader in &mut readers { + latest_splits.extend(reader.seek_to_latest().await?); + } + Some(latest_splits) + } else { + None + }; + + Ok(( + select_all(readers.into_iter().map(|r| r.into_stream())).boxed(), + latest_splits, + )) }) } } diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index 31579dfd7032..88f2e64cce2e 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -412,6 +412,30 @@ IcebergConfig: field_type: String comments: Full name of table, must include schema name. required: true + - name: catalog.credential + field_type: String + comments: |- + Credential for accessing iceberg catalog, only applicable in rest catalog. + A credential to exchange for a token in the OAuth2 client credentials flow. + required: false + - name: catalog.token + field_type: String + comments: |- + token for accessing iceberg catalog, only applicable in rest catalog. + A Bearer token which will be used for interaction with the server. + required: false + - name: catalog.oauth2-server-uri + field_type: String + comments: |- + `oauth2-server-uri` for accessing iceberg catalog, only applicable in rest catalog. + Token endpoint URI to fetch token from if the Rest Catalog is not the authorization server. + required: false + - name: catalog.scope + field_type: String + comments: |- + scope for accessing iceberg catalog, only applicable in rest catalog. + Additional scope for OAuth2. + required: false - name: s3.path.style.access field_type: bool required: false diff --git a/src/connector/with_options_source.yaml b/src/connector/with_options_source.yaml index 75972546b299..f1db0a427673 100644 --- a/src/connector/with_options_source.yaml +++ b/src/connector/with_options_source.yaml @@ -117,6 +117,30 @@ IcebergProperties: field_type: String comments: Full name of table, must include schema name. required: true + - name: catalog.credential + field_type: String + comments: |- + Credential for accessing iceberg catalog, only applicable in rest catalog. + A credential to exchange for a token in the OAuth2 client credentials flow. + required: false + - name: catalog.token + field_type: String + comments: |- + token for accessing iceberg catalog, only applicable in rest catalog. + A Bearer token which will be used for interaction with the server. + required: false + - name: catalog.oauth2-server-uri + field_type: String + comments: |- + `oauth2-server-uri` for accessing iceberg catalog, only applicable in rest catalog. + Token endpoint URI to fetch token from if the Rest Catalog is not the authorization server. 
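
The four `catalog.*` options documented here all feed the REST catalog's OAuth2 client-credentials flow. A rough sketch of how they fit together as plain key/value properties follows; it is illustrative only, as the map layout, the sample values, and the helper name are assumptions rather than the connector's actual config struct, and only the key names come from the entries above.

```rust
use std::collections::HashMap;

// Hypothetical helper: gathers the REST-catalog OAuth2 options documented above.
fn rest_catalog_oauth_props() -> HashMap<String, String> {
    HashMap::from([
        // Credential exchanged for a token via the OAuth2 client-credentials flow.
        (
            "catalog.credential".to_owned(),
            "my-client-id:my-client-secret".to_owned(),
        ),
        // Token endpoint to call when the REST catalog is not the authorization server.
        (
            "catalog.oauth2-server-uri".to_owned(),
            "https://auth.example.com/oauth/token".to_owned(),
        ),
        // Additional OAuth2 scope requested alongside the token.
        ("catalog.scope".to_owned(), "catalog".to_owned()),
        // Alternatively, `catalog.token` supplies a pre-issued Bearer token
        // directly, bypassing the credential exchange.
    ])
}
```

As the option comments state, all four settings apply only when the REST catalog is in use; typically either `catalog.credential` or `catalog.token` is supplied, not both.
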
+ required: false + - name: catalog.scope + field_type: String + comments: |- + scope for accessing iceberg catalog, only applicable in rest catalog. + Additional scope for OAuth2. + required: false - name: s3.path.style.access field_type: bool required: false diff --git a/src/expr/core/src/aggregate/user_defined.rs b/src/expr/core/src/aggregate/user_defined.rs index 2f4fdc5f9f9c..cba83d4f439e 100644 --- a/src/expr/core/src/aggregate/user_defined.rs +++ b/src/expr/core/src/aggregate/user_defined.rs @@ -138,7 +138,6 @@ pub fn new_user_defined( arg_names: &udf.arg_names, return_type, always_retry_on_network_error: false, - function_type: udf.function_type.as_deref(), }) .context("failed to build UDF runtime")?; diff --git a/src/expr/core/src/expr/expr_udf.rs b/src/expr/core/src/expr/expr_udf.rs index 6ae27dabb245..41c9257036e7 100644 --- a/src/expr/core/src/expr/expr_udf.rs +++ b/src/expr/core/src/expr/expr_udf.rs @@ -185,7 +185,6 @@ impl Build for UserDefinedFunction { arg_names: &udf.arg_names, return_type: &return_type, always_retry_on_network_error: udf.always_retry_on_network_error, - function_type: udf.function_type.as_deref(), }) .context("failed to build UDF runtime")?; diff --git a/src/expr/core/src/sig/udf.rs b/src/expr/core/src/sig/udf.rs index 047879b9192b..e8aa1c3efdf1 100644 --- a/src/expr/core/src/sig/udf.rs +++ b/src/expr/core/src/sig/udf.rs @@ -105,7 +105,6 @@ pub struct UdfOptions<'a> { pub arg_names: &'a [String], pub return_type: &'a DataType, pub always_retry_on_network_error: bool, - pub function_type: Option<&'a str>, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumAsInner)] diff --git a/src/expr/core/src/table_function/user_defined.rs b/src/expr/core/src/table_function/user_defined.rs index b490e9b023af..826369666944 100644 --- a/src/expr/core/src/table_function/user_defined.rs +++ b/src/expr/core/src/table_function/user_defined.rs @@ -140,7 +140,6 @@ pub fn new_user_defined(prost: &PbTableFunction, chunk_size: usize) -> Result 1 [ ] + 1 -> 2 [ ] + } +- name: test dot output format (batch) + sql: | + CREATE TABLE t (v1 int); + explain (physical, format dot) SELECT approx_percentile(0.5) WITHIN GROUP (order by v1) from t; + explain_output: | + digraph { + 0 [ label = "BatchSimpleAgg\lid: \"3\"\laggs: [\"approx_percentile($expr1)\"]\l" ] + 1 [ label = "BatchExchange\lid: \"2\"\lorder: []\ldist: \"Single\"\l" ] + 2 [ label = "BatchProject\lid: \"1\"\lexprs: [\"t.v1::Float64 as $expr1\"]\l" ] + 3 [ label = "BatchScan\lid: \"10033\"\ltable: \"t\"\lcolumns: [\"v1\"]\l" ] + 0 -> 1 [ ] + 1 -> 2 [ ] + 2 -> 3 [ ] + } +- name: test dot output format (stream) + sql: | + CREATE TABLE t (v1 int); + explain (physical, format dot) create materialized view m1 as SELECT approx_percentile(0.5) WITHIN GROUP (order by v1) from t; + explain_output: | + digraph { + 0 [ label = "StreamMaterialize\lid: \"3\"\lcolumns: [\"approx_percentile\"]\lstream_key: []\lpk_columns: []\lpk_conflict: \"NoCheck\"\l" ] + 1 [ label = "StreamGlobalApproxPercentile\lid: \"10046\"\lquantile: \"0.5:Float64\"\lrelative_error: \"0.01:Float64\"\l" ] + 2 [ label = "StreamExchange\lid: \"2\"\ldist: \"Single\"\l" ] + 3 [ label = "StreamLocalApproxPercentile\lid: \"10044\"\lpercentile_col: \"$expr1\"\lquantile: \"0.5:Float64\"\lrelative_error: \"0.01:Float64\"\l" ] + 4 [ label = "StreamProject\lid: \"1\"\lexprs: [\"t.v1::Float64 as $expr1\",\"t._row_id\"]\l" ] + 5 [ label = "StreamTableScan\lid: \"10049\"\ltable: \"t\"\lcolumns: [\"v1\",\"_row_id\"]\l" ] + 0 -> 1 [ ] + 1 -> 2 [ ] + 2 -> 3 [ ] + 3 -> 4 [ ] + 4 -> 
5 [ ] + } +- name: test long dot output format (stream) + sql: | + create table t1(a int, b int); + create table t2(c int primary key, d int); + explain (physical, format dot) create materialized view m1 as SELECT + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col1, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col2, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col3, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col4, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col5, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col6, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col7, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col8, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col9, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col10, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col11, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col12, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col13, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col14, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col15, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col16, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col17, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col18 + from t1; + explain_output: | + digraph { + 0 [ label = "StreamMaterialize\lid: \"147\"\lcolumns: [\"col1\",\"col2\",\"col3\",\"col4\",\"col5\",\"col6\",\"col7\",\"col8\",\"col9\",\"col10\",\"col11\",\"col12\",\"col13\",\"col14\",\"col15\",\"col16\",\"col17\",\"col18\",\"t1._row_id(hidden)\",\"t1.b(hidden)\",\"t1.a(hidden)\",\"t1.b#1(hidden)\",\"t1.b#2(hidden)\",\"t1.b#3(hidden)\",\"t1.b#4(hidden)\",\"t1.b#5(hidden)\",\"t1.b#6(hidden)\",\"t1.b#7(hidden)\",\"t1.b#8(hidden)\",\"t1.b#9(hidden)\",\"t1.b#10(hidden)\",\"t1.b#11(hidden)\",\"t1.b#12(hidden)\",\"t1.b#13(hidden)\",\"t1.b#14(hidden)\",\"t1.b#15(hidden)\",\"t1.b#16(hidden)\",\"t1.b#17(hidden)\",\"t1.b#18(hidden)\"]\lstream_key: [\"t1._row_id\",\"t1.b\",\"t1.a\",\"t1.b#1\",\"t1.b#2\",\"t1.b#3\",\"t1.b#4\",\"t1.b#5\",\"t1.b#6\",\"t1.b#7\",\"t1.b#8\",\"t1.b#9\",\"t1.b#10\",\"t1.b#11\",\"t1.b#12\",\"t1.b#13\",\"t1.b#14\",\"t1.b#15\",\"t1.b#16\",\"t1.b#17\",\"t1.b#18\"]\lpk_columns: [\"t1._row_id\",\"t1.b\",\"t1.a\",\"t1.b#1\",\"t1.b#2\",\"t1.b#3\",\"t1.b#4\",\"t1.b#5\",\"t1.b#6\",\"t1.b#7\",\"t1.b#8\",\"t1.b#9\",\"t1.b#10\",\"t1.b#11\",\"t1.b#12\",\"t1.b#13\",\"t1.b#14\",\"t1.b#15\",\"t1.b#16\",\"t1.b#17\",\"t1.b#18\"]\lpk_conflict: \"NoCheck\"\l" ] + 1 [ label = "StreamProject\lid: \"146\"\lexprs: [\"Coalesce(t1.b, 0:Int32) as $expr1\",\"Coalesce(t1.b, 0:Int32) as $expr2\",\"Coalesce(t1.b, 0:Int32) as $expr3\",\"Coalesce(t1.b, 0:Int32) as $expr4\",\"Coalesce(t1.b, 0:Int32) as $expr5\",\"Coalesce(t1.b, 0:Int32) as $expr6\",\"Coalesce(t1.b, 0:Int32) as $expr7\",\"Coalesce(t1.b, 0:Int32) as $expr8\",\"Coalesce(t1.b, 0:Int32) as $expr9\",\"Coalesce(t1.b, 0:Int32) as $expr10\",\"Coalesce(t1.b, 0:Int32) as $expr11\",\"Coalesce(t1.b, 0:Int32) as $expr12\",\"Coalesce(t1.b, 0:Int32) as $expr13\",\"Coalesce(t1.b, 0:Int32) as $expr14\",\"Coalesce(t1.b, 0:Int32) as $expr15\",\"Coalesce(t1.b, 0:Int32) as $expr16\",\"Coalesce(t1.b, 0:Int32) as $expr17\",\"Coalesce(t1.b, 0:Int32) as $expr18\",\"t1._row_id\",\"t1.b\",\"t1.a\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\",\"t1.b\"]\l" ] + 2 [ label = "StreamHashJoin\lid: \"145\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS 
NOT DISTINCT FROM t1.b\"\l" ] + 3 [ label = "StreamHashJoin\lid: \"137\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 4 [ label = "StreamHashJoin\lid: \"129\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 5 [ label = "StreamHashJoin\lid: \"121\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 6 [ label = "StreamHashJoin\lid: \"113\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 7 [ label = "StreamHashJoin\lid: \"105\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 8 [ label = "StreamHashJoin\lid: \"97\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 9 [ label = "StreamHashJoin\lid: \"89\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 10 [ label = "StreamHashJoin\lid: \"81\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 11 [ label = "StreamHashJoin\lid: \"73\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 12 [ label = "StreamHashJoin\lid: \"65\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 13 [ label = "StreamHashJoin\lid: \"57\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 14 [ label = "StreamHashJoin\lid: \"49\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 15 [ label = "StreamHashJoin\lid: \"41\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 16 [ label = "StreamHashJoin\lid: \"33\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 17 [ label = "StreamHashJoin\lid: \"25\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 18 [ label = "StreamHashJoin\lid: \"17\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 19 [ label = "StreamHashJoin\lid: \"9\"\ltype: \"LeftOuter\"\lpredicate: \"t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b\"\l" ] + 20 [ label = "StreamExchange\lid: \"1\"\ldist: \"HashShard(t1.a)\"\l" ] + 21 [ label = "StreamTableScan\lid: \"12914\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 22 [ label = "StreamProject\lid: \"8\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 23 [ label = "StreamHashJoin\lid: \"7\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 24 [ label = "StreamExchange\lid: \"5\"\ldist: \"HashShard(t1.a)\"\l" ] + 25 [ label = "StreamProject\lid: \"4\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 26 [ label = "StreamHashAgg\lid: \"3\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 27 [ label = "StreamExchange\lid: \"2\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 28 [ label = "StreamTableScan\lid: \"12921\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 29 [ label = "StreamExchange\lid: \"6\"\ldist: \"HashShard(t2.c)\"\l" ] + 30 [ label = "StreamTableScan\lid: \"12927\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 31 [ label = 
"StreamProject\lid: \"16\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 32 [ label = "StreamHashJoin\lid: \"15\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 33 [ label = "StreamExchange\lid: \"13\"\ldist: \"HashShard(t1.a)\"\l" ] + 34 [ label = "StreamProject\lid: \"12\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 35 [ label = "StreamHashAgg\lid: \"11\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 36 [ label = "StreamExchange\lid: \"10\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 37 [ label = "StreamTableScan\lid: \"12937\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 38 [ label = "StreamExchange\lid: \"14\"\ldist: \"HashShard(t2.c)\"\l" ] + 39 [ label = "StreamTableScan\lid: \"12943\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 40 [ label = "StreamProject\lid: \"24\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 41 [ label = "StreamHashJoin\lid: \"23\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 42 [ label = "StreamExchange\lid: \"21\"\ldist: \"HashShard(t1.a)\"\l" ] + 43 [ label = "StreamProject\lid: \"20\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 44 [ label = "StreamHashAgg\lid: \"19\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 45 [ label = "StreamExchange\lid: \"18\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 46 [ label = "StreamTableScan\lid: \"12953\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 47 [ label = "StreamExchange\lid: \"22\"\ldist: \"HashShard(t2.c)\"\l" ] + 48 [ label = "StreamTableScan\lid: \"12959\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 49 [ label = "StreamProject\lid: \"32\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 50 [ label = "StreamHashJoin\lid: \"31\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 51 [ label = "StreamExchange\lid: \"29\"\ldist: \"HashShard(t1.a)\"\l" ] + 52 [ label = "StreamProject\lid: \"28\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 53 [ label = "StreamHashAgg\lid: \"27\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 54 [ label = "StreamExchange\lid: \"26\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 55 [ label = "StreamTableScan\lid: \"12969\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 56 [ label = "StreamExchange\lid: \"30\"\ldist: \"HashShard(t2.c)\"\l" ] + 57 [ label = "StreamTableScan\lid: \"12975\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 58 [ label = "StreamProject\lid: \"40\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 59 [ label = "StreamHashJoin\lid: \"39\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 60 [ label = "StreamExchange\lid: \"37\"\ldist: \"HashShard(t1.a)\"\l" ] + 61 [ label = "StreamProject\lid: \"36\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 62 [ label = "StreamHashAgg\lid: \"35\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 63 [ label = "StreamExchange\lid: \"34\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 64 [ label = "StreamTableScan\lid: \"12985\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 65 [ label = "StreamExchange\lid: \"38\"\ldist: \"HashShard(t2.c)\"\l" ] + 66 [ label = "StreamTableScan\lid: \"12991\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 67 [ label = "StreamProject\lid: \"48\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 68 [ label = "StreamHashJoin\lid: \"47\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 69 [ label = "StreamExchange\lid: \"45\"\ldist: \"HashShard(t1.a)\"\l" ] + 70 [ label = "StreamProject\lid: \"44\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 71 [ label = "StreamHashAgg\lid: \"43\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 72 [ label = "StreamExchange\lid: \"42\"\ldist: \"HashShard(t1.a, 
t1.b)\"\l" ] + 73 [ label = "StreamTableScan\lid: \"13001\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 74 [ label = "StreamExchange\lid: \"46\"\ldist: \"HashShard(t2.c)\"\l" ] + 75 [ label = "StreamTableScan\lid: \"13007\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 76 [ label = "StreamProject\lid: \"56\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 77 [ label = "StreamHashJoin\lid: \"55\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 78 [ label = "StreamExchange\lid: \"53\"\ldist: \"HashShard(t1.a)\"\l" ] + 79 [ label = "StreamProject\lid: \"52\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 80 [ label = "StreamHashAgg\lid: \"51\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 81 [ label = "StreamExchange\lid: \"50\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 82 [ label = "StreamTableScan\lid: \"13017\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 83 [ label = "StreamExchange\lid: \"54\"\ldist: \"HashShard(t2.c)\"\l" ] + 84 [ label = "StreamTableScan\lid: \"13023\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 85 [ label = "StreamProject\lid: \"64\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 86 [ label = "StreamHashJoin\lid: \"63\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 87 [ label = "StreamExchange\lid: \"61\"\ldist: \"HashShard(t1.a)\"\l" ] + 88 [ label = "StreamProject\lid: \"60\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 89 [ label = "StreamHashAgg\lid: \"59\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 90 [ label = "StreamExchange\lid: \"58\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 91 [ label = "StreamTableScan\lid: \"13033\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 92 [ label = "StreamExchange\lid: \"62\"\ldist: \"HashShard(t2.c)\"\l" ] + 93 [ label = "StreamTableScan\lid: \"13039\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 94 [ label = "StreamProject\lid: \"72\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 95 [ label = "StreamHashJoin\lid: \"71\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 96 [ label = "StreamExchange\lid: \"69\"\ldist: \"HashShard(t1.a)\"\l" ] + 97 [ label = "StreamProject\lid: \"68\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 98 [ label = "StreamHashAgg\lid: \"67\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 99 [ label = "StreamExchange\lid: \"66\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 100 [ label = "StreamTableScan\lid: \"13049\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 101 [ label = "StreamExchange\lid: \"70\"\ldist: \"HashShard(t2.c)\"\l" ] + 102 [ label = "StreamTableScan\lid: \"13055\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 103 [ label = "StreamProject\lid: \"80\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 104 [ label = "StreamHashJoin\lid: \"79\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 105 [ label = "StreamExchange\lid: \"77\"\ldist: \"HashShard(t1.a)\"\l" ] + 106 [ label = "StreamProject\lid: \"76\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 107 [ label = "StreamHashAgg\lid: \"75\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 108 [ label = "StreamExchange\lid: \"74\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 109 [ label = "StreamTableScan\lid: \"13065\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 110 [ label = "StreamExchange\lid: \"78\"\ldist: \"HashShard(t2.c)\"\l" ] + 111 [ label = "StreamTableScan\lid: \"13071\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 112 [ label = "StreamProject\lid: \"88\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 113 [ label = "StreamHashJoin\lid: \"87\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 114 [ 
label = "StreamExchange\lid: \"85\"\ldist: \"HashShard(t1.a)\"\l" ] + 115 [ label = "StreamProject\lid: \"84\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 116 [ label = "StreamHashAgg\lid: \"83\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 117 [ label = "StreamExchange\lid: \"82\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 118 [ label = "StreamTableScan\lid: \"13081\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 119 [ label = "StreamExchange\lid: \"86\"\ldist: \"HashShard(t2.c)\"\l" ] + 120 [ label = "StreamTableScan\lid: \"13087\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 121 [ label = "StreamProject\lid: \"96\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 122 [ label = "StreamHashJoin\lid: \"95\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 123 [ label = "StreamExchange\lid: \"93\"\ldist: \"HashShard(t1.a)\"\l" ] + 124 [ label = "StreamProject\lid: \"92\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 125 [ label = "StreamHashAgg\lid: \"91\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 126 [ label = "StreamExchange\lid: \"90\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 127 [ label = "StreamTableScan\lid: \"13097\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 128 [ label = "StreamExchange\lid: \"94\"\ldist: \"HashShard(t2.c)\"\l" ] + 129 [ label = "StreamTableScan\lid: \"13103\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 130 [ label = "StreamProject\lid: \"104\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 131 [ label = "StreamHashJoin\lid: \"103\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 132 [ label = "StreamExchange\lid: \"101\"\ldist: \"HashShard(t1.a)\"\l" ] + 133 [ label = "StreamProject\lid: \"100\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 134 [ label = "StreamHashAgg\lid: \"99\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 135 [ label = "StreamExchange\lid: \"98\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 136 [ label = "StreamTableScan\lid: \"13113\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 137 [ label = "StreamExchange\lid: \"102\"\ldist: \"HashShard(t2.c)\"\l" ] + 138 [ label = "StreamTableScan\lid: \"13119\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 139 [ label = "StreamProject\lid: \"112\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 140 [ label = "StreamHashJoin\lid: \"111\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 141 [ label = "StreamExchange\lid: \"109\"\ldist: \"HashShard(t1.a)\"\l" ] + 142 [ label = "StreamProject\lid: \"108\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 143 [ label = "StreamHashAgg\lid: \"107\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 144 [ label = "StreamExchange\lid: \"106\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 145 [ label = "StreamTableScan\lid: \"13129\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 146 [ label = "StreamExchange\lid: \"110\"\ldist: \"HashShard(t2.c)\"\l" ] + 147 [ label = "StreamTableScan\lid: \"13135\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 148 [ label = "StreamProject\lid: \"120\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 149 [ label = "StreamHashJoin\lid: \"119\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 150 [ label = "StreamExchange\lid: \"117\"\ldist: \"HashShard(t1.a)\"\l" ] + 151 [ label = "StreamProject\lid: \"116\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 152 [ label = "StreamHashAgg\lid: \"115\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 153 [ label = "StreamExchange\lid: \"114\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 154 [ label = "StreamTableScan\lid: \"13145\"\ltable: \"t1\"\lcolumns: 
[\"a\",\"b\",\"_row_id\"]\l" ] + 155 [ label = "StreamExchange\lid: \"118\"\ldist: \"HashShard(t2.c)\"\l" ] + 156 [ label = "StreamTableScan\lid: \"13151\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 157 [ label = "StreamProject\lid: \"128\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 158 [ label = "StreamHashJoin\lid: \"127\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 159 [ label = "StreamExchange\lid: \"125\"\ldist: \"HashShard(t1.a)\"\l" ] + 160 [ label = "StreamProject\lid: \"124\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 161 [ label = "StreamHashAgg\lid: \"123\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 162 [ label = "StreamExchange\lid: \"122\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 163 [ label = "StreamTableScan\lid: \"13161\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 164 [ label = "StreamExchange\lid: \"126\"\ldist: \"HashShard(t2.c)\"\l" ] + 165 [ label = "StreamTableScan\lid: \"13167\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 166 [ label = "StreamProject\lid: \"136\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 167 [ label = "StreamHashJoin\lid: \"135\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 168 [ label = "StreamExchange\lid: \"133\"\ldist: \"HashShard(t1.a)\"\l" ] + 169 [ label = "StreamProject\lid: \"132\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 170 [ label = "StreamHashAgg\lid: \"131\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 171 [ label = "StreamExchange\lid: \"130\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 172 [ label = "StreamTableScan\lid: \"13177\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 173 [ label = "StreamExchange\lid: \"134\"\ldist: \"HashShard(t2.c)\"\l" ] + 174 [ label = "StreamTableScan\lid: \"13183\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 175 [ label = "StreamProject\lid: \"144\"\lexprs: [\"t1.a\",\"t1.b\",\"t1.b\"]\l" ] + 176 [ label = "StreamHashJoin\lid: \"143\"\ltype: \"Inner\"\lpredicate: \"t1.a = t2.c\"\l" ] + 177 [ label = "StreamExchange\lid: \"141\"\ldist: \"HashShard(t1.a)\"\l" ] + 178 [ label = "StreamProject\lid: \"140\"\lexprs: [\"t1.a\",\"t1.b\"]\l" ] + 179 [ label = "StreamHashAgg\lid: \"139\"\lgroup_key: [\"t1.a\",\"t1.b\"]\laggs: [\"count\"]\l" ] + 180 [ label = "StreamExchange\lid: \"138\"\ldist: \"HashShard(t1.a, t1.b)\"\l" ] + 181 [ label = "StreamTableScan\lid: \"13193\"\ltable: \"t1\"\lcolumns: [\"a\",\"b\",\"_row_id\"]\l" ] + 182 [ label = "StreamExchange\lid: \"142\"\ldist: \"HashShard(t2.c)\"\l" ] + 183 [ label = "StreamTableScan\lid: \"13199\"\ltable: \"t2\"\lcolumns: [\"c\"]\l" ] + 0 -> 1 [ ] + 1 -> 2 [ ] + 2 -> 3 [ ] + 3 -> 4 [ ] + 4 -> 5 [ ] + 5 -> 6 [ ] + 6 -> 7 [ ] + 7 -> 8 [ ] + 8 -> 9 [ ] + 9 -> 10 [ ] + 10 -> 11 [ ] + 11 -> 12 [ ] + 12 -> 13 [ ] + 13 -> 14 [ ] + 14 -> 15 [ ] + 15 -> 16 [ ] + 16 -> 17 [ ] + 17 -> 18 [ ] + 18 -> 19 [ ] + 19 -> 20 [ ] + 20 -> 21 [ ] + 19 -> 22 [ ] + 22 -> 23 [ ] + 23 -> 24 [ ] + 24 -> 25 [ ] + 25 -> 26 [ ] + 26 -> 27 [ ] + 27 -> 28 [ ] + 23 -> 29 [ ] + 29 -> 30 [ ] + 18 -> 31 [ ] + 31 -> 32 [ ] + 32 -> 33 [ ] + 33 -> 34 [ ] + 34 -> 35 [ ] + 35 -> 36 [ ] + 36 -> 37 [ ] + 32 -> 38 [ ] + 38 -> 39 [ ] + 17 -> 40 [ ] + 40 -> 41 [ ] + 41 -> 42 [ ] + 42 -> 43 [ ] + 43 -> 44 [ ] + 44 -> 45 [ ] + 45 -> 46 [ ] + 41 -> 47 [ ] + 47 -> 48 [ ] + 16 -> 49 [ ] + 49 -> 50 [ ] + 50 -> 51 [ ] + 51 -> 52 [ ] + 52 -> 53 [ ] + 53 -> 54 [ ] + 54 -> 55 [ ] + 50 -> 56 [ ] + 56 -> 57 [ ] + 15 -> 58 [ ] + 58 -> 59 [ ] + 59 -> 60 [ ] + 60 -> 61 [ ] + 61 -> 62 [ ] + 62 -> 63 [ ] + 63 -> 64 [ ] + 59 -> 65 [ ] + 65 -> 66 [ ] + 14 -> 67 [ ] + 67 -> 68 [ ] + 68 
-> 69 [ ] + 69 -> 70 [ ] + 70 -> 71 [ ] + 71 -> 72 [ ] + 72 -> 73 [ ] + 68 -> 74 [ ] + 74 -> 75 [ ] + 13 -> 76 [ ] + 76 -> 77 [ ] + 77 -> 78 [ ] + 78 -> 79 [ ] + 79 -> 80 [ ] + 80 -> 81 [ ] + 81 -> 82 [ ] + 77 -> 83 [ ] + 83 -> 84 [ ] + 12 -> 85 [ ] + 85 -> 86 [ ] + 86 -> 87 [ ] + 87 -> 88 [ ] + 88 -> 89 [ ] + 89 -> 90 [ ] + 90 -> 91 [ ] + 86 -> 92 [ ] + 92 -> 93 [ ] + 11 -> 94 [ ] + 94 -> 95 [ ] + 95 -> 96 [ ] + 96 -> 97 [ ] + 97 -> 98 [ ] + 98 -> 99 [ ] + 99 -> 100 [ ] + 95 -> 101 [ ] + 101 -> 102 [ ] + 10 -> 103 [ ] + 103 -> 104 [ ] + 104 -> 105 [ ] + 105 -> 106 [ ] + 106 -> 107 [ ] + 107 -> 108 [ ] + 108 -> 109 [ ] + 104 -> 110 [ ] + 110 -> 111 [ ] + 9 -> 112 [ ] + 112 -> 113 [ ] + 113 -> 114 [ ] + 114 -> 115 [ ] + 115 -> 116 [ ] + 116 -> 117 [ ] + 117 -> 118 [ ] + 113 -> 119 [ ] + 119 -> 120 [ ] + 8 -> 121 [ ] + 121 -> 122 [ ] + 122 -> 123 [ ] + 123 -> 124 [ ] + 124 -> 125 [ ] + 125 -> 126 [ ] + 126 -> 127 [ ] + 122 -> 128 [ ] + 128 -> 129 [ ] + 7 -> 130 [ ] + 130 -> 131 [ ] + 131 -> 132 [ ] + 132 -> 133 [ ] + 133 -> 134 [ ] + 134 -> 135 [ ] + 135 -> 136 [ ] + 131 -> 137 [ ] + 137 -> 138 [ ] + 6 -> 139 [ ] + 139 -> 140 [ ] + 140 -> 141 [ ] + 141 -> 142 [ ] + 142 -> 143 [ ] + 143 -> 144 [ ] + 144 -> 145 [ ] + 140 -> 146 [ ] + 146 -> 147 [ ] + 5 -> 148 [ ] + 148 -> 149 [ ] + 149 -> 150 [ ] + 150 -> 151 [ ] + 151 -> 152 [ ] + 152 -> 153 [ ] + 153 -> 154 [ ] + 149 -> 155 [ ] + 155 -> 156 [ ] + 4 -> 157 [ ] + 157 -> 158 [ ] + 158 -> 159 [ ] + 159 -> 160 [ ] + 160 -> 161 [ ] + 161 -> 162 [ ] + 162 -> 163 [ ] + 158 -> 164 [ ] + 164 -> 165 [ ] + 3 -> 166 [ ] + 166 -> 167 [ ] + 167 -> 168 [ ] + 168 -> 169 [ ] + 169 -> 170 [ ] + 170 -> 171 [ ] + 171 -> 172 [ ] + 167 -> 173 [ ] + 173 -> 174 [ ] + 2 -> 175 [ ] + 175 -> 176 [ ] + 176 -> 177 [ ] + 177 -> 178 [ ] + 178 -> 179 [ ] + 179 -> 180 [ ] + 180 -> 181 [ ] + 176 -> 182 [ ] + 182 -> 183 [ ] + } diff --git a/src/frontend/planner_test/tests/testdata/output/explain_json_format.yaml b/src/frontend/planner_test/tests/testdata/output/explain_json_format.yaml index 9017c0609c25..d2b8d43771c9 100644 --- a/src/frontend/planner_test/tests/testdata/output/explain_json_format.yaml +++ b/src/frontend/planner_test/tests/testdata/output/explain_json_format.yaml @@ -145,3 +145,2009 @@ } ] } +- name: test long json output format (stream) + sql: | + create table t1(a int, b int); + create table t2(c int primary key, d int); + explain (physical, format json) create materialized view m1 as SELECT + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col1, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col2, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col3, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col4, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col5, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col6, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col7, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col8, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col9, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col10, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col11, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col12, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col13, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col14, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col15, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col16, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col17, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col18 + from t1; + explain_output: | + { 
+ "name": "StreamMaterialize", + "fields": { + "columns": [ + "col1", + "col2", + "col3", + "col4", + "col5", + "col6", + "col7", + "col8", + "col9", + "col10", + "col11", + "col12", + "col13", + "col14", + "col15", + "col16", + "col17", + "col18", + "t1._row_id(hidden)", + "t1.b(hidden)", + "t1.a(hidden)", + "t1.b#1(hidden)", + "t1.b#2(hidden)", + "t1.b#3(hidden)", + "t1.b#4(hidden)", + "t1.b#5(hidden)", + "t1.b#6(hidden)", + "t1.b#7(hidden)", + "t1.b#8(hidden)", + "t1.b#9(hidden)", + "t1.b#10(hidden)", + "t1.b#11(hidden)", + "t1.b#12(hidden)", + "t1.b#13(hidden)", + "t1.b#14(hidden)", + "t1.b#15(hidden)", + "t1.b#16(hidden)", + "t1.b#17(hidden)", + "t1.b#18(hidden)" + ], + "pk_columns": [ + "t1._row_id", + "t1.b", + "t1.a", + "t1.b#1", + "t1.b#2", + "t1.b#3", + "t1.b#4", + "t1.b#5", + "t1.b#6", + "t1.b#7", + "t1.b#8", + "t1.b#9", + "t1.b#10", + "t1.b#11", + "t1.b#12", + "t1.b#13", + "t1.b#14", + "t1.b#15", + "t1.b#16", + "t1.b#17", + "t1.b#18" + ], + "pk_conflict": "NoCheck", + "stream_key": [ + "t1._row_id", + "t1.b", + "t1.a", + "t1.b#1", + "t1.b#2", + "t1.b#3", + "t1.b#4", + "t1.b#5", + "t1.b#6", + "t1.b#7", + "t1.b#8", + "t1.b#9", + "t1.b#10", + "t1.b#11", + "t1.b#12", + "t1.b#13", + "t1.b#14", + "t1.b#15", + "t1.b#16", + "t1.b#17", + "t1.b#18" + ] + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "Coalesce(t1.b, 0:Int32) as $expr1", + "Coalesce(t1.b, 0:Int32) as $expr2", + "Coalesce(t1.b, 0:Int32) as $expr3", + "Coalesce(t1.b, 0:Int32) as $expr4", + "Coalesce(t1.b, 0:Int32) as $expr5", + "Coalesce(t1.b, 0:Int32) as $expr6", + "Coalesce(t1.b, 0:Int32) as $expr7", + "Coalesce(t1.b, 0:Int32) as $expr8", + "Coalesce(t1.b, 0:Int32) as $expr9", + "Coalesce(t1.b, 0:Int32) as $expr10", + "Coalesce(t1.b, 0:Int32) as $expr11", + "Coalesce(t1.b, 0:Int32) as $expr12", + "Coalesce(t1.b, 0:Int32) as $expr13", + "Coalesce(t1.b, 0:Int32) as $expr14", + "Coalesce(t1.b, 0:Int32) as $expr15", + "Coalesce(t1.b, 0:Int32) as $expr16", + "Coalesce(t1.b, 0:Int32) as $expr17", + "Coalesce(t1.b, 0:Int32) as $expr18", + "t1._row_id", + "t1.b", + "t1.a", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + 
"fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b", + "type": "LeftOuter" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": 
[ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + 
"a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, 
+ { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": 
"StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": 
"StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashJoin", + "fields": { + "predicate": "t1.a = t2.c", + "type": "Inner" + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a)" + }, + "children": [ + { + "name": "StreamProject", + "fields": { + "exprs": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamHashAgg", + "fields": { + "aggs": [ + "count" + ], + "group_key": [ + "t1.a", + "t1.b" + ] + }, + "children": [ + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t1.a, t1.b)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "a", + "b", + "_row_id" + ], + "table": "t1" + }, + "children": [] + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "StreamExchange", + "fields": { + "dist": "HashShard(t2.c)" + }, + "children": [ + { + "name": "StreamTableScan", + "fields": { + "columns": [ + "c" + ], + "table": "t2" + }, + 
"children": [] + } + ] + } + ] + } + ] + } + ] + } + ] + } + ] + } diff --git a/src/frontend/planner_test/tests/testdata/output/explain_xml_format.yaml b/src/frontend/planner_test/tests/testdata/output/explain_xml_format.yaml index ac1ff48badf6..c4a81e2a7057 100644 --- a/src/frontend/planner_test/tests/testdata/output/explain_xml_format.yaml +++ b/src/frontend/planner_test/tests/testdata/output/explain_xml_format.yaml @@ -17,3 +17,29 @@ explain (physical, format xml) create materialized view m1 as SELECT approx_percentile(0.5) WITHIN GROUP (order by v1) from t; explain_output: | StreamMaterializeapprox_percentileNoCheckStreamGlobalApproxPercentile0.5:Float640.01:Float64StreamExchangeSingleStreamLocalApproxPercentile$expr10.5:Float640.01:Float64StreamProjectt.v1::Float64 as $expr1t._row_idStreamTableScanv1_row_idt
+- name: test long xml output format (stream) + sql: | + create table t1(a int, b int); + create table t2(c int primary key, d int); + explain (physical, format xml) create materialized view m1 as SELECT + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col1, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col2, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col3, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col4, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col5, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col6, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col7, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col8, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col9, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col10, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col11, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col12, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col13, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col14, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col15, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col16, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col17, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col18 + from t1; + explain_output: | + StreamMaterializecol1col2col3col4col5col6col7col8col9col10col11col12col13col14col15col16col17col18t1._row_id(hidden)t1.b(hidden)t1.a(hidden)t1.b#1(hidden)t1.b#2(hidden)t1.b#3(hidden)t1.b#4(hidden)t1.b#5(hidden)t1.b#6(hidden)t1.b#7(hidden)t1.b#8(hidden)t1.b#9(hidden)t1.b#10(hidden)t1.b#11(hidden)t1.b#12(hidden)t1.b#13(hidden)t1.b#14(hidden)t1.b#15(hidden)t1.b#16(hidden)t1.b#17(hidden)t1.b#18(hidden)t1._row_idt1.bt1.at1.b#1t1.b#2t1.b#3t1.b#4t1.b#5t1.b#6t1.b#7t1.b#8t1.b#9t1.b#10t1.b#11t1.b#12t1.b#13t1.b#14t1.b#15t1.b#16t1.b#17t1.b#18NoCheckt1._row_idt1.bt1.at1.b#1t1.b#2t1.b#3t1.b#4t1.b#5t1.b#6t1.b#7t1.b#8t1.b#9t1.b#10t1.b#11t1.b#12t1.b#13t1.b#14t1.b#15t1.b#16t1.b#17t1.b#18StreamProjectCoalesce(t1.b, 0:Int32) as $expr1Coalesce(t1.b, 0:Int32) as $expr2Coalesce(t1.b, 0:Int32) as $expr3Coalesce(t1.b, 0:Int32) as $expr4Coalesce(t1.b, 0:Int32) as $expr5Coalesce(t1.b, 0:Int32) as $expr6Coalesce(t1.b, 0:Int32) as $expr7Coalesce(t1.b, 0:Int32) as $expr8Coalesce(t1.b, 0:Int32) as $expr9Coalesce(t1.b, 0:Int32) as $expr10Coalesce(t1.b, 0:Int32) as $expr11Coalesce(t1.b, 0:Int32) as $expr12Coalesce(t1.b, 0:Int32) as $expr13Coalesce(t1.b, 0:Int32) as $expr14Coalesce(t1.b, 0:Int32) as $expr15Coalesce(t1.b, 0:Int32) as $expr16Coalesce(t1.b, 0:Int32) as $expr17Coalesce(t1.b, 0:Int32) as $expr18t1._row_idt1.bt1.at1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bt1.bStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM 
t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamHashJoint1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.bLeftOuterStreamExchangeHashShard(t1.a)StreamTableScanab_row_idt1
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
StreamProjectt1.at1.bt1.bStreamHashJoint1.a = t2.cInnerStreamExchangeHashShard(t1.a)StreamProjectt1.at1.bStreamHashAggcountt1.at1.bStreamExchangeHashShard(t1.a, t1.b)StreamTableScanab_row_idt1
StreamExchangeHashShard(t2.c)StreamTableScanct2
diff --git a/src/frontend/planner_test/tests/testdata/output/explain_yaml_format.yaml b/src/frontend/planner_test/tests/testdata/output/explain_yaml_format.yaml index 5386fe211fcc..ecfb2bf8863b 100644 --- a/src/frontend/planner_test/tests/testdata/output/explain_yaml_format.yaml +++ b/src/frontend/planner_test/tests/testdata/output/explain_yaml_format.yaml @@ -87,3 +87,1197 @@ - _row_id table: t children: [] +- name: test long yaml output format (stream) + sql: | + create table t1(a int, b int); + create table t2(c int primary key, d int); + explain (physical, format yaml) create materialized view m1 as SELECT + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col1, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col2, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col3, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col4, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col5, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col6, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col7, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col8, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col9, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col10, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col11, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col12, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col13, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col14, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col15, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col16, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col17, + COALESCE((SELECT b FROM t2 WHERE t1.a = t2.c), 0) col18 + from t1; + explain_output: | + name: StreamMaterialize + fields: + columns: + - col1 + - col2 + - col3 + - col4 + - col5 + - col6 + - col7 + - col8 + - col9 + - col10 + - col11 + - col12 + - col13 + - col14 + - col15 + - col16 + - col17 + - col18 + - t1._row_id(hidden) + - t1.b(hidden) + - t1.a(hidden) + - t1.b#1(hidden) + - t1.b#2(hidden) + - t1.b#3(hidden) + - t1.b#4(hidden) + - t1.b#5(hidden) + - t1.b#6(hidden) + - t1.b#7(hidden) + - t1.b#8(hidden) + - t1.b#9(hidden) + - t1.b#10(hidden) + - t1.b#11(hidden) + - t1.b#12(hidden) + - t1.b#13(hidden) + - t1.b#14(hidden) + - t1.b#15(hidden) + - t1.b#16(hidden) + - t1.b#17(hidden) + - t1.b#18(hidden) + pk_columns: + - t1._row_id + - t1.b + - t1.a + - t1.b#1 + - t1.b#2 + - t1.b#3 + - t1.b#4 + - t1.b#5 + - t1.b#6 + - t1.b#7 + - t1.b#8 + - t1.b#9 + - t1.b#10 + - t1.b#11 + - t1.b#12 + - t1.b#13 + - t1.b#14 + - t1.b#15 + - t1.b#16 + - t1.b#17 + - t1.b#18 + pk_conflict: NoCheck + stream_key: + - t1._row_id + - t1.b + - t1.a + - t1.b#1 + - t1.b#2 + - t1.b#3 + - t1.b#4 + - t1.b#5 + - t1.b#6 + - t1.b#7 + - t1.b#8 + - t1.b#9 + - t1.b#10 + - t1.b#11 + - t1.b#12 + - t1.b#13 + - t1.b#14 + - t1.b#15 + - t1.b#16 + - t1.b#17 + - t1.b#18 + children: + - name: StreamProject + fields: + exprs: + - Coalesce(t1.b, 0:Int32) as $expr1 + - Coalesce(t1.b, 0:Int32) as $expr2 + - Coalesce(t1.b, 0:Int32) as $expr3 + - Coalesce(t1.b, 0:Int32) as $expr4 + - Coalesce(t1.b, 0:Int32) as $expr5 + - Coalesce(t1.b, 0:Int32) as $expr6 + - Coalesce(t1.b, 0:Int32) as $expr7 + - Coalesce(t1.b, 0:Int32) as $expr8 + - Coalesce(t1.b, 0:Int32) as $expr9 + - Coalesce(t1.b, 0:Int32) as $expr10 + - Coalesce(t1.b, 0:Int32) as $expr11 + - Coalesce(t1.b, 0:Int32) as $expr12 + - Coalesce(t1.b, 0:Int32) as $expr13 + - Coalesce(t1.b, 0:Int32) as $expr14 + - Coalesce(t1.b, 0:Int32) as $expr15 + - Coalesce(t1.b, 0:Int32) as $expr16 + - Coalesce(t1.b, 
0:Int32) as $expr17 + - Coalesce(t1.b, 0:Int32) as $expr18 + - t1._row_id + - t1.b + - t1.a + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamHashJoin + fields: + predicate: t1.a IS NOT DISTINCT FROM t1.a AND t1.b IS NOT DISTINCT FROM t1.b + type: LeftOuter + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + 
columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - 
name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b 
+ children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: 
StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + - t1.b + children: + - name: StreamHashJoin + fields: + predicate: t1.a = t2.c + type: Inner + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a) + children: + - name: StreamProject + fields: + exprs: + - t1.a + - t1.b + children: + - name: StreamHashAgg + fields: + aggs: + - count + group_key: + - t1.a + - t1.b + children: + - name: StreamExchange + fields: + dist: HashShard(t1.a, t1.b) + children: + - name: StreamTableScan + fields: + columns: + - a + - b + - _row_id + table: t1 + children: [] + - name: StreamExchange + fields: + dist: HashShard(t2.c) + children: + - name: StreamTableScan + fields: + columns: + - c + table: t2 + children: [] diff --git a/src/frontend/planner_test/tests/testdata/output/index_selection.yaml b/src/frontend/planner_test/tests/testdata/output/index_selection.yaml index a6240c69f395..349c5f7d8901 100644 --- a/src/frontend/planner_test/tests/testdata/output/index_selection.yaml +++ b/src/frontend/planner_test/tests/testdata/output/index_selection.yaml @@ -213,16 +213,18 @@ update t1 set c = 3 where a = 1 and b = 2; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t1, exprs: [$0, $1, 3:Int64, $3] } + └─BatchUpdate { table: t1, exprs: [$0, $1, $5, $3] } └─BatchExchange { order: [], dist: Single } - └─BatchLookupJoin { type: Inner, predicate: idx2.t1._row_id IS NOT DISTINCT FROM t1._row_id, output: [t1.a, t1.b, t1.c, t1._row_id, t1._rw_timestamp], lookup table: t1 } - └─BatchExchange { order: [], dist: UpstreamHashShard(idx2.t1._row_id) } - └─BatchScan { table: idx2, columns: [idx2.t1._row_id], scan_ranges: [idx2.b = Decimal(Normalized(2)) AND idx2.a = Int32(1)], distribution: SomeShard } + └─BatchProject { exprs: [t1.a, t1.b, t1.c, t1._row_id, t1._rw_timestamp, 3:Int64] } + └─BatchLookupJoin { type: Inner, predicate: idx2.t1._row_id IS NOT DISTINCT FROM t1._row_id, output: [t1.a, t1.b, t1.c, t1._row_id, t1._rw_timestamp], lookup table: t1 } + └─BatchExchange { order: [], dist: UpstreamHashShard(idx2.t1._row_id) } + └─BatchScan { table: idx2, columns: [idx2.t1._row_id], scan_ranges: [idx2.b = Decimal(Normalized(2)) AND idx2.a = Int32(1)], distribution: SomeShard } batch_local_plan: |- - 
BatchUpdate { table: t1, exprs: [$0, $1, 3:Int64, $3] } - └─BatchLookupJoin { type: Inner, predicate: idx2.t1._row_id IS NOT DISTINCT FROM t1._row_id, output: [t1.a, t1.b, t1.c, t1._row_id, t1._rw_timestamp], lookup table: t1 } - └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: idx2, columns: [idx2.t1._row_id], scan_ranges: [idx2.b = Decimal(Normalized(2)) AND idx2.a = Int32(1)], distribution: SomeShard } + BatchUpdate { table: t1, exprs: [$0, $1, $5, $3] } + └─BatchProject { exprs: [t1.a, t1.b, t1.c, t1._row_id, t1._rw_timestamp, 3:Int64] } + └─BatchLookupJoin { type: Inner, predicate: idx2.t1._row_id IS NOT DISTINCT FROM t1._row_id, output: [t1.a, t1.b, t1.c, t1._row_id, t1._rw_timestamp], lookup table: t1 } + └─BatchExchange { order: [], dist: Single } + └─BatchScan { table: idx2, columns: [idx2.t1._row_id], scan_ranges: [idx2.b = Decimal(Normalized(2)) AND idx2.a = Int32(1)], distribution: SomeShard } - sql: | create table t1 (a int, b numeric, c bigint, p int); create materialized view v as select count(*) as cnt, p from t1 group by p; diff --git a/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml b/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml index 114d9fead0f3..a6a2c284beb0 100644 --- a/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml +++ b/src/frontend/planner_test/tests/testdata/output/over_window_function.yaml @@ -325,33 +325,31 @@ logical_plan: |- LogicalProject { exprs: [t.x, t.y, t.z, $expr4, $expr5] } └─LogicalFilter { predicate: (t.z > 0:Int32) AND (t.y > 0:Int32) AND (t.x > 0:Int32) AND ($expr4 <= 3.0:Decimal) AND ($expr5 > 1.0:Decimal) } - └─LogicalProject { exprs: [t.x, t.y, t.z, Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / count::Decimal)) as $expr4, Case((count <= 1:Int64), null:Decimal, Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / (count - 1:Int64)::Decimal))) as $expr5] } + └─LogicalProject { exprs: [t.x, t.y, t.z, Case((count = 0:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Int32::Decimal) / count::Decimal))) as $expr4, Case((count <= 1:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Int32::Decimal) / (count - 1:Int32)::Decimal))) as $expr5] } └─LogicalOverWindow { window_functions: [sum($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr2) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count($expr2) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr3) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } └─LogicalProject { exprs: [t.x, t.y, t.z, t.w, t._row_id, t._rw_timestamp, ((t.x - t.y) * (t.x - t.y)) as $expr1, (t.x - t.y) as $expr2, (t.x * t.x) as $expr3] } └─LogicalScan { table: t, columns: [t.x, t.y, t.z, t.w, t._row_id, t._rw_timestamp] } batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchProject { exprs: [t.x, t.y, t.z, Sqrt(((sum::Decimal - (($expr4 * $expr4) / $expr5)) / $expr5)) as $expr6, Case((count <= 1:Int64), null:Decimal, Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / 
count::Decimal)) / (count - 1:Int64)::Decimal))) as $expr7] } - └─BatchProject { exprs: [t.x, t.y, t.z, $expr2, $expr1, $expr3, sum, sum, count, sum, sum, count, sum::Decimal as $expr4, count::Decimal as $expr5] } - └─BatchFilter { predicate: (t.y > 0:Int32) AND (t.x > 0:Int32) AND (Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / count::Decimal)) <= 3.0:Decimal) AND (Case((count <= 1:Int64), null:Decimal, Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / (count - 1:Int64)::Decimal))) > 1.0:Decimal) } - └─BatchOverWindow { window_functions: [sum($expr2) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr3) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } - └─BatchExchange { order: [t.z ASC, t.x ASC], dist: HashShard(t.z) } - └─BatchSort { order: [t.z ASC, t.x ASC] } - └─BatchProject { exprs: [t.x, t.y, t.z, ($expr1 * $expr1) as $expr2, $expr1, (t.x * t.x) as $expr3] } - └─BatchProject { exprs: [t.x, t.y, t.z, (t.x - t.y) as $expr1] } - └─BatchFilter { predicate: (t.z > 0:Int32) } - └─BatchScan { table: t, columns: [t.x, t.y, t.z], distribution: SomeShard } + └─BatchProject { exprs: [t.x, t.y, t.z, Case((count = 0:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / count::Decimal))) as $expr4, Case((count <= 1:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / (count - 1:Int32)::Decimal))) as $expr5] } + └─BatchFilter { predicate: (t.y > 0:Int32) AND (t.x > 0:Int32) AND (Case((count = 0:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / count::Decimal))) <= 3.0:Decimal) AND (Case((count <= 1:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / (count - 1:Int32)::Decimal))) > 1.0:Decimal) } + └─BatchOverWindow { window_functions: [sum($expr2) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr3) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─BatchExchange { order: [t.z ASC, t.x ASC], dist: HashShard(t.z) } + └─BatchSort { order: [t.z ASC, t.x ASC] } + └─BatchProject { exprs: [t.x, t.y, t.z, ($expr1 * $expr1) as $expr2, $expr1, (t.x * t.x) as $expr3] } + └─BatchProject { exprs: [t.x, t.y, t.z, (t.x - t.y) as $expr1] } + └─BatchFilter { predicate: (t.z > 0:Int32) } + └─BatchScan { table: t, columns: [t.x, t.y, t.z], distribution: SomeShard } stream_plan: |- StreamMaterialize { columns: [x, y, z, res0, res1, 
t._row_id(hidden)], stream_key: [t._row_id, z], pk_columns: [t._row_id, z], pk_conflict: NoCheck } - └─StreamProject { exprs: [t.x, t.y, t.z, Sqrt(((sum::Decimal - (($expr4 * $expr4) / $expr5)) / $expr5)) as $expr6, Case((count <= 1:Int64), null:Decimal, Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / (count - 1:Int64)::Decimal))) as $expr7, t._row_id] } - └─StreamProject { exprs: [t.x, t.y, t.z, $expr2, $expr1, $expr3, sum, sum, count, sum, sum, count, sum::Decimal as $expr4, count::Decimal as $expr5, t._row_id] } - └─StreamFilter { predicate: (t.y > 0:Int32) AND (t.x > 0:Int32) AND (Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / count::Decimal)) <= 3.0:Decimal) AND (Case((count <= 1:Int64), null:Decimal, Sqrt(((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)) / (count - 1:Int64)::Decimal))) > 1.0:Decimal) } - └─StreamOverWindow { window_functions: [sum($expr2) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr3) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } - └─StreamExchange { dist: HashShard(t.z) } - └─StreamProject { exprs: [t.x, t.y, t.z, ($expr1 * $expr1) as $expr2, $expr1, (t.x * t.x) as $expr3, t._row_id] } - └─StreamProject { exprs: [t.x, t.y, t.z, (t.x - t.y) as $expr1, t._row_id] } - └─StreamFilter { predicate: (t.z > 0:Int32) } - └─StreamTableScan { table: t, columns: [t.x, t.y, t.z, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } + └─StreamProject { exprs: [t.x, t.y, t.z, Case((count = 0:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / count::Decimal))) as $expr4, Case((count <= 1:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / (count - 1:Int32)::Decimal))) as $expr5, t._row_id] } + └─StreamFilter { predicate: (t.y > 0:Int32) AND (t.x > 0:Int32) AND (Case((count = 0:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / count::Decimal))) <= 3.0:Decimal) AND (Case((count <= 1:Int32), null:Decimal, Sqrt((Greatest((sum::Decimal - ((sum::Decimal * sum::Decimal) / count::Decimal)), 0:Decimal) / (count - 1:Int32)::Decimal))) > 1.0:Decimal) } + └─StreamOverWindow { window_functions: [sum($expr2) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count($expr1) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum($expr3) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), sum(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), count(t.x) OVER(PARTITION BY t.z ORDER BY t.x ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─StreamExchange { dist: HashShard(t.z) } + └─StreamProject { 
exprs: [t.x, t.y, t.z, ($expr1 * $expr1) as $expr2, $expr1, (t.x * t.x) as $expr3, t._row_id] } + └─StreamProject { exprs: [t.x, t.y, t.z, (t.x - t.y) as $expr1, t._row_id] } + └─StreamFilter { predicate: (t.z > 0:Int32) } + └─StreamTableScan { table: t, columns: [t.x, t.y, t.z, t._row_id], stream_scan_type: ArrangementBackfill, stream_key: [t._row_id], pk: [_row_id], dist: UpstreamHashShard(t._row_id) } - id: aggregate with expression in func arguments and over clause sql: | create table t(x int, y int, z int, w int); diff --git a/src/frontend/planner_test/tests/testdata/output/subquery_expr.yaml b/src/frontend/planner_test/tests/testdata/output/subquery_expr.yaml index a43a1d2df716..519b876c8c67 100644 --- a/src/frontend/planner_test/tests/testdata/output/subquery_expr.yaml +++ b/src/frontend/planner_test/tests/testdata/output/subquery_expr.yaml @@ -26,6 +26,14 @@ ├─LogicalScan { table: t, columns: [] } └─LogicalMaxOneRow └─LogicalScan { table: t, columns: [t.x] } + batch_plan: |- + BatchProject { exprs: [t.x, 1:Int32] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [], distribution: SomeShard } + └─BatchMaxOneRow + └─BatchExchange { order: [], dist: Single } + └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } stream_error: Scalar subquery might produce more than one row. - sql: | create table t(x int); @@ -76,6 +84,16 @@ └─LogicalMaxOneRow └─LogicalTopN { order: [t.x ASC], limit: 1, offset: 0, with_ties: true } └─LogicalScan { table: t, columns: [t.x] } + batch_plan: |- + BatchProject { exprs: [t.x, 1:Int32] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [], distribution: SomeShard } + └─BatchMaxOneRow + └─BatchTopN { order: [t.x ASC], limit: 1, offset: 0, with_ties: true } + └─BatchExchange { order: [], dist: Single } + └─BatchTopN { order: [t.x ASC], limit: 1, offset: 0, with_ties: true } + └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } stream_error: Scalar subquery might produce more than one row. - sql: | create table t(x int); @@ -92,6 +110,14 @@ ├─LogicalScan { table: t, columns: [] } └─LogicalMaxOneRow └─LogicalScan { table: t, columns: [t.x] } + batch_plan: |- + BatchProject { exprs: [(t.x + 1:Int32) as $expr1] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [], distribution: SomeShard } + └─BatchMaxOneRow + └─BatchExchange { order: [], dist: Single } + └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } stream_error: Scalar subquery might produce more than one row. - sql: | create table t(x int); @@ -112,6 +138,14 @@ │ └─LogicalMaxOneRow │ └─LogicalScan { table: t, columns: [t.x] } └─LogicalValues { rows: [[1:Int32]], schema: Schema { fields: [1:Int32:Int32] } } + batch_plan: |- + BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + │ ├─BatchValues { rows: [[]] } + │ └─BatchMaxOneRow + │ └─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } + └─BatchValues { rows: [[1:Int32]] } stream_error: Scalar subquery might produce more than one row. 
- sql: | create table t(x int); @@ -135,6 +169,19 @@ ├─LogicalScan { table: t, columns: [t.x] } └─LogicalMaxOneRow └─LogicalScan { table: t, columns: [t.x] } + batch_plan: |- + BatchProject { exprs: [(t.x + $expr1) as $expr2] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } + └─BatchMaxOneRow + └─BatchProject { exprs: [(t.x + t.x) as $expr1] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } + └─BatchMaxOneRow + └─BatchExchange { order: [], dist: Single } + └─BatchScan { table: t, columns: [t.x], distribution: SomeShard } stream_error: Scalar subquery might produce more than one row. - sql: | create table t1 (x int, y int); diff --git a/src/frontend/planner_test/tests/testdata/output/update.yaml b/src/frontend/planner_test/tests/testdata/output/update.yaml index 19d6673d77f9..4a12b492660a 100644 --- a/src/frontend/planner_test/tests/testdata/output/update.yaml +++ b/src/frontend/planner_test/tests/testdata/output/update.yaml @@ -4,9 +4,10 @@ update t set v1 = 0; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [0:Int32, $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, 0:Int32] } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int, v2 int); update t set v1 = true; @@ -16,72 +17,81 @@ update t set v1 = v2 + 1; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [($1 + 1:Int32), $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, (t.v2 + 1:Int32) as $expr1] } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int, v2 real); update t set v1 = v2; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [$1::Int32, $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, t.v2::Int32 as $expr1] } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int, v2 real); update t set v1 = DEFAULT; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [null:Int32, $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, null:Int32] } + 
└─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int, v2 int); update t set v1 = v2 + 1 where v2 > 0; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [($1 + 1:Int32), $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchFilter { predicate: (t.v2 > 0:Int32) } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, (t.v2 + 1:Int32) as $expr1] } + └─BatchFilter { predicate: (t.v2 > 0:Int32) } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int, v2 int); update t set (v1, v2) = (v2 + 1, v1 - 1) where v1 != v2; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [($1 + 1:Int32), ($0 - 1:Int32), $2] } + └─BatchUpdate { table: t, exprs: [$4, $5, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchFilter { predicate: (t.v1 <> t.v2) } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, (t.v2 + 1:Int32) as $expr1, (t.v1 - 1:Int32) as $expr2] } + └─BatchFilter { predicate: (t.v1 <> t.v2) } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int, v2 int); update t set (v1, v2) = (v2 + 1, v1 - 1) where v1 != v2 returning *, v2+1, v1-1; logical_plan: |- - LogicalProject { exprs: [t.v1, t.v2, (t.v2 + 1:Int32) as $expr1, (t.v1 - 1:Int32) as $expr2] } - └─LogicalUpdate { table: t, exprs: [($1 + 1:Int32), ($0 - 1:Int32), $2], returning: true } - └─LogicalFilter { predicate: (t.v1 <> t.v2) } - └─LogicalScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp] } + LogicalProject { exprs: [, , ( + 1:Int32) as $expr3, ( - 1:Int32) as $expr4] } + └─LogicalUpdate { table: t, exprs: [$4, $5, $2], returning: true } + └─LogicalProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, (t.v2 + 1:Int32) as $expr1, (t.v1 - 1:Int32) as $expr2] } + └─LogicalFilter { predicate: (t.v1 <> t.v2) } + └─LogicalScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp] } batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchProject { exprs: [t.v1, t.v2, (t.v2 + 1:Int32) as $expr1, (t.v1 - 1:Int32) as $expr2] } - └─BatchUpdate { table: t, exprs: [($1 + 1:Int32), ($0 - 1:Int32), $2], returning: true } + └─BatchProject { exprs: [, , ( + 1:Int32) as $expr3, ( - 1:Int32) as $expr4] } + └─BatchUpdate { table: t, exprs: [$4, $5, $2], returning: true } └─BatchExchange { order: [], dist: Single } - └─BatchFilter { predicate: (t.v1 <> t.v2) } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, (t.v2 + 1:Int32) as $expr1, (t.v1 - 1:Int32) as $expr2] } + └─BatchFilter { predicate: (t.v1 <> t.v2) } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - name: update with returning statement, should keep `Update` sql: | create table t (v int); update t set v = 114 returning 514; logical_plan: |- 
LogicalProject { exprs: [514:Int32] } - └─LogicalUpdate { table: t, exprs: [114:Int32, $1], returning: true } - └─LogicalScan { table: t, columns: [t.v, t._row_id, t._rw_timestamp] } + └─LogicalUpdate { table: t, exprs: [$3, $1], returning: true } + └─LogicalProject { exprs: [t.v, t._row_id, t._rw_timestamp, 114:Int32] } + └─LogicalScan { table: t, columns: [t.v, t._row_id, t._rw_timestamp] } batch_plan: |- BatchExchange { order: [], dist: Single } └─BatchProject { exprs: [514:Int32] } - └─BatchUpdate { table: t, exprs: [114:Int32, $1], returning: true } + └─BatchUpdate { table: t, exprs: [$3, $1], returning: true } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v, t._row_id, t._rw_timestamp, 114:Int32] } + └─BatchScan { table: t, columns: [t.v, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - sql: | create table t (v1 int primary key, v2 int); update t set (v2, v1) = (v1, v2); @@ -90,22 +100,25 @@ create table t (v1 int default 1+1, v2 int); update t set v1 = default; logical_plan: |- - LogicalUpdate { table: t, exprs: [(1:Int32 + 1:Int32), $1, $2] } - └─LogicalScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp] } + LogicalUpdate { table: t, exprs: [$4, $1, $2] } + └─LogicalProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, (1:Int32 + 1:Int32) as $expr1] } + └─LogicalScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp] } batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [2:Int32, $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, 2:Int32] } + └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - name: update table with generated columns sql: | create table t(v1 int as v2-1, v2 int, v3 int as v2+1); update t set v2 = 3; batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [3:Int32, $3] } + └─BatchUpdate { table: t, exprs: [$5, $3] } └─BatchExchange { order: [], dist: Single } - └─BatchScan { table: t, columns: [t.v1, t.v2, t.v3, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchProject { exprs: [t.v1, t.v2, t.v3, t._row_id, t._rw_timestamp, 3:Int32] } + └─BatchScan { table: t, columns: [t.v1, t.v2, t.v3, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - name: update generated column sql: | create table t(v1 int as v2-1, v2 int, v3 int as v2+1); @@ -116,25 +129,27 @@ create table t(v1 int as v2-1, v2 int, v3 int as v2+1, primary key (v3)); update t set v2 = 3; binder_error: 'Bind error: update modifying the column referenced by generated columns that are part of the primary key is not allowed' -- name: update subquery +- name: update subquery selection sql: | create table t (a int, b int); update t set a = 777 where b not in (select a from t); logical_plan: |- - LogicalUpdate { table: t, exprs: [777:Int32, $1, $2] } - └─LogicalApply { type: LeftAnti, on: (t.b = t.a), correlated_id: 1 } - ├─LogicalScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp] } - └─LogicalProject { exprs: [t.a] } - └─LogicalScan { table: t, columns: [t.a, t.b, 
t._row_id, t._rw_timestamp] } + LogicalUpdate { table: t, exprs: [$4, $1, $2] } + └─LogicalProject { exprs: [t.a, t.b, t._row_id, t._rw_timestamp, 777:Int32] } + └─LogicalApply { type: LeftAnti, on: (t.b = t.a), correlated_id: 1 } + ├─LogicalScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp] } + └─LogicalProject { exprs: [t.a] } + └─LogicalScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp] } batch_plan: |- BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [777:Int32, $1, $2] } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } └─BatchExchange { order: [], dist: Single } - └─BatchHashJoin { type: LeftAnti, predicate: t.b = t.a, output: all } - ├─BatchExchange { order: [], dist: HashShard(t.b) } - │ └─BatchScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } - └─BatchExchange { order: [], dist: HashShard(t.a) } - └─BatchScan { table: t, columns: [t.a], distribution: SomeShard } + └─BatchProject { exprs: [t.a, t.b, t._row_id, t._rw_timestamp, 777:Int32] } + └─BatchHashJoin { type: LeftAnti, predicate: t.b = t.a, output: all } + ├─BatchExchange { order: [], dist: HashShard(t.b) } + │ └─BatchScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchExchange { order: [], dist: HashShard(t.a) } + └─BatchScan { table: t, columns: [t.a], distribution: SomeShard } - name: delete subquery sql: | create table t (a int, b int); @@ -163,12 +178,65 @@ batch_distributed_plan: |- BatchSimpleAgg { aggs: [sum()] } └─BatchExchange { order: [], dist: Single } - └─BatchUpdate { table: t, exprs: [($0 + 1:Int32), $1, $2] } - └─BatchExchange { order: [], dist: HashShard(t.a, t.b, t._row_id, t._rw_timestamp) } - └─BatchScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } -- name: update table with subquery in the set clause + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } + └─BatchExchange { order: [], dist: HashShard(t.a, t.b, t._row_id, t._rw_timestamp, $expr1) } + └─BatchProject { exprs: [t.a, t.b, t._row_id, t._rw_timestamp, (t.a + 1:Int32) as $expr1] } + └─BatchScan { table: t, columns: [t.a, t.b, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } +- name: update table to subquery + sql: | + create table t (v1 int, v2 int); + update t set v1 = (select 666); + batch_plan: |- + BatchExchange { order: [], dist: Single } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchValues { rows: [[666:Int32]] } +- name: update table to subquery with runtime cardinality + sql: | + create table t (v1 int, v2 int); + update t set v1 = (select generate_series(888, 888)); + batch_plan: |- + BatchExchange { order: [], dist: Single } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchMaxOneRow + └─BatchProject { exprs: [GenerateSeries(888:Int32, 888:Int32)] } + └─BatchProjectSet { select_list: [GenerateSeries(888:Int32, 888:Int32)] } + └─BatchValues { rows: [[]] } +- name: update table 
to correlated subquery sql: | - create table t1 (v1 int primary key, v2 int); - create table t2 (v1 int primary key, v2 int); - update t1 set v1 = (select v1 from t2 where t1.v2 = t2.v2); - binder_error: 'Bind error: subquery on the right side of assignment is unsupported' + create table t (v1 int, v2 int); + update t set v1 = (select count(*) from t as source where source.v2 = t.v2); + batch_plan: |- + BatchExchange { order: [], dist: Single } + └─BatchUpdate { table: t, exprs: [$4, $1, $2] } + └─BatchExchange { order: [], dist: Single } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, count(1:Int32)::Int32 as $expr1] } + └─BatchHashJoin { type: LeftOuter, predicate: t.v2 IS NOT DISTINCT FROM t.v2, output: [t.v1, t.v2, t._row_id, t._rw_timestamp, count(1:Int32)] } + ├─BatchExchange { order: [], dist: HashShard(t.v2) } + │ └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchHashAgg { group_key: [t.v2], aggs: [count(1:Int32)] } + └─BatchHashJoin { type: LeftOuter, predicate: t.v2 IS NOT DISTINCT FROM t.v2, output: [t.v2, 1:Int32] } + ├─BatchHashAgg { group_key: [t.v2], aggs: [] } + │ └─BatchExchange { order: [], dist: HashShard(t.v2) } + │ └─BatchScan { table: t, columns: [t.v2], distribution: SomeShard } + └─BatchExchange { order: [], dist: HashShard(t.v2) } + └─BatchProject { exprs: [t.v2, 1:Int32] } + └─BatchFilter { predicate: IsNotNull(t.v2) } + └─BatchScan { table: t, columns: [t.v2], distribution: SomeShard } +- name: update table to subquery with multiple assignments + sql: | + create table t (v1 int, v2 int); + update t set (v1, v2) = (select 666.66, 777); + batch_plan: |- + BatchExchange { order: [], dist: Single } + └─BatchUpdate { table: t, exprs: [Field($4, 0:Int32), Field($4, 1:Int32), $2] } + └─BatchProject { exprs: [t.v1, t.v2, t._row_id, t._rw_timestamp, $expr10011::Struct(StructType { field_names: [], field_types: [Int32, Int32] }) as $expr1] } + └─BatchNestedLoopJoin { type: LeftOuter, predicate: true, output: all } + ├─BatchExchange { order: [], dist: Single } + │ └─BatchScan { table: t, columns: [t.v1, t.v2, t._row_id, t._rw_timestamp], distribution: UpstreamHashShard(t._row_id) } + └─BatchValues { rows: [['(666.66,777)':Struct(StructType { field_names: [], field_types: [Decimal, Int32] })]] } diff --git a/src/frontend/src/binder/declare_cursor.rs b/src/frontend/src/binder/declare_cursor.rs new file mode 100644 index 000000000000..5666e305ec92 --- /dev/null +++ b/src/frontend/src/binder/declare_cursor.rs @@ -0,0 +1,32 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
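This new binder module, together with the handler changes later in this diff (the Statement::DeclareCursor arms in extended_handle.rs and query.rs, and handle_bound_declare_query_cursor in the declare_cursor handler), routes plain query cursors through the extended query protocol; subscription cursors with bind parameters are still rejected via bail_not_implemented!. A minimal usage sketch, assuming RisingWave's PostgreSQL-style cursor syntax (illustrative, not taken from this diff):

    DECLARE cur CURSOR FOR SELECT v1, v2 FROM t;
    FETCH NEXT FROM cur;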
+ +use risingwave_sqlparser::ast::ObjectName; + +use super::statement::RewriteExprsRecursive; +use crate::binder::BoundQuery; +use crate::expr::ExprRewriter; + +#[derive(Debug, Clone)] +pub struct BoundDeclareCursor { + pub cursor_name: ObjectName, + // Currently we only support cursor with query + pub query: Box, // reuse the BoundQuery struct +} + +impl RewriteExprsRecursive for BoundDeclareCursor { + fn rewrite_exprs_recursive(&mut self, rewriter: &mut impl ExprRewriter) { + self.query.rewrite_exprs_recursive(rewriter); + } +} diff --git a/src/frontend/src/binder/expr/function/builtin_scalar.rs b/src/frontend/src/binder/expr/function/builtin_scalar.rs index 68b37a3fee4e..66c28b0ba24d 100644 --- a/src/frontend/src/binder/expr/function/builtin_scalar.rs +++ b/src/frontend/src/binder/expr/function/builtin_scalar.rs @@ -661,6 +661,7 @@ impl Binder { ("shobj_description", raw_literal(ExprImpl::literal_varchar("".to_string()))), ("pg_is_in_recovery", raw_call(ExprType::PgIsInRecovery)), ("rw_recovery_status", raw_call(ExprType::RwRecoveryStatus)), + ("rw_epoch_to_ts", raw_call(ExprType::RwEpochToTs)), // internal ("rw_vnode", raw_call(ExprType::VnodeUser)), ("rw_test_paid_tier", raw_call(ExprType::TestPaidTier)), // for testing purposes diff --git a/src/frontend/src/binder/expr/subquery.rs b/src/frontend/src/binder/expr/subquery.rs index 51819116771f..c31a5d653aeb 100644 --- a/src/frontend/src/binder/expr/subquery.rs +++ b/src/frontend/src/binder/expr/subquery.rs @@ -15,20 +15,16 @@ use risingwave_sqlparser::ast::Query; use crate::binder::Binder; -use crate::error::{ErrorCode, Result}; +use crate::error::{bail_bind_error, Result}; use crate::expr::{ExprImpl, Subquery, SubqueryKind}; impl Binder { - pub(super) fn bind_subquery_expr( - &mut self, - query: Query, - kind: SubqueryKind, - ) -> Result { + pub fn bind_subquery_expr(&mut self, query: Query, kind: SubqueryKind) -> Result { let query = self.bind_query(query)?; - if !matches!(kind, SubqueryKind::Existential) && query.data_types().len() != 1 { - return Err( - ErrorCode::BindError("Subquery must return only one column".to_string()).into(), - ); + if !matches!(kind, SubqueryKind::Existential | SubqueryKind::UpdateSet) + && query.data_types().len() != 1 + { + bail_bind_error!("Subquery must return only one column"); } Ok(Subquery::new(query, kind).into()) } diff --git a/src/frontend/src/binder/mod.rs b/src/frontend/src/binder/mod.rs index adb7a1b9d0f2..4560e51bd656 100644 --- a/src/frontend/src/binder/mod.rs +++ b/src/frontend/src/binder/mod.rs @@ -28,6 +28,7 @@ mod bind_context; mod bind_param; mod create; mod create_view; +mod declare_cursor; mod delete; mod expr; pub mod fetch_cursor; @@ -57,7 +58,7 @@ pub use relation::{ pub use select::{BoundDistinct, BoundSelect}; pub use set_expr::*; pub use statement::BoundStatement; -pub use update::BoundUpdate; +pub use update::{BoundUpdate, UpdateProject}; pub use values::BoundValues; use crate::catalog::catalog_service::CatalogReadGuard; diff --git a/src/frontend/src/binder/statement.rs b/src/frontend/src/binder/statement.rs index b73fab90aed9..dc152d466927 100644 --- a/src/frontend/src/binder/statement.rs +++ b/src/frontend/src/binder/statement.rs @@ -14,8 +14,9 @@ use risingwave_common::bail_not_implemented; use risingwave_common::catalog::Field; -use risingwave_sqlparser::ast::Statement; +use risingwave_sqlparser::ast::{DeclareCursor, Statement}; +use super::declare_cursor::BoundDeclareCursor; use super::delete::BoundDelete; use super::fetch_cursor::BoundFetchCursor; use 
super::update::BoundUpdate; @@ -30,6 +31,7 @@ pub enum BoundStatement { Delete(Box), Update(Box), Query(Box), + DeclareCursor(Box), FetchCursor(Box), CreateView(Box), } @@ -50,6 +52,7 @@ impl BoundStatement { .as_ref() .map_or(vec![], |s| s.fields().into()), BoundStatement::Query(q) => q.schema().fields().into(), + BoundStatement::DeclareCursor(_) => vec![], BoundStatement::FetchCursor(f) => f .returning_schema .as_ref() @@ -92,6 +95,21 @@ impl Binder { Statement::Query(q) => Ok(BoundStatement::Query(self.bind_query(*q)?.into())), + Statement::DeclareCursor { stmt } => { + if let DeclareCursor::Query(body) = stmt.declare_cursor { + let query = self.bind_query(*body)?; + Ok(BoundStatement::DeclareCursor( + BoundDeclareCursor { + cursor_name: stmt.cursor_name, + query: query.into(), + } + .into(), + )) + } else { + bail_not_implemented!("unsupported statement {:?}", stmt) + } + } + // Note(eric): Can I just bind CreateView to Query?? Statement::CreateView { or_replace, @@ -133,6 +151,7 @@ impl RewriteExprsRecursive for BoundStatement { BoundStatement::Delete(inner) => inner.rewrite_exprs_recursive(rewriter), BoundStatement::Update(inner) => inner.rewrite_exprs_recursive(rewriter), BoundStatement::Query(inner) => inner.rewrite_exprs_recursive(rewriter), + BoundStatement::DeclareCursor(inner) => inner.rewrite_exprs_recursive(rewriter), BoundStatement::FetchCursor(_) => {} BoundStatement::CreateView(inner) => inner.rewrite_exprs_recursive(rewriter), } diff --git a/src/frontend/src/binder/update.rs b/src/frontend/src/binder/update.rs index 9cc80dbde447..f57ad1d19798 100644 --- a/src/frontend/src/binder/update.rs +++ b/src/frontend/src/binder/update.rs @@ -12,23 +12,42 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap}; use fixedbitset::FixedBitSet; use itertools::Itertools; use risingwave_common::catalog::{Schema, TableVersionId}; +use risingwave_common::types::DataType; use risingwave_common::util::iter_util::ZipEqFast; use risingwave_sqlparser::ast::{Assignment, AssignmentValue, Expr, ObjectName, SelectItem}; use super::statement::RewriteExprsRecursive; use super::{Binder, BoundBaseTable}; use crate::catalog::TableId; -use crate::error::{ErrorCode, Result, RwError}; -use crate::expr::{Expr as _, ExprImpl, InputRef}; +use crate::error::{bail_bind_error, bind_error, ErrorCode, Result, RwError}; +use crate::expr::{Expr as _, ExprImpl, SubqueryKind}; use crate::user::UserId; use crate::TableCatalog; +/// Project into `exprs` in `BoundUpdate` to get the new values for updating. +#[derive(Debug, Clone, Copy)] +pub enum UpdateProject { + /// Use the expression at the given index in `exprs`. + Simple(usize), + /// Use the `i`-th field of the expression (returning a struct) at the given index in `exprs`. + Composite(usize, usize), +} + +impl UpdateProject { + /// Offset the index by `i`. + pub fn offset(self, i: usize) -> Self { + match self { + UpdateProject::Simple(index) => UpdateProject::Simple(index + i), + UpdateProject::Composite(index, j) => UpdateProject::Composite(index + i, j), + } + } +} + #[derive(Debug, Clone)] pub struct BoundUpdate { /// Id of the table to perform updating. @@ -48,10 +67,14 @@ pub struct BoundUpdate { pub selection: Option, - /// Expression used to project to the updated row. The assigned columns will use the new - /// expression, and the other columns will be simply `InputRef`. 
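To make the new representation concrete, here is a sketch based only on the UpdateProject definition above and the doc comments that follow (the statement and names are hypothetical, not taken from the tests): for

    UPDATE t SET (v1, v2) = (SELECT a, b FROM s), v3 = v3 + 1;

the binder would collect exprs = [the struct-typed subquery, v3 + 1] and fill projects so that v1 maps to Composite(0, 0), v2 to Composite(0, 1), and v3 to Simple(1) (the keys are really the target columns' indices in the table). Two Project nodes built from exprs and projects then produce the new values.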
+ /// Expression used to evaluate the new values for the columns. pub exprs: Vec, + /// Mapping from the index of the column to be updated, to the index of the expression in `exprs`. + /// + /// By constructing two `Project` nodes with `exprs` and `projects`, we can get the new values. + pub projects: HashMap, + // used for the 'RETURNING" keyword to indicate the returning items and schema // if the list is empty and the schema is None, the output schema will be a INT64 as the // affected row cnt @@ -124,107 +147,112 @@ impl Binder { let selection = selection.map(|expr| self.bind_expr(expr)).transpose()?; - let mut assignment_exprs = HashMap::new(); - for Assignment { id, value } in assignments { - // FIXME: Parsing of `id` is not strict. It will even treat `a.b` as `(a, b)`. - let assignments = match (id.as_slice(), value) { - // _ = (subquery) - (_ids, AssignmentValue::Expr(Expr::Subquery(_))) => { - return Err(ErrorCode::BindError( - "subquery on the right side of assignment is unsupported".to_owned(), - ) - .into()) - } - // col = expr - ([id], value) => { - vec![(id.clone(), value)] - } - // (col1, col2) = (expr1, expr2) - // TODO: support `DEFAULT` in multiple assignments - (ids, AssignmentValue::Expr(Expr::Row(values))) if ids.len() == values.len() => id - .into_iter() - .zip_eq_fast(values.into_iter().map(AssignmentValue::Expr)) - .collect(), - // (col1, col2) = - _ => { - return Err(ErrorCode::BindError( - "number of columns does not match number of values".to_owned(), - ) - .into()) - } + let mut exprs = Vec::new(); + let mut projects = HashMap::new(); + + macro_rules! record { + ($id:expr, $project:expr) => { + let id_index = $id.as_input_ref().unwrap().index; + projects + .try_insert(id_index, $project) + .map_err(|_e| bind_error!("multiple assignments to the same column"))?; }; + } - for (id, value) in assignments { - let id_expr = self.bind_expr(Expr::Identifier(id.clone()))?; - let id_index = if let Some(id_input_ref) = id_expr.clone().as_input_ref() { - let id_index = id_input_ref.index; - if table - .table_catalog - .pk() - .iter() - .any(|k| k.column_index == id_index) - { - return Err(ErrorCode::BindError( - "update modifying the PK column is unsupported".to_owned(), - ) - .into()); - } - if table - .table_catalog - .generated_col_idxes() - .contains(&id_index) - { - return Err(ErrorCode::BindError( - "update modifying the generated column is unsupported".to_owned(), - ) - .into()); + for Assignment { id, value } in assignments { + let ids: Vec<_> = id + .into_iter() + .map(|id| self.bind_expr(Expr::Identifier(id))) + .try_collect()?; + + match (ids.as_slice(), value) { + // `SET col1 = DEFAULT`, `SET (col1, col2, ...) = DEFAULT` + (ids, AssignmentValue::Default) => { + for id in ids { + let id_index = id.as_input_ref().unwrap().index; + let expr = default_columns_from_catalog + .get(&id_index) + .cloned() + .unwrap_or_else(|| ExprImpl::literal_null(id.return_type())); + + exprs.push(expr); + record!(id, UpdateProject::Simple(exprs.len() - 1)); } - if cols_refed_by_generated_pk.contains(id_index) { - return Err(ErrorCode::BindError( - "update modifying the column referenced by generated columns that are part of the primary key is not allowed".to_owned(), - ) - .into()); + } + + // `SET col1 = expr` + ([id], AssignmentValue::Expr(expr)) => { + let expr = self.bind_expr(expr)?.cast_assign(id.return_type())?; + exprs.push(expr); + record!(id, UpdateProject::Simple(exprs.len() - 1)); + } + // `SET (col1, col2, ...) 
= (val1, val2, ...)` + (ids, AssignmentValue::Expr(Expr::Row(values))) => { + if ids.len() != values.len() { + bail_bind_error!("number of columns does not match number of values"); } - id_index - } else { - unreachable!() - }; - - let value_expr = match value { - AssignmentValue::Expr(expr) => { - self.bind_expr(expr)?.cast_assign(id_expr.return_type())? + + for (id, value) in ids.iter().zip_eq_fast(values) { + let expr = self.bind_expr(value)?.cast_assign(id.return_type())?; + exprs.push(expr); + record!(id, UpdateProject::Simple(exprs.len() - 1)); } - AssignmentValue::Default => default_columns_from_catalog - .get(&id_index) - .cloned() - .unwrap_or_else(|| ExprImpl::literal_null(id_expr.return_type())), - }; - - match assignment_exprs.entry(id_expr) { - Entry::Occupied(_) => { - return Err(ErrorCode::BindError( - "multiple assignments to same column".to_owned(), - ) - .into()) + } + // `SET (col1, col2, ...) = (SELECT ...)` + (ids, AssignmentValue::Expr(Expr::Subquery(subquery))) => { + let expr = self.bind_subquery_expr(*subquery, SubqueryKind::UpdateSet)?; + + if expr.return_type().as_struct().len() != ids.len() { + bail_bind_error!("number of columns does not match number of values"); } - Entry::Vacant(v) => { - v.insert(value_expr); + + let target_type = DataType::new_unnamed_struct( + ids.iter().map(|id| id.return_type()).collect(), + ); + let expr = expr.cast_assign(target_type)?; + + exprs.push(expr); + + for (i, id) in ids.iter().enumerate() { + record!(id, UpdateProject::Composite(exprs.len() - 1, i)); } } + + (_ids, AssignmentValue::Expr(_expr)) => { + bail_bind_error!("source for a multiple-column UPDATE item must be a sub-SELECT or ROW() expression"); + } } } - let exprs = table - .table_catalog - .columns() - .iter() - .enumerate() - .filter_map(|(i, c)| { - c.can_dml() - .then_some(InputRef::new(i, c.data_type().clone()).into()) - }) - .map(|c| assignment_exprs.remove(&c).unwrap_or(c)) - .collect_vec(); + // Check whether updating these columns is allowed. 
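Together with the new update.yaml cases earlier in this diff, the Expr::Subquery arm above means that statements like the following (copied from the added planner tests) now bind and plan, where they previously failed with `Bind error: subquery on the right side of assignment is unsupported`:

    update t set v1 = (select 666);
    update t set v1 = (select generate_series(888, 888));
    update t set v1 = (select count(*) from t as source where source.v2 = t.v2);
    update t set (v1, v2) = (select 666.66, 777);

The loop that follows then applies the per-column checks: primary-key columns, generated columns, columns referenced by a generated primary key, and non-DML columns are still rejected.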
+ for &id_index in projects.keys() { + if (table.table_catalog.pk()) + .iter() + .any(|k| k.column_index == id_index) + { + return Err(ErrorCode::BindError( + "update modifying the PK column is unsupported".to_owned(), + ) + .into()); + } + if (table.table_catalog.generated_col_idxes()).contains(&id_index) { + return Err(ErrorCode::BindError( + "update modifying the generated column is unsupported".to_owned(), + ) + .into()); + } + if cols_refed_by_generated_pk.contains(id_index) { + return Err(ErrorCode::BindError( + "update modifying the column referenced by generated columns that are part of the primary key is not allowed".to_owned(), + ) + .into()); + } + + let col = &table.table_catalog.columns()[id_index]; + if !col.can_dml() { + bail_bind_error!("update modifying column `{}` is unsupported", col.name()); + } + } let (returning_list, fields) = self.bind_returning_list(returning_items)?; let returning = !returning_list.is_empty(); @@ -236,6 +264,7 @@ impl Binder { owner, table, selection, + projects, exprs, returning_list, returning_schema: if returning { diff --git a/src/frontend/src/catalog/function_catalog.rs b/src/frontend/src/catalog/function_catalog.rs index 8782fc10945a..6069a74b570c 100644 --- a/src/frontend/src/catalog/function_catalog.rs +++ b/src/frontend/src/catalog/function_catalog.rs @@ -32,13 +32,12 @@ pub struct FunctionCatalog { pub arg_types: Vec, pub return_type: DataType, pub language: String, + pub runtime: Option, pub identifier: Option, pub body: Option, pub link: Option, pub compressed_binary: Option>, pub always_retry_on_network_error: bool, - pub function_type: Option, - pub runtime: Option, } #[derive(Clone, Display, PartialEq, Eq, Hash, Debug, EnumAsInner)] @@ -71,13 +70,12 @@ impl From<&PbFunction> for FunctionCatalog { arg_types: prost.arg_types.iter().map(|arg| arg.into()).collect(), return_type: prost.return_type.as_ref().expect("no return type").into(), language: prost.language.clone(), + runtime: prost.runtime.clone(), identifier: prost.identifier.clone(), body: prost.body.clone(), link: prost.link.clone(), compressed_binary: prost.compressed_binary.clone(), always_retry_on_network_error: prost.always_retry_on_network_error, - function_type: prost.function_type.clone(), - runtime: prost.runtime.clone(), } } } @@ -89,12 +87,11 @@ impl From<&FunctionCatalog> for PbUserDefinedFunctionMetadata { arg_types: c.arg_types.iter().map(|t| t.to_protobuf()).collect(), return_type: Some(c.return_type.to_protobuf()), language: c.language.clone(), + runtime: c.runtime.clone(), link: c.link.clone(), identifier: c.identifier.clone(), body: c.body.clone(), compressed_binary: c.compressed_binary.clone(), - function_type: c.function_type.clone(), - runtime: c.runtime.clone(), } } } diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs index 9c546f1ec729..947560e44e62 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/mod.rs @@ -39,6 +39,7 @@ mod rw_indexes; mod rw_internal_tables; mod rw_materialized_views; mod rw_meta_snapshot; +mod rw_rate_limit; mod rw_relation_info; mod rw_relations; mod rw_schemas; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragments.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragments.rs index 91f818e7919f..75a040f2733c 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragments.rs +++ 
b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_fragments.rs @@ -32,7 +32,7 @@ struct RwFragment { max_parallelism: i32, } -fn extract_fragment_type_flag(mask: u32) -> Vec { +pub(super) fn extract_fragment_type_flag(mask: u32) -> Vec { let mut result = vec![]; for i in 0..32 { let bit = 1 << i; diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_rate_limit.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_rate_limit.rs new file mode 100644 index 000000000000..34602461ca3b --- /dev/null +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_rate_limit.rs @@ -0,0 +1,50 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use risingwave_common::types::Fields; +use risingwave_frontend_macro::system_catalog; + +use super::rw_fragments::extract_fragment_type_flag; +use crate::catalog::system_catalog::SysCatalogReaderImpl; +use crate::error::Result; + +#[derive(Fields)] +#[primary_key(fragment_id, node_name)] +struct RwRateLimit { + fragment_id: i32, + fragment_type: Vec, + node_name: String, + table_id: i32, + rate_limit: i32, +} + +#[system_catalog(table, "rw_catalog.rw_rate_limit")] +async fn read_rw_rate_limit(reader: &SysCatalogReaderImpl) -> Result> { + let rate_limits = reader.meta_client.list_rate_limits().await?; + + Ok(rate_limits + .into_iter() + .map(|info| RwRateLimit { + fragment_id: info.fragment_id as i32, + fragment_type: extract_fragment_type_flag(info.fragment_type_mask) + .into_iter() + .flat_map(|t| t.as_str_name().strip_prefix("FRAGMENT_TYPE_FLAG_")) + .map(|s| s.into()) + .collect(), + table_id: info.job_id as i32, + rate_limit: info.rate_limit as i32, + node_name: info.node_name, + }) + .collect()) +} diff --git a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_relation_info.rs b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_relation_info.rs index 83f6aa14bb0d..639897bf6cdc 100644 --- a/src/frontend/src/catalog/system_catalog/rw_catalog/rw_relation_info.rs +++ b/src/frontend/src/catalog/system_catalog/rw_catalog/rw_relation_info.rs @@ -57,6 +57,13 @@ async fn read_relation_info(reader: &SysCatalogReaderImpl) -> Result Result Result Result Result timestamptz")] +fn rw_epoch_to_ts(epoch: i64) -> Result { + Ok(Epoch(epoch as u64).as_timestamptz()) +} diff --git a/src/frontend/src/expr/pure.rs b/src/frontend/src/expr/pure.rs index 5e3bd968a46b..83a8cfa537ba 100644 --- a/src/frontend/src/expr/pure.rs +++ b/src/frontend/src/expr/pure.rs @@ -262,7 +262,8 @@ impl ExprVisitor for ImpureAnalyzer { | Type::MapDelete | Type::MapInsert | Type::MapLength - | Type::VnodeUser => + | Type::VnodeUser + |Type::RwEpochToTs => // expression output is deterministic(same result for the same input) { func_call diff --git a/src/frontend/src/expr/subquery.rs b/src/frontend/src/expr/subquery.rs index 62f59c934dd6..8460f73d5fbb 100644 --- a/src/frontend/src/expr/subquery.rs +++ b/src/frontend/src/expr/subquery.rs @@ -24,6 +24,9 @@ use crate::expr::{CorrelatedId, Depth}; pub enum SubqueryKind { /// Returns a 
scalar value (single column single row). Scalar, + /// Returns a scalar struct value composed of multiple columns. + /// Used in `UPDATE SET (col1, col2) = (SELECT ...)`. + UpdateSet, /// `EXISTS` | `NOT EXISTS` subquery (semi/anti-semi join). Returns a boolean. Existential, /// `IN` subquery. @@ -88,6 +91,7 @@ impl Expr for Subquery { assert_eq!(types.len(), 1, "Subquery with more than one column"); types[0].clone() } + SubqueryKind::UpdateSet => DataType::new_unnamed_struct(self.query.data_types()), SubqueryKind::Array => { let types = self.query.data_types(); assert_eq!(types.len(), 1, "Subquery with more than one column"); diff --git a/src/frontend/src/expr/table_function.rs b/src/frontend/src/expr/table_function.rs index 668f6d9b75aa..346e5481e22a 100644 --- a/src/frontend/src/expr/table_function.rs +++ b/src/frontend/src/expr/table_function.rs @@ -20,7 +20,9 @@ use mysql_async::consts::ColumnType as MySqlColumnType; use mysql_async::prelude::*; use risingwave_common::array::arrow::IcebergArrowConvert; use risingwave_common::types::{DataType, ScalarImpl, StructType}; -use risingwave_connector::source::iceberg::{create_parquet_stream_builder, list_s3_directory}; +use risingwave_connector::source::iceberg::{ + get_parquet_fields, list_s3_directory, new_s3_operator, +}; pub use risingwave_pb::expr::table_function::PbType as TableFunctionType; use risingwave_pb::expr::PbTableFunction; use thiserror_ext::AsReport; @@ -175,7 +177,7 @@ impl TableFunction { let schema = tokio::task::block_in_place(|| { FRONTEND_RUNTIME.block_on(async { - let parquet_stream_builder = create_parquet_stream_builder( + let op = new_s3_operator( eval_args[2].clone(), eval_args[3].clone(), eval_args[4].clone(), @@ -183,11 +185,18 @@ impl TableFunction { Some(files) => files[0].clone(), None => eval_args[5].clone(), }, + )?; + let fields = get_parquet_fields( + op, + match files.as_ref() { + Some(files) => files[0].clone(), + None => eval_args[5].clone(), + }, ) .await?; let mut rw_types = vec![]; - for field in parquet_stream_builder.schema().fields() { + for field in &fields { rw_types.push(( field.name().to_string(), IcebergArrowConvert.type_from_field(field)?, diff --git a/src/frontend/src/expr/type_inference/func.rs b/src/frontend/src/expr/type_inference/func.rs index 719484b959f3..94a21bb39288 100644 --- a/src/frontend/src/expr/type_inference/func.rs +++ b/src/frontend/src/expr/type_inference/func.rs @@ -841,7 +841,7 @@ fn implicit_ok(source: &DataType, target: &SigDataType, eq_ok: bool) -> bool { /// Find the top `candidates` that match `inputs` on most non-null positions. This covers Rule 2, /// 4a, 4c and 4d in [`PostgreSQL`](https://www.postgresql.org/docs/current/typeconv-func.html). /// -/// * Rule 2 & 4c: Keep candidates that have most exact type matches. Exact match on all posistions +/// * Rule 2 & 4c: Keep candidates that have most exact type matches. Exact match on all positions /// is just a special case. /// * Rule 4d: Break ties by selecting those that accept preferred types at most positions. /// * Rule 4a: If the input cannot implicit cast to expected type at any position, this candidate is diff --git a/src/frontend/src/expr/user_defined_function.rs b/src/frontend/src/expr/user_defined_function.rs index 44abfa1859c4..084fe7387d76 100644 --- a/src/frontend/src/expr/user_defined_function.rs +++ b/src/frontend/src/expr/user_defined_function.rs @@ -47,21 +47,20 @@ impl UserDefinedFunction { let catalog = FunctionCatalog { // FIXME(yuhao): function id is not in udf proto. 
id: FunctionId::placeholder(), - name: udf.get_name().clone(), + name: udf.name.clone(), // FIXME(yuhao): owner is not in udf proto. owner: u32::MAX - 1, kind: FunctionKind::Scalar, arg_names: udf.arg_names.clone(), arg_types, return_type, - language: udf.get_language().clone(), + language: udf.language.clone(), + runtime: udf.runtime.clone(), identifier: udf.identifier.clone(), body: udf.body.clone(), link: udf.link.clone(), compressed_binary: udf.compressed_binary.clone(), always_retry_on_network_error: udf.always_retry_on_network_error, - function_type: udf.function_type.clone(), - runtime: udf.runtime.clone(), }; Ok(Self { @@ -93,13 +92,12 @@ impl Expr for UserDefinedFunction { .map(|t| t.to_protobuf()) .collect(), language: self.catalog.language.clone(), + runtime: self.catalog.runtime.clone(), identifier: self.catalog.identifier.clone(), link: self.catalog.link.clone(), body: self.catalog.body.clone(), compressed_binary: self.catalog.compressed_binary.clone(), always_retry_on_network_error: self.catalog.always_retry_on_network_error, - function_type: self.catalog.function_type.clone(), - runtime: self.catalog.runtime.clone(), })), } } diff --git a/src/frontend/src/handler/alter_table_column.rs b/src/frontend/src/handler/alter_table_column.rs index cf55b82a4750..19f8355a77bc 100644 --- a/src/frontend/src/handler/alter_table_column.rs +++ b/src/frontend/src/handler/alter_table_column.rs @@ -180,6 +180,7 @@ pub async fn get_replace_table_plan( wildcard_idx, cdc_table_info, format_encode, + include_column_options, .. } = new_definition else { @@ -206,6 +207,7 @@ pub async fn get_replace_table_plan( with_version_column, cdc_table_info, new_version_columns, + include_column_options, ) .await?; diff --git a/src/frontend/src/handler/create_aggregate.rs b/src/frontend/src/handler/create_aggregate.rs index b9b8a391eaff..fe7460ff0997 100644 --- a/src/frontend/src/handler/create_aggregate.rs +++ b/src/frontend/src/handler/create_aggregate.rs @@ -52,6 +52,16 @@ pub async fn handle_create_aggregate( None => return Err(ErrorCode::InvalidParameterValue("no language".into()).into()), }; + let runtime = match params.runtime { + Some(_) => { + return Err(ErrorCode::InvalidParameterValue( + "runtime selection is currently not supported".to_string(), + ) + .into()); + } + None => None, + }; + let return_type = bind_data_type(&returns)?; let mut arg_names = vec![]; @@ -94,13 +104,6 @@ pub async fn handle_create_aggregate( } _ => None, }; - let function_type = match params.function_type { - Some(CreateFunctionType::Sync) => Some("sync".to_string()), - Some(CreateFunctionType::Async) => Some("async".to_string()), - Some(CreateFunctionType::Generator) => Some("generator".to_string()), - Some(CreateFunctionType::AsyncGenerator) => Some("async_generator".to_string()), - None => None, - }; let create_fn = risingwave_expr::sig::find_udf_impl(&language, None, link)?.create_fn; let output = create_fn(CreateFunctionOptions { @@ -124,14 +127,13 @@ pub async fn handle_create_aggregate( arg_types: arg_types.into_iter().map(|t| t.into()).collect(), return_type: Some(return_type.into()), language, + runtime, identifier: Some(output.identifier), link: link.map(|s| s.to_string()), body: output.body, compressed_binary: output.compressed_binary, owner: session.user_id(), always_retry_on_network_error: false, - runtime: None, - function_type, }; let catalog_writer = session.catalog_writer()?; diff --git a/src/frontend/src/handler/create_function.rs b/src/frontend/src/handler/create_function.rs index 
ccd83a13ed81..b81d2b4514ed 100644 --- a/src/frontend/src/handler/create_function.rs +++ b/src/frontend/src/handler/create_function.rs @@ -60,15 +60,11 @@ pub async fn handle_create_function( }; let runtime = match params.runtime { - Some(runtime) => { - if language == "javascript" { - Some(runtime.real_value()) - } else { - return Err(ErrorCode::InvalidParameterValue( - "runtime is only supported for javascript".to_string(), - ) - .into()); - } + Some(_) => { + return Err(ErrorCode::InvalidParameterValue( + "runtime selection is currently not supported".to_string(), + ) + .into()); } None => None, }; @@ -141,13 +137,6 @@ pub async fn handle_create_function( } _ => None, }; - let function_type = match params.function_type { - Some(CreateFunctionType::Sync) => Some("sync".to_string()), - Some(CreateFunctionType::Async) => Some("async".to_string()), - Some(CreateFunctionType::Generator) => Some("generator".to_string()), - Some(CreateFunctionType::AsyncGenerator) => Some("async_generator".to_string()), - None => None, - }; let create_fn = risingwave_expr::sig::find_udf_impl(&language, runtime.as_deref(), link)?.create_fn; @@ -176,6 +165,7 @@ pub async fn handle_create_function( arg_types: arg_types.into_iter().map(|t| t.into()).collect(), return_type: Some(return_type.into()), language, + runtime, identifier: Some(output.identifier), link: link.map(|s| s.to_string()), body: output.body, @@ -184,8 +174,6 @@ pub async fn handle_create_function( always_retry_on_network_error: with_options .always_retry_on_network_error .unwrap_or_default(), - runtime, - function_type, }; let catalog_writer = session.catalog_writer()?; diff --git a/src/frontend/src/handler/create_sink.rs b/src/frontend/src/handler/create_sink.rs index c33aeccb81e1..e280f9090926 100644 --- a/src/frontend/src/handler/create_sink.rs +++ b/src/frontend/src/handler/create_sink.rs @@ -22,10 +22,10 @@ use maplit::{convert_args, hashmap}; use pgwire::pg_response::{PgResponse, StatementType}; use risingwave_common::array::arrow::arrow_schema_iceberg::DataType as ArrowDataType; use risingwave_common::array::arrow::IcebergArrowConvert; -use risingwave_common::catalog::{ColumnCatalog, DatabaseId, Schema, SchemaId, TableId, UserId}; +use risingwave_common::bail; +use risingwave_common::catalog::{ColumnCatalog, DatabaseId, Schema, SchemaId, UserId}; use risingwave_common::secret::LocalSecretManager; use risingwave_common::types::DataType; -use risingwave_common::{bail, catalog}; use risingwave_connector::sink::catalog::{SinkCatalog, SinkFormatDesc, SinkType}; use risingwave_connector::sink::iceberg::{IcebergConfig, ICEBERG_SINK}; use risingwave_connector::sink::kafka::KAFKA_SINK; @@ -47,9 +47,6 @@ use super::create_source::UPSTREAM_SOURCE_KEY; use super::util::gen_query_from_table_name; use super::RwPgResponse; use crate::binder::Binder; -use crate::catalog::catalog_service::CatalogReadGuard; -use crate::catalog::source_catalog::SourceCatalog; -use crate::catalog::view_catalog::ViewCatalog; use crate::catalog::SinkId; use crate::error::{ErrorCode, Result, RwError}; use crate::expr::{rewrite_now_to_proctime, ExprImpl, InputRef}; @@ -453,8 +450,6 @@ pub async fn handle_create_sink( if let Some(table_catalog) = target_table_catalog { use crate::handler::alter_table_column::hijack_merger_for_target_table; - check_cycle_for_sink(session.as_ref(), sink.clone(), table_catalog.id())?; - let (mut graph, mut table, source) = reparse_table_for_sink(&session, &table_catalog).await?; @@ -530,112 +525,6 @@ pub fn fetch_incoming_sinks( Ok(sinks) } -fn 
check_cycle_for_sink( - session: &SessionImpl, - sink_catalog: SinkCatalog, - table_id: catalog::TableId, -) -> Result<()> { - let reader = session.env().catalog_reader().read_guard(); - - let mut sinks = HashMap::new(); - let mut sources = HashMap::new(); - let mut views = HashMap::new(); - let db_name = session.database(); - for schema in reader.iter_schemas(db_name)? { - for sink in schema.iter_sink() { - sinks.insert(sink.id.sink_id, sink.as_ref()); - } - - for source in schema.iter_source() { - sources.insert(source.id, source.as_ref()); - } - - for view in schema.iter_view() { - views.insert(view.id, view.as_ref()); - } - } - - struct Context<'a> { - reader: &'a CatalogReadGuard, - sink_index: &'a HashMap, - source_index: &'a HashMap, - view_index: &'a HashMap, - } - - impl Context<'_> { - fn visit_table( - &self, - table: &TableCatalog, - target_table_id: catalog::TableId, - path: &mut Vec, - ) -> Result<()> { - if table.id == target_table_id { - path.reverse(); - path.push(table.name.clone()); - return Err(RwError::from(ErrorCode::BindError( - format!( - "Creating such a sink will result in circular dependency, path = [{}]", - path.join(", ") - ) - .to_string(), - ))); - } - - for sink_id in &table.incoming_sinks { - if let Some(sink) = self.sink_index.get(sink_id) { - path.push(sink.name.clone()); - self.visit_dependent_jobs(&sink.dependent_relations, target_table_id, path)?; - path.pop(); - } else { - bail!("sink not found: {:?}", sink_id); - } - } - - self.visit_dependent_jobs(&table.dependent_relations, target_table_id, path)?; - - Ok(()) - } - - fn visit_dependent_jobs( - &self, - dependent_jobs: &[TableId], - target_table_id: TableId, - path: &mut Vec, - ) -> Result<()> { - for table_id in dependent_jobs { - if let Ok(table) = self.reader.get_any_table_by_id(table_id) { - path.push(table.name.clone()); - self.visit_table(table.as_ref(), target_table_id, path)?; - path.pop(); - } else if self.source_index.contains_key(&table_id.table_id) - || self.view_index.contains_key(&table_id.table_id) - { - continue; - } else { - bail!("streaming job not found: {:?}", table_id); - } - } - - Ok(()) - } - } - - let mut path = vec![]; - - path.push(sink_catalog.name.clone()); - - let ctx = Context { - reader: &reader, - sink_index: &sinks, - source_index: &sources, - view_index: &views, - }; - - ctx.visit_dependent_jobs(&sink_catalog.dependent_relations, table_id, &mut path)?; - - Ok(()) -} - pub(crate) async fn reparse_table_for_sink( session: &Arc, table_catalog: &Arc, @@ -670,6 +559,7 @@ pub(crate) async fn reparse_table_for_sink( append_only, on_conflict, with_version_column, + include_column_options, .. 
} = definition else { @@ -692,6 +582,7 @@ pub(crate) async fn reparse_table_for_sink( with_version_column, None, None, + include_column_options, ) .await?; diff --git a/src/frontend/src/handler/create_sql_function.rs b/src/frontend/src/handler/create_sql_function.rs index 9b5d34c34abe..c733f603a3c4 100644 --- a/src/frontend/src/handler/create_sql_function.rs +++ b/src/frontend/src/handler/create_sql_function.rs @@ -336,14 +336,13 @@ pub async fn handle_create_sql_function( arg_types: arg_types.into_iter().map(|t| t.into()).collect(), return_type: Some(return_type.into()), language, + runtime: None, identifier: None, body: Some(body), compressed_binary: None, link: None, owner: session.user_id(), always_retry_on_network_error: false, - runtime: None, - function_type: None, }; let catalog_writer = session.catalog_writer()?; diff --git a/src/frontend/src/handler/create_table.rs b/src/frontend/src/handler/create_table.rs index 6118ba5ccd36..ff2e41037078 100644 --- a/src/frontend/src/handler/create_table.rs +++ b/src/frontend/src/handler/create_table.rs @@ -1310,6 +1310,7 @@ pub async fn generate_stream_graph_for_replace_table( with_version_column: Option, cdc_table_info: Option, new_version_columns: Option>, + include_column_options: IncludeOption, ) -> Result<(StreamFragmentGraph, Table, Option, TableJobType)> { use risingwave_pb::catalog::table::OptionalAssociatedSourceId; @@ -1328,7 +1329,7 @@ pub async fn generate_stream_graph_for_replace_table( append_only, on_conflict, with_version_column, - vec![], + include_column_options, ) .await?, TableJobType::General, diff --git a/src/frontend/src/handler/declare_cursor.rs b/src/frontend/src/handler/declare_cursor.rs index 8c521be2adac..e13a0e2c4e91 100644 --- a/src/frontend/src/handler/declare_cursor.rs +++ b/src/frontend/src/handler/declare_cursor.rs @@ -136,6 +136,23 @@ async fn handle_declare_query_cursor( Ok(PgResponse::empty_result(StatementType::DECLARE_CURSOR)) } +pub async fn handle_bound_declare_query_cursor( + handle_args: HandlerArgs, + cursor_name: ObjectName, + plan_fragmenter_result: BatchPlanFragmenterResult, +) -> Result { + let session = handle_args.session.clone(); + let (chunk_stream, fields) = + create_chunk_stream_for_cursor(session, plan_fragmenter_result).await?; + + handle_args + .session + .get_cursor_manager() + .add_query_cursor(cursor_name, chunk_stream, fields) + .await?; + Ok(PgResponse::empty_result(StatementType::DECLARE_CURSOR)) +} + pub async fn create_stream_for_cursor_stmt( handle_args: HandlerArgs, stmt: Statement, diff --git a/src/frontend/src/handler/explain.rs b/src/frontend/src/handler/explain.rs index 018b03feebe5..20a7036dc148 100644 --- a/src/frontend/src/handler/explain.rs +++ b/src/frontend/src/handler/explain.rs @@ -232,6 +232,7 @@ async fn do_handle_explain( ExplainFormat::Json => blocks.push(plan.explain_to_json()), ExplainFormat::Xml => blocks.push(plan.explain_to_xml()), ExplainFormat::Yaml => blocks.push(plan.explain_to_yaml()), + ExplainFormat::Dot => blocks.push(plan.explain_to_dot()), } } } diff --git a/src/frontend/src/handler/extended_handle.rs b/src/frontend/src/handler/extended_handle.rs index f12eaa617352..720e317bcdc9 100644 --- a/src/frontend/src/handler/extended_handle.rs +++ b/src/frontend/src/handler/extended_handle.rs @@ -20,7 +20,7 @@ use bytes::Bytes; use pgwire::types::Format; use risingwave_common::bail_not_implemented; use risingwave_common::types::DataType; -use risingwave_sqlparser::ast::{CreateSink, Query, Statement}; +use risingwave_sqlparser::ast::{CreateSink, 
DeclareCursor, Query, Statement}; use super::query::BoundResult; use super::{fetch_cursor, handle, query, HandlerArgs, RwPgResponse}; @@ -112,6 +112,13 @@ pub async fn handle_parse( Statement::FetchCursor { .. } => { fetch_cursor::handle_parse(handler_args, statement, specific_param_types).await } + Statement::DeclareCursor { stmt } => { + if let DeclareCursor::Query(_) = stmt.declare_cursor { + query::handle_parse(handler_args, statement, specific_param_types) + } else { + bail_not_implemented!("DECLARE SUBSCRIPTION CURSOR with parameters"); + } + } Statement::CreateView { query, materialized, diff --git a/src/frontend/src/handler/privilege.rs b/src/frontend/src/handler/privilege.rs index ff47dac4af86..c6d74affc9f8 100644 --- a/src/frontend/src/handler/privilege.rs +++ b/src/frontend/src/handler/privilege.rs @@ -115,6 +115,9 @@ pub(crate) fn resolve_privileges(stmt: &BoundStatement) -> Vec objects.push(object); } BoundStatement::Query(ref query) => objects.extend(resolve_query_privileges(query)), + BoundStatement::DeclareCursor(ref declare_cursor) => { + objects.extend(resolve_query_privileges(&declare_cursor.query)) + } BoundStatement::FetchCursor(_) => unimplemented!(), BoundStatement::CreateView(ref create_view) => { objects.extend(resolve_query_privileges(&create_view.query)) diff --git a/src/frontend/src/handler/query.rs b/src/frontend/src/handler/query.rs index a8201d3c40cc..66a0dbcf7ca4 100644 --- a/src/frontend/src/handler/query.rs +++ b/src/frontend/src/handler/query.rs @@ -28,7 +28,7 @@ use risingwave_common::types::{DataType, Datum}; use risingwave_sqlparser::ast::{SetExpr, Statement}; use super::extended_handle::{PortalResult, PrepareStatement, PreparedResult}; -use super::{create_mv, PgResponseStream, RwPgResponse}; +use super::{create_mv, declare_cursor, PgResponseStream, RwPgResponse}; use crate::binder::{Binder, BoundCreateView, BoundStatement}; use crate::catalog::TableId; use crate::error::{ErrorCode, Result, RwError}; @@ -149,6 +149,20 @@ pub async fn handle_execute( ) .await } + Statement::DeclareCursor { stmt } => { + let session = handler_args.session.clone(); + let plan_fragmenter_result = { + let context = OptimizerContext::from_handler_args(handler_args.clone()); + let plan_result = gen_batch_query_plan(&session, context.into(), bound_result)?; + gen_batch_plan_fragmenter(&session, plan_result)? 
+ }; + declare_cursor::handle_bound_declare_query_cursor( + handler_args, + stmt.cursor_name, + plan_fragmenter_result, + ) + .await + } _ => unreachable!(), } } diff --git a/src/frontend/src/lib.rs b/src/frontend/src/lib.rs index 9f29fd82e066..bf03edb6a87d 100644 --- a/src/frontend/src/lib.rs +++ b/src/frontend/src/lib.rs @@ -33,6 +33,7 @@ #![feature(error_generic_member_access)] #![feature(iterator_try_collect)] #![feature(used_with_arg)] +#![feature(try_trait_v2)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/src/frontend/src/meta_client.rs b/src/frontend/src/meta_client.rs index a91a0d8abc87..760c7bd450e1 100644 --- a/src/frontend/src/meta_client.rs +++ b/src/frontend/src/meta_client.rs @@ -33,6 +33,7 @@ use risingwave_pb::meta::list_actor_splits_response::ActorSplit; use risingwave_pb::meta::list_actor_states_response::ActorState; use risingwave_pb::meta::list_fragment_distribution_response::FragmentDistribution; use risingwave_pb::meta::list_object_dependencies_response::PbObjectDependencies; +use risingwave_pb::meta::list_rate_limits_response::RateLimitInfo; use risingwave_pb::meta::list_table_fragment_states_response::TableFragmentState; use risingwave_pb::meta::list_table_fragments_response::TableFragmentInfo; use risingwave_pb::meta::{EventLog, PbThrottleTarget, RecoveryStatus}; @@ -125,6 +126,8 @@ pub trait FrontendMetaClient: Send + Sync { async fn get_cluster_recovery_status(&self) -> Result; async fn get_cluster_limits(&self) -> Result>; + + async fn list_rate_limits(&self) -> Result>; } pub struct FrontendMetaClientImpl(pub MetaClient); @@ -300,4 +303,8 @@ impl FrontendMetaClient for FrontendMetaClientImpl { async fn get_cluster_limits(&self) -> Result> { self.0.get_cluster_limits().await } + + async fn list_rate_limits(&self) -> Result> { + self.0.list_rate_limits().await + } } diff --git a/src/frontend/src/optimizer/heuristic_optimizer.rs b/src/frontend/src/optimizer/heuristic_optimizer.rs index 1e8ee5eb068a..bb0513521068 100644 --- a/src/frontend/src/optimizer/heuristic_optimizer.rs +++ b/src/frontend/src/optimizer/heuristic_optimizer.rs @@ -18,6 +18,8 @@ use std::fmt; use itertools::Itertools; +use super::ApplyResult; +use crate::error::Result; use crate::optimizer::plan_node::PlanTreeNode; use crate::optimizer::rule::BoxedRule; use crate::optimizer::PlanRef; @@ -48,41 +50,46 @@ impl<'a> HeuristicOptimizer<'a> { } } - fn optimize_node(&mut self, mut plan: PlanRef) -> PlanRef { + fn optimize_node(&mut self, mut plan: PlanRef) -> Result { for rule in self.rules { - if let Some(applied) = rule.apply(plan.clone()) { - #[cfg(debug_assertions)] - Self::check_equivalent_plan(rule.description(), &plan, &applied); - - plan = applied; - self.stats.count_rule(rule); + match rule.apply(plan.clone()) { + ApplyResult::Ok(applied) => { + #[cfg(debug_assertions)] + Self::check_equivalent_plan(rule.description(), &plan, &applied); + + plan = applied; + self.stats.count_rule(rule); + } + ApplyResult::NotApplicable => {} + ApplyResult::Err(error) => return Err(error), } } - plan + Ok(plan) } - fn optimize_inputs(&mut self, plan: PlanRef) -> PlanRef { + fn optimize_inputs(&mut self, plan: PlanRef) -> Result { let pre_applied = self.stats.total_applied(); - let inputs = plan + let inputs: Vec<_> = plan .inputs() .into_iter() .map(|sub_tree| self.optimize(sub_tree)) - .collect_vec(); - if pre_applied != self.stats.total_applied() { + .try_collect()?; + + Ok(if pre_applied != self.stats.total_applied() { plan.clone_with_inputs(&inputs) } else { plan - } + }) } - pub fn optimize(&mut 
self, mut plan: PlanRef) -> PlanRef { + pub fn optimize(&mut self, mut plan: PlanRef) -> Result { match self.apply_order { ApplyOrder::TopDown => { - plan = self.optimize_node(plan); + plan = self.optimize_node(plan)?; self.optimize_inputs(plan) } ApplyOrder::BottomUp => { - plan = self.optimize_inputs(plan); + plan = self.optimize_inputs(plan)?; self.optimize_node(plan) } } diff --git a/src/frontend/src/optimizer/logical_optimization.rs b/src/frontend/src/optimizer/logical_optimization.rs index dc2d1e98bb96..0d6e2356c901 100644 --- a/src/frontend/src/optimizer/logical_optimization.rs +++ b/src/frontend/src/optimizer/logical_optimization.rs @@ -14,6 +14,7 @@ use itertools::Itertools; use risingwave_common::bail; +use thiserror_ext::AsReport as _; use super::plan_node::RewriteExprsRecursive; use super::plan_rewriter::IcebergMergeOnReadRewriter; @@ -36,53 +37,53 @@ use crate::utils::Condition; use crate::{Explain, OptimizerContextRef}; impl PlanRef { - pub(crate) fn optimize_by_rules(self, stage: &OptimizationStage) -> PlanRef { - let OptimizationStage { - stage_name, - rules, - apply_order, - } = stage; + fn optimize_by_rules_inner( + self, + heuristic_optimizer: &mut HeuristicOptimizer<'_>, + stage_name: &str, + ) -> Result { + let ctx = self.ctx(); - let mut heuristic_optimizer = HeuristicOptimizer::new(apply_order, rules); - let plan = heuristic_optimizer.optimize(self); + let result = heuristic_optimizer.optimize(self); let stats = heuristic_optimizer.get_stats(); - let ctx = plan.ctx(); - let explain_trace = ctx.is_explain_trace(); - if explain_trace && stats.has_applied_rule() { + if ctx.is_explain_trace() && stats.has_applied_rule() { ctx.trace(format!("{}:", stage_name)); ctx.trace(format!("{}", stats)); - ctx.trace(plan.explain_to_string()); + ctx.trace(match &result { + Ok(plan) => plan.explain_to_string(), + Err(error) => format!("Optimization failed: {}", error.as_report()), + }); } ctx.add_rule_applied(stats.total_applied()); - plan + result } - pub(crate) fn optimize_by_rules_until_fix_point(self, stage: &OptimizationStage) -> PlanRef { - let OptimizationStage { + pub(crate) fn optimize_by_rules( + self, + OptimizationStage { stage_name, rules, apply_order, - } = stage; + }: &OptimizationStage, + ) -> Result { + self.optimize_by_rules_inner(&mut HeuristicOptimizer::new(apply_order, rules), stage_name) + } - let mut output_plan = self; + pub(crate) fn optimize_by_rules_until_fix_point( + mut self, + OptimizationStage { + stage_name, + rules, + apply_order, + }: &OptimizationStage, + ) -> Result { loop { let mut heuristic_optimizer = HeuristicOptimizer::new(apply_order, rules); - output_plan = heuristic_optimizer.optimize(output_plan); - let stats = heuristic_optimizer.get_stats(); - - let ctx = output_plan.ctx(); - let explain_trace = ctx.is_explain_trace(); - if explain_trace && stats.has_applied_rule() { - ctx.trace(format!("{}:", stage_name)); - ctx.trace(format!("{}", stats)); - ctx.trace(output_plan.explain_to_string()); - } - ctx.add_rule_applied(stats.total_applied()); - - if !stats.has_applied_rule() { - return output_plan; + self = self.optimize_by_rules_inner(&mut heuristic_optimizer, stage_name)?; + if !heuristic_optimizer.get_stats().has_applied_rule() { + return Ok(self); } } } @@ -494,22 +495,22 @@ impl LogicalOptimizer { return Ok(plan); } // Simple Unnesting. 
- plan = plan.optimize_by_rules(&SIMPLE_UNNESTING); + plan = plan.optimize_by_rules(&SIMPLE_UNNESTING)?; debug_assert!(!HasMaxOneRowApply().visit(plan.clone())); // Predicate push down before translate apply, because we need to calculate the domain // and predicate push down can reduce the size of domain. plan = Self::predicate_pushdown(plan, explain_trace, ctx); // In order to unnest values with correlated input ref, we need to extract project first. - plan = plan.optimize_by_rules(&VALUES_EXTRACT_PROJECT); + plan = plan.optimize_by_rules(&VALUES_EXTRACT_PROJECT)?; // General Unnesting. // Translate Apply, push Apply down the plan and finally replace Apply with regular inner // join. plan = if enable_share_plan { - plan.optimize_by_rules(&GENERAL_UNNESTING_TRANS_APPLY_WITH_SHARE) + plan.optimize_by_rules(&GENERAL_UNNESTING_TRANS_APPLY_WITH_SHARE)? } else { - plan.optimize_by_rules(&GENERAL_UNNESTING_TRANS_APPLY_WITHOUT_SHARE) + plan.optimize_by_rules(&GENERAL_UNNESTING_TRANS_APPLY_WITHOUT_SHARE)? }; - plan = plan.optimize_by_rules_until_fix_point(&GENERAL_UNNESTING_PUSH_DOWN_APPLY); + plan = plan.optimize_by_rules_until_fix_point(&GENERAL_UNNESTING_PUSH_DOWN_APPLY)?; // Check if all `Apply`s are eliminated and the subquery is unnested. plan.check_apply_elimination()?; @@ -572,9 +573,9 @@ impl LogicalOptimizer { } // Convert grouping sets at first because other agg rule can't handle grouping sets. - plan = plan.optimize_by_rules(&GROUPING_SETS); + plan = plan.optimize_by_rules(&GROUPING_SETS)?; // Remove project to make common sub-plan sharing easier. - plan = plan.optimize_by_rules(&PROJECT_REMOVE); + plan = plan.optimize_by_rules(&PROJECT_REMOVE)?; // If share plan is disable, we need to remove all the share operator generated by the // binder, e.g. CTE and View. However, we still need to share source to ensure self @@ -589,7 +590,7 @@ impl LogicalOptimizer { ctx.trace(plan.explain_to_string()); } } else { - plan = plan.optimize_by_rules(&DAG_TO_TREE); + plan = plan.optimize_by_rules(&DAG_TO_TREE)?; // Replace source to share source. // Perform share source at the beginning so that we can benefit from predicate pushdown @@ -600,13 +601,13 @@ impl LogicalOptimizer { ctx.trace(plan.explain_to_string()); } } - plan = plan.optimize_by_rules(&SET_OPERATION_MERGE); - plan = plan.optimize_by_rules(&SET_OPERATION_TO_JOIN); + plan = plan.optimize_by_rules(&SET_OPERATION_MERGE)?; + plan = plan.optimize_by_rules(&SET_OPERATION_TO_JOIN)?; // Convert `generate_series` ends with `now()` to a `Now` source. Only for streaming mode. // Should be applied before converting table function to project set. - plan = plan.optimize_by_rules(&STREAM_GENERATE_SERIES_WITH_NOW); + plan = plan.optimize_by_rules(&STREAM_GENERATE_SERIES_WITH_NOW)?; // In order to unnest a table function, we need to convert it into a `project_set` first. 
- plan = plan.optimize_by_rules(&TABLE_FUNCTION_CONVERT); + plan = plan.optimize_by_rules(&TABLE_FUNCTION_CONVERT)?; plan = Self::subquery_unnesting(plan, enable_share_plan, explain_trace, &ctx)?; if has_logical_max_one_row(plan.clone()) { @@ -618,7 +619,7 @@ impl LogicalOptimizer { // Same to batch plan optimization, this rule shall be applied before // predicate push down - plan = plan.optimize_by_rules(&LOGICAL_FILTER_EXPRESSION_SIMPLIFY); + plan = plan.optimize_by_rules(&LOGICAL_FILTER_EXPRESSION_SIMPLIFY)?; // Predicate Push-down plan = Self::predicate_pushdown(plan, explain_trace, &ctx); @@ -627,7 +628,7 @@ impl LogicalOptimizer { // Merge inner joins and intermediate filters into multijoin // This rule assumes that filters have already been pushed down near to // their relevant joins. - plan = plan.optimize_by_rules(&TO_MULTI_JOIN); + plan = plan.optimize_by_rules(&TO_MULTI_JOIN)?; // Reorder multijoin into join tree. if plan @@ -636,9 +637,9 @@ impl LogicalOptimizer { .config() .streaming_enable_bushy_join() { - plan = plan.optimize_by_rules(&BUSHY_TREE_JOIN_ORDERING); + plan = plan.optimize_by_rules(&BUSHY_TREE_JOIN_ORDERING)?; } else { - plan = plan.optimize_by_rules(&LEFT_DEEP_JOIN_ORDERING); + plan = plan.optimize_by_rules(&LEFT_DEEP_JOIN_ORDERING)?; } } @@ -647,38 +648,38 @@ impl LogicalOptimizer { plan = Self::predicate_pushdown(plan, explain_trace, &ctx); // For stream, push down predicates with now into a left-semi join - plan = plan.optimize_by_rules(&FILTER_WITH_NOW_TO_JOIN); + plan = plan.optimize_by_rules(&FILTER_WITH_NOW_TO_JOIN)?; // Push down the calculation of inputs of join's condition. - plan = plan.optimize_by_rules(&PUSH_CALC_OF_JOIN); + plan = plan.optimize_by_rules(&PUSH_CALC_OF_JOIN)?; - plan = plan.optimize_by_rules(&SPLIT_OVER_WINDOW); + plan = plan.optimize_by_rules(&SPLIT_OVER_WINDOW)?; // Must push down predicates again after split over window so that OverWindow can be // optimized to TopN. plan = Self::predicate_pushdown(plan, explain_trace, &ctx); - plan = plan.optimize_by_rules(&CONVERT_OVER_WINDOW); - plan = plan.optimize_by_rules(&MERGE_OVER_WINDOW); + plan = plan.optimize_by_rules(&CONVERT_OVER_WINDOW)?; + plan = plan.optimize_by_rules(&MERGE_OVER_WINDOW)?; let force_split_distinct_agg = ctx.session_ctx().config().force_split_distinct_agg(); // TODO: better naming of the OptimizationStage // Convert distinct aggregates. plan = if force_split_distinct_agg { - plan.optimize_by_rules(&CONVERT_DISTINCT_AGG_FOR_BATCH) + plan.optimize_by_rules(&CONVERT_DISTINCT_AGG_FOR_BATCH)? } else { - plan.optimize_by_rules(&CONVERT_DISTINCT_AGG_FOR_STREAM) + plan.optimize_by_rules(&CONVERT_DISTINCT_AGG_FOR_STREAM)? }; - plan = plan.optimize_by_rules(&SIMPLIFY_AGG); + plan = plan.optimize_by_rules(&SIMPLIFY_AGG)?; - plan = plan.optimize_by_rules(&JOIN_COMMUTE); + plan = plan.optimize_by_rules(&JOIN_COMMUTE)?; // Do a final column pruning and predicate pushing down to clean up the plan. 
plan = Self::column_pruning(plan, explain_trace, &ctx); plan = Self::predicate_pushdown(plan, explain_trace, &ctx); - plan = plan.optimize_by_rules(&PROJECT_REMOVE); + plan = plan.optimize_by_rules(&PROJECT_REMOVE)?; - plan = plan.optimize_by_rules(&COMMON_SUB_EXPR_EXTRACT); + plan = plan.optimize_by_rules(&COMMON_SUB_EXPR_EXTRACT)?; #[cfg(debug_assertions)] InputRefValidator.validate(plan.clone()); @@ -697,6 +698,9 @@ impl LogicalOptimizer { ExplainFormat::Yaml => { ctx.store_logical(plan.explain_to_yaml()); } + ExplainFormat::Dot => { + ctx.store_logical(plan.explain_to_dot()); + } } } @@ -716,28 +720,28 @@ impl LogicalOptimizer { plan = Self::inline_now_proc_time(plan, &ctx); // Convert the dag back to the tree, because we don't support DAG plan for batch. - plan = plan.optimize_by_rules(&DAG_TO_TREE); + plan = plan.optimize_by_rules(&DAG_TO_TREE)?; plan = IcebergMergeOnReadRewriter::rewrite(plan)?; - plan = plan.optimize_by_rules(&REWRITE_SOURCE_FOR_BATCH); - plan = plan.optimize_by_rules(&GROUPING_SETS); - plan = plan.optimize_by_rules(&REWRITE_LIKE_EXPR); - plan = plan.optimize_by_rules(&SET_OPERATION_MERGE); - plan = plan.optimize_by_rules(&SET_OPERATION_TO_JOIN); - plan = plan.optimize_by_rules(&ALWAYS_FALSE_FILTER); + plan = plan.optimize_by_rules(&REWRITE_SOURCE_FOR_BATCH)?; + plan = plan.optimize_by_rules(&GROUPING_SETS)?; + plan = plan.optimize_by_rules(&REWRITE_LIKE_EXPR)?; + plan = plan.optimize_by_rules(&SET_OPERATION_MERGE)?; + plan = plan.optimize_by_rules(&SET_OPERATION_TO_JOIN)?; + plan = plan.optimize_by_rules(&ALWAYS_FALSE_FILTER)?; // Table function should be converted into `file_scan` before `project_set`. - plan = plan.optimize_by_rules(&TABLE_FUNCTION_TO_FILE_SCAN); - plan = plan.optimize_by_rules(&TABLE_FUNCTION_TO_POSTGRES_QUERY); - plan = plan.optimize_by_rules(&TABLE_FUNCTION_TO_MYSQL_QUERY); + plan = plan.optimize_by_rules(&TABLE_FUNCTION_TO_FILE_SCAN)?; + plan = plan.optimize_by_rules(&TABLE_FUNCTION_TO_POSTGRES_QUERY)?; + plan = plan.optimize_by_rules(&TABLE_FUNCTION_TO_MYSQL_QUERY)?; // In order to unnest a table function, we need to convert it into a `project_set` first. - plan = plan.optimize_by_rules(&TABLE_FUNCTION_CONVERT); + plan = plan.optimize_by_rules(&TABLE_FUNCTION_CONVERT)?; plan = Self::subquery_unnesting(plan, false, explain_trace, &ctx)?; // Filter simplification must be applied before predicate push-down // otherwise the filter for some nodes (e.g., `LogicalScan`) // may not be properly applied. - plan = plan.optimize_by_rules(&LOGICAL_FILTER_EXPRESSION_SIMPLIFY); + plan = plan.optimize_by_rules(&LOGICAL_FILTER_EXPRESSION_SIMPLIFY)?; // Predicate Push-down let mut last_total_rule_applied_before_predicate_pushdown = ctx.total_rule_applied(); @@ -747,10 +751,10 @@ impl LogicalOptimizer { // Merge inner joins and intermediate filters into multijoin // This rule assumes that filters have already been pushed down near to // their relevant joins. - plan = plan.optimize_by_rules(&TO_MULTI_JOIN); + plan = plan.optimize_by_rules(&TO_MULTI_JOIN)?; // Reorder multijoin into left-deep join tree. - plan = plan.optimize_by_rules(&LEFT_DEEP_JOIN_ORDERING); + plan = plan.optimize_by_rules(&LEFT_DEEP_JOIN_ORDERING)?; } // Predicate Push-down: apply filter pushdown rules again since we pullup all join @@ -761,24 +765,24 @@ impl LogicalOptimizer { } // Push down the calculation of inputs of join's condition. 
- plan = plan.optimize_by_rules(&PUSH_CALC_OF_JOIN); + plan = plan.optimize_by_rules(&PUSH_CALC_OF_JOIN)?; - plan = plan.optimize_by_rules(&SPLIT_OVER_WINDOW); + plan = plan.optimize_by_rules(&SPLIT_OVER_WINDOW)?; // Must push down predicates again after split over window so that OverWindow can be // optimized to TopN. if last_total_rule_applied_before_predicate_pushdown != ctx.total_rule_applied() { last_total_rule_applied_before_predicate_pushdown = ctx.total_rule_applied(); plan = Self::predicate_pushdown(plan, explain_trace, &ctx); } - plan = plan.optimize_by_rules(&CONVERT_OVER_WINDOW); - plan = plan.optimize_by_rules(&MERGE_OVER_WINDOW); + plan = plan.optimize_by_rules(&CONVERT_OVER_WINDOW)?; + plan = plan.optimize_by_rules(&MERGE_OVER_WINDOW)?; // Convert distinct aggregates. - plan = plan.optimize_by_rules(&CONVERT_DISTINCT_AGG_FOR_BATCH); + plan = plan.optimize_by_rules(&CONVERT_DISTINCT_AGG_FOR_BATCH)?; - plan = plan.optimize_by_rules(&SIMPLIFY_AGG); + plan = plan.optimize_by_rules(&SIMPLIFY_AGG)?; - plan = plan.optimize_by_rules(&JOIN_COMMUTE); + plan = plan.optimize_by_rules(&JOIN_COMMUTE)?; // Do a final column pruning and predicate pushing down to clean up the plan. plan = Self::column_pruning(plan, explain_trace, &ctx); @@ -788,17 +792,17 @@ impl LogicalOptimizer { plan = Self::predicate_pushdown(plan, explain_trace, &ctx); } - plan = plan.optimize_by_rules(&PROJECT_REMOVE); + plan = plan.optimize_by_rules(&PROJECT_REMOVE)?; - plan = plan.optimize_by_rules(&COMMON_SUB_EXPR_EXTRACT); + plan = plan.optimize_by_rules(&COMMON_SUB_EXPR_EXTRACT)?; - plan = plan.optimize_by_rules(&PULL_UP_HOP); + plan = plan.optimize_by_rules(&PULL_UP_HOP)?; - plan = plan.optimize_by_rules(&TOP_N_AGG_ON_INDEX); + plan = plan.optimize_by_rules(&TOP_N_AGG_ON_INDEX)?; - plan = plan.optimize_by_rules(&LIMIT_PUSH_DOWN); + plan = plan.optimize_by_rules(&LIMIT_PUSH_DOWN)?; - plan = plan.optimize_by_rules(&DAG_TO_TREE); + plan = plan.optimize_by_rules(&DAG_TO_TREE)?; #[cfg(debug_assertions)] InputRefValidator.validate(plan.clone()); @@ -817,6 +821,9 @@ impl LogicalOptimizer { ExplainFormat::Yaml => { ctx.store_logical(plan.explain_to_yaml()); } + ExplainFormat::Dot => { + ctx.store_logical(plan.explain_to_dot()); + } } } diff --git a/src/frontend/src/optimizer/mod.rs b/src/frontend/src/optimizer/mod.rs index 6ebb4c187a38..323e38f99032 100644 --- a/src/frontend/src/optimizer/mod.rs +++ b/src/frontend/src/optimizer/mod.rs @@ -334,7 +334,7 @@ impl PlanRoot { "Merge BatchProject", vec![BatchProjectMergeRule::create()], ApplyOrder::BottomUp, - )); + ))?; // Inline session timezone plan = inline_session_timezone_in_exprs(ctx.clone(), plan)?; @@ -400,7 +400,7 @@ impl PlanRoot { "Push Limit To Scan", vec![BatchPushLimitToScanRule::create()], ApplyOrder::BottomUp, - )); + ))?; // For iceberg scan, we do iceberg predicate pushdown // BatchFilter -> BatchIcebergScan @@ -408,7 +408,7 @@ impl PlanRoot { "Iceberg Predicate Pushdown", vec![BatchIcebergPredicatePushDownRule::create()], ApplyOrder::BottomUp, - )); + ))?; assert_eq!(plan.convention(), Convention::Batch); Ok(plan) @@ -451,7 +451,7 @@ impl PlanRoot { "Push Limit To Scan", vec![BatchPushLimitToScanRule::create()], ApplyOrder::BottomUp, - )); + ))?; assert_eq!(plan.convention(), Convention::Batch); Ok(plan) @@ -491,7 +491,7 @@ impl PlanRoot { "Merge StreamProject", vec![StreamProjectMergeRule::create()], ApplyOrder::BottomUp, - )); + ))?; if ctx.session_ctx().config().streaming_enable_delta_join() { // TODO: make it a logical optimization. 
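To make the shape of the new fallible optimization loop easier to follow amid the `?` changes above, here is a minimal, self-contained sketch of the fix-point pass under toy assumptions: `Plan` is just an `i64`, `halve_even` is an invented rule, and this `ApplyResult` is a simplified, non-generic stand-in for the enum defined later in this diff in `rule/mod.rs` (error type reduced to `String`). The point is the control flow that `optimize_by_rules_until_fix_point` now follows: re-run the rule set until a full pass applies nothing, and return an error immediately instead of silently keeping the previous plan.

type Plan = i64;

enum ApplyResult {
    Ok(Plan),      // the rule fired and produced a new plan
    NotApplicable, // the rule did not match; try the next one
    Err(String),   // unrecoverable error; abort the whole optimization
}

// Toy rule: halve positive even numbers, reject negative ones, else do nothing.
fn halve_even(plan: Plan) -> ApplyResult {
    if plan < 0 {
        ApplyResult::Err("negative plan".to_owned())
    } else if plan > 0 && plan % 2 == 0 {
        ApplyResult::Ok(plan / 2)
    } else {
        ApplyResult::NotApplicable
    }
}

// Re-run all rules until one full pass applies nothing, propagating errors
// eagerly -- the same control flow as `optimize_by_rules_until_fix_point`.
fn optimize_until_fix_point(
    mut plan: Plan,
    rules: &[fn(Plan) -> ApplyResult],
) -> Result<Plan, String> {
    loop {
        let mut applied = false;
        for rule in rules {
            match rule(plan) {
                ApplyResult::Ok(new_plan) => {
                    plan = new_plan;
                    applied = true;
                }
                ApplyResult::NotApplicable => {}
                ApplyResult::Err(e) => return Err(e),
            }
        }
        if !applied {
            return Ok(plan);
        }
    }
}

fn main() {
    assert_eq!(optimize_until_fix_point(40, &[halve_even]), Ok(5));
    assert!(optimize_until_fix_point(-2, &[halve_even]).is_err());
}

With the previous infallible signatures there was no error arm to surface at all; the `?` additions throughout the surrounding hunks are what let this failure path reach the caller.
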
@@ -500,7 +500,7 @@ impl PlanRoot { "To IndexDeltaJoin", vec![IndexDeltaJoinRule::create()], ApplyOrder::BottomUp, - )); + ))?; } // Inline session timezone diff --git a/src/frontend/src/optimizer/plan_expr_visitor/strong.rs b/src/frontend/src/optimizer/plan_expr_visitor/strong.rs index 890152f00e33..c53bde642ad3 100644 --- a/src/frontend/src/optimizer/plan_expr_visitor/strong.rs +++ b/src/frontend/src/optimizer/plan_expr_visitor/strong.rs @@ -326,7 +326,8 @@ impl Strong { | ExprType::HasAnyColumnPrivilege | ExprType::HasSchemaPrivilege | ExprType::InetAton - | ExprType::InetNtoa => false, + | ExprType::InetNtoa + | ExprType::RwEpochToTs => false, ExprType::Unspecified => unreachable!(), } } diff --git a/src/frontend/src/optimizer/plan_node/batch_max_one_row.rs b/src/frontend/src/optimizer/plan_node/batch_max_one_row.rs index e35d6976672c..4b3d0de43cb6 100644 --- a/src/frontend/src/optimizer/plan_node/batch_max_one_row.rs +++ b/src/frontend/src/optimizer/plan_node/batch_max_one_row.rs @@ -25,6 +25,7 @@ use super::{ use crate::error::Result; use crate::optimizer::plan_node::expr_visitable::ExprVisitable; use crate::optimizer::plan_node::ToLocalBatch; +use crate::optimizer::property::{Order, RequiredDist}; /// [`BatchMaxOneRow`] fetches up to one row from the input, returning an error /// if the input contains more than one row at runtime. @@ -66,7 +67,9 @@ impl Distill for BatchMaxOneRow { impl ToDistributedBatch for BatchMaxOneRow { fn to_distributed(&self) -> Result { - Ok(self.clone_with_input(self.input().to_distributed()?).into()) + let new_input = RequiredDist::single() + .enforce_if_not_satisfies(self.input().to_distributed()?, &Order::any())?; + Ok(self.clone_with_input(new_input).into()) } } @@ -78,7 +81,9 @@ impl ToBatchPb for BatchMaxOneRow { impl ToLocalBatch for BatchMaxOneRow { fn to_local(&self) -> Result { - Ok(self.clone_with_input(self.input().to_local()?).into()) + let new_input = RequiredDist::single() + .enforce_if_not_satisfies(self.input().to_local()?, &Order::any())?; + Ok(self.clone_with_input(new_input).into()) } } diff --git a/src/frontend/src/optimizer/plan_node/batch_update.rs b/src/frontend/src/optimizer/plan_node/batch_update.rs index d0351e6fdec2..28dfa79916cc 100644 --- a/src/frontend/src/optimizer/plan_node/batch_update.rs +++ b/src/frontend/src/optimizer/plan_node/batch_update.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use itertools::Itertools; use risingwave_common::catalog::Schema; use risingwave_pb::batch_plan::plan_node::NodeBody; use risingwave_pb::batch_plan::UpdateNode; @@ -84,20 +83,21 @@ impl ToDistributedBatch for BatchUpdate { impl ToBatchPb for BatchUpdate { fn to_batch_prost_body(&self) -> NodeBody { - let exprs = self.core.exprs.iter().map(|x| x.to_expr_proto()).collect(); - - let update_column_indices = self - .core - .update_column_indices + let old_exprs = (self.core.old_exprs) + .iter() + .map(|x| x.to_expr_proto()) + .collect(); + let new_exprs = (self.core.new_exprs) .iter() - .map(|i| *i as _) - .collect_vec(); + .map(|x| x.to_expr_proto()) + .collect(); + NodeBody::Update(UpdateNode { - exprs, table_id: self.core.table_id.table_id(), table_version_id: self.core.table_version_id, returning: self.core.returning, - update_column_indices, + old_exprs, + new_exprs, session_id: self.base.ctx().session_ctx().session_id().0 as u32, }) } @@ -125,6 +125,6 @@ impl ExprRewritable for BatchUpdate { impl ExprVisitable for BatchUpdate { fn visit_exprs(&self, v: &mut dyn ExprVisitor) { - self.core.exprs.iter().for_each(|e| v.visit_expr(e)); + self.core.visit_exprs(v); } } diff --git a/src/frontend/src/optimizer/plan_node/generic/update.rs b/src/frontend/src/optimizer/plan_node/generic/update.rs index 61d044f53c99..d68af1a01ae3 100644 --- a/src/frontend/src/optimizer/plan_node/generic/update.rs +++ b/src/frontend/src/optimizer/plan_node/generic/update.rs @@ -21,7 +21,7 @@ use risingwave_common::types::DataType; use super::{DistillUnit, GenericPlanNode, GenericPlanRef}; use crate::catalog::TableId; -use crate::expr::{ExprImpl, ExprRewriter}; +use crate::expr::{Expr, ExprImpl, ExprRewriter, ExprVisitor}; use crate::optimizer::plan_node::utils::childless_record; use crate::optimizer::property::FunctionalDependencySet; use crate::OptimizerContextRef; @@ -35,15 +35,15 @@ pub struct Update { pub table_id: TableId, pub table_version_id: TableVersionId, pub input: PlanRef, - pub exprs: Vec, + pub old_exprs: Vec, + pub new_exprs: Vec, pub returning: bool, - pub update_column_indices: Vec, } impl Update { pub fn output_len(&self) -> usize { if self.returning { - self.input.schema().len() + self.new_exprs.len() } else { 1 } @@ -56,18 +56,19 @@ impl GenericPlanNode for Update { fn schema(&self) -> Schema { if self.returning { - self.input.schema().clone() + Schema::new( + self.new_exprs + .iter() + .map(|e| Field::unnamed(e.return_type())) + .collect(), + ) } else { Schema::new(vec![Field::unnamed(DataType::Int64)]) } } fn stream_key(&self) -> Option> { - if self.returning { - Some(self.input.stream_key()?.to_vec()) - } else { - Some(vec![]) - } + None } fn ctx(&self) -> OptimizerContextRef { @@ -81,27 +82,31 @@ impl Update { table_name: String, table_id: TableId, table_version_id: TableVersionId, - exprs: Vec, + old_exprs: Vec, + new_exprs: Vec, returning: bool, - update_column_indices: Vec, ) -> Self { Self { table_name, table_id, table_version_id, input, - exprs, + old_exprs, + new_exprs, returning, - update_column_indices, } } pub(crate) fn rewrite_exprs(&mut self, r: &mut dyn ExprRewriter) { - self.exprs = self - .exprs - .iter() - .map(|e| r.rewrite_expr(e.clone())) - .collect(); + for exprs in [&mut self.old_exprs, &mut self.new_exprs] { + *exprs = exprs.iter().map(|e| r.rewrite_expr(e.clone())).collect(); + } + } + + pub(crate) fn visit_exprs(&self, v: &mut dyn ExprVisitor) { + for exprs in [&self.old_exprs, &self.new_exprs] { + exprs.iter().for_each(|e| v.visit_expr(e)); + } } } @@ -109,7 +114,7 @@ 
impl DistillUnit for Update { fn distill_with_name<'a>(&self, name: impl Into>) -> XmlNode<'a> { let mut vec = Vec::with_capacity(if self.returning { 3 } else { 2 }); vec.push(("table", Pretty::from(self.table_name.clone()))); - vec.push(("exprs", Pretty::debug(&self.exprs))); + vec.push(("exprs", Pretty::debug(&self.new_exprs))); if self.returning { vec.push(("returning", Pretty::display(&true))); } diff --git a/src/frontend/src/optimizer/plan_node/logical_agg.rs b/src/frontend/src/optimizer/plan_node/logical_agg.rs index 7f2b52797924..5a146c37398a 100644 --- a/src/frontend/src/optimizer/plan_node/logical_agg.rs +++ b/src/frontend/src/optimizer/plan_node/logical_agg.rs @@ -14,7 +14,7 @@ use fixedbitset::FixedBitSet; use itertools::Itertools; -use risingwave_common::types::{DataType, Datum, ScalarImpl}; +use risingwave_common::types::{DataType, ScalarImpl}; use risingwave_common::util::sort_util::{ColumnOrder, OrderType}; use risingwave_common::{bail, bail_not_implemented, not_implemented}; use risingwave_expr::aggregate::{agg_types, AggType, PbAggKind}; @@ -684,17 +684,15 @@ impl LogicalAggBuilder { agg_call.direct_args.clone(), )?)?); - let one = ExprImpl::from(Literal::new( - Datum::from(ScalarImpl::Int64(1)), - DataType::Int64, - )); + let zero = ExprImpl::literal_int(0); + let one = ExprImpl::literal_int(1); let squared_sum = ExprImpl::from(FunctionCall::new( ExprType::Multiply, vec![sum.clone(), sum], )?); - let numerator = ExprImpl::from(FunctionCall::new( + let raw_numerator = ExprImpl::from(FunctionCall::new( ExprType::Subtract, vec![ sum_of_sq, @@ -705,6 +703,13 @@ impl LogicalAggBuilder { ], )?); + // We need to check for potential accuracy issues that may occasionally lead to results less than 0. + let numerator_type = raw_numerator.return_type(); + let numerator = ExprImpl::from(FunctionCall::new( + ExprType::Greatest, + vec![raw_numerator, zero.clone().cast_explicit(numerator_type)?], + )?); + let denominator = match kind { PbAggKind::VarPop | PbAggKind::StddevPop => count.clone(), PbAggKind::VarSamp | PbAggKind::StddevSamp => ExprImpl::from( @@ -722,22 +727,21 @@ impl LogicalAggBuilder { target = ExprImpl::from(FunctionCall::new(ExprType::Sqrt, vec![target])?); } - match kind { - PbAggKind::VarPop | PbAggKind::StddevPop => Ok(target), - PbAggKind::StddevSamp | PbAggKind::VarSamp => { - let case_cond = ExprImpl::from(FunctionCall::new( - ExprType::LessThanOrEqual, - vec![count, one], - )?); - let null = ExprImpl::from(Literal::new(None, agg_call.return_type())); - - Ok(ExprImpl::from(FunctionCall::new( - ExprType::Case, - vec![case_cond, null, target], - )?)) + let null = ExprImpl::from(Literal::new(None, agg_call.return_type())); + let case_cond = match kind { + PbAggKind::VarPop | PbAggKind::StddevPop => { + ExprImpl::from(FunctionCall::new(ExprType::Equal, vec![count, zero])?) 
} + PbAggKind::VarSamp | PbAggKind::StddevSamp => ExprImpl::from( + FunctionCall::new(ExprType::LessThanOrEqual, vec![count, one])?, + ), _ => unreachable!(), - } + }; + + Ok(ExprImpl::from(FunctionCall::new( + ExprType::Case, + vec![case_cond, null, target], + )?)) } AggType::Builtin(PbAggKind::ApproxPercentile) => { if agg_call.order_by.sort_exprs[0].order_type == OrderType::descending() { diff --git a/src/frontend/src/optimizer/plan_node/logical_scan.rs b/src/frontend/src/optimizer/plan_node/logical_scan.rs index ccb90889cb94..209e3e22ec72 100644 --- a/src/frontend/src/optimizer/plan_node/logical_scan.rs +++ b/src/frontend/src/optimizer/plan_node/logical_scan.rs @@ -40,6 +40,7 @@ use crate::optimizer::plan_node::{ }; use crate::optimizer::property::{Cardinality, Order}; use crate::optimizer::rule::IndexSelectionRule; +use crate::optimizer::ApplyResult; use crate::utils::{ColIndexMapping, Condition, ConditionDisplay}; use crate::TableCatalog; @@ -494,7 +495,7 @@ impl ToBatch for LogicalScan { if !new.indexes().is_empty() { let index_selection_rule = IndexSelectionRule::create(); - if let Some(applied) = index_selection_rule.apply(new.clone().into()) { + if let ApplyResult::Ok(applied) = index_selection_rule.apply(new.clone().into()) { if let Some(scan) = applied.as_logical_scan() { // covering index return required_order.enforce_if_not_satisfies(scan.to_batch()?); diff --git a/src/frontend/src/optimizer/plan_node/logical_update.rs b/src/frontend/src/optimizer/plan_node/logical_update.rs index 127b6ed8b317..a5590501715b 100644 --- a/src/frontend/src/optimizer/plan_node/logical_update.rs +++ b/src/frontend/src/optimizer/plan_node/logical_update.rs @@ -12,17 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use risingwave_common::catalog::TableVersionId; - use super::generic::GenericPlanRef; use super::utils::impl_distill_by_unit; use super::{ gen_filter_and_pushdown, generic, BatchUpdate, ColPrunable, ExprRewritable, Logical, LogicalProject, PlanBase, PlanRef, PlanTreeNodeUnary, PredicatePushdown, ToBatch, ToStream, }; -use crate::catalog::TableId; use crate::error::Result; -use crate::expr::{ExprImpl, ExprRewriter, ExprVisitor}; +use crate::expr::{ExprRewriter, ExprVisitor}; use crate::optimizer::plan_node::expr_visitable::ExprVisitable; use crate::optimizer::plan_node::{ ColumnPruningContext, PredicatePushdownContext, RewriteStreamContext, ToStreamContext, @@ -46,25 +43,6 @@ impl From> for LogicalUpdate { } } -impl LogicalUpdate { - #[must_use] - pub fn table_id(&self) -> TableId { - self.core.table_id - } - - pub fn exprs(&self) -> &[ExprImpl] { - self.core.exprs.as_ref() - } - - pub fn has_returning(&self) -> bool { - self.core.returning - } - - pub fn table_version_id(&self) -> TableVersionId { - self.core.table_version_id - } -} - impl PlanTreeNodeUnary for LogicalUpdate { fn input(&self) -> PlanRef { self.core.input.clone() @@ -86,15 +64,15 @@ impl ExprRewritable for LogicalUpdate { } fn rewrite_exprs(&self, r: &mut dyn ExprRewriter) -> PlanRef { - let mut new = self.core.clone(); - new.exprs = new.exprs.into_iter().map(|e| r.rewrite_expr(e)).collect(); - Self::from(new).into() + let mut core = self.core.clone(); + core.rewrite_exprs(r); + Self::from(core).into() } } impl ExprVisitable for LogicalUpdate { fn visit_exprs(&self, v: &mut dyn ExprVisitor) { - self.core.exprs.iter().for_each(|e| v.visit_expr(e)); + self.core.visit_exprs(v); } } diff --git a/src/frontend/src/optimizer/plan_node/mod.rs b/src/frontend/src/optimizer/plan_node/mod.rs index 9986b8800b66..165f867b3c76 100644 --- a/src/frontend/src/optimizer/plan_node/mod.rs +++ b/src/frontend/src/optimizer/plan_node/mod.rs @@ -27,6 +27,7 @@ //! - all field should be valued in construction, so the properties' derivation should be finished //! in the `new()` function. +use std::collections::HashMap; use std::fmt::Debug; use std::hash::Hash; use std::ops::Deref; @@ -37,6 +38,8 @@ use dyn_clone::DynClone; use fixedbitset::FixedBitSet; use itertools::Itertools; use paste::paste; +use petgraph::dot::{Config, Dot}; +use petgraph::graph::{Graph, NodeIndex}; use pretty_xmlish::{Pretty, PrettyConfig}; use risingwave_common::catalog::Schema; use risingwave_common::util::recursive::{self, Recurse}; @@ -642,6 +645,9 @@ pub trait Explain { /// Write explain the whole plan tree. fn explain<'a>(&self) -> Pretty<'a>; + /// Write explain the whole plan tree with node id. + fn explain_with_id<'a>(&self) -> Pretty<'a>; + /// Explain the plan node and return a string. fn explain_to_string(&self) -> String; @@ -653,6 +659,9 @@ pub trait Explain { /// Explain the plan node and return a yaml string. fn explain_to_yaml(&self) -> String; + + /// Explain the plan node and return a dot format string. + fn explain_to_dot(&self) -> String; } impl Explain for PlanRef { @@ -666,6 +675,21 @@ impl Explain for PlanRef { Pretty::Record(node) } + /// Write explain the whole plan tree with node id. + fn explain_with_id<'a>(&self) -> Pretty<'a> { + let node_id = self.id(); + let mut node = self.distill(); + // NOTE(kwannoel): Can lead to poor performance if plan is very large, + // but we want to show the id first. 
+ node.fields + .insert(0, ("id".into(), Pretty::display(&node_id.0))); + let inputs = self.inputs(); + for input in inputs.iter().peekable() { + node.children.push(input.explain_with_id()); + } + Pretty::Record(node) + } + /// Explain the plan node and return a string. fn explain_to_string(&self) -> String { let plan = reorganize_elements_id(self.clone()); @@ -680,7 +704,7 @@ impl Explain for PlanRef { fn explain_to_json(&self) -> String { let plan = reorganize_elements_id(self.clone()); let explain_ir = plan.explain(); - serde_json::to_string_pretty(&PrettySerde(explain_ir)) + serde_json::to_string_pretty(&PrettySerde(explain_ir, true)) .expect("failed to serialize plan to json") } @@ -688,14 +712,66 @@ impl Explain for PlanRef { fn explain_to_xml(&self) -> String { let plan = reorganize_elements_id(self.clone()); let explain_ir = plan.explain(); - quick_xml::se::to_string(&PrettySerde(explain_ir)).expect("failed to serialize plan to xml") + quick_xml::se::to_string(&PrettySerde(explain_ir, true)) + .expect("failed to serialize plan to xml") } /// Explain the plan node and return a yaml string. fn explain_to_yaml(&self) -> String { let plan = reorganize_elements_id(self.clone()); let explain_ir = plan.explain(); - serde_yaml::to_string(&PrettySerde(explain_ir)).expect("failed to serialize plan to yaml") + serde_yaml::to_string(&PrettySerde(explain_ir, true)) + .expect("failed to serialize plan to yaml") + } + + /// Explain the plan node and return a dot format string. + fn explain_to_dot(&self) -> String { + let plan = reorganize_elements_id(self.clone()); + let explain_ir = plan.explain_with_id(); + let mut graph = Graph::::new(); + let mut nodes = HashMap::new(); + build_graph_from_pretty(&explain_ir, &mut graph, &mut nodes, None); + let dot = Dot::with_config(&graph, &[Config::EdgeNoLabel]); + dot.to_string() + } +} + +fn build_graph_from_pretty( + pretty: &Pretty<'_>, + graph: &mut Graph, + nodes: &mut HashMap, + parent_label: Option<&str>, +) { + if let Pretty::Record(r) = pretty { + let mut label = String::new(); + label.push_str(&r.name); + for (k, v) in &r.fields { + label.push('\n'); + label.push_str(k); + label.push_str(": "); + label.push_str( + &serde_json::to_string(&PrettySerde(v.clone(), false)) + .expect("failed to serialize plan to dot"), + ); + } + // output alignment. 
+ if !r.fields.is_empty() { + label.push('\n'); + } + + let current_node = *nodes + .entry(label.clone()) + .or_insert_with(|| graph.add_node(label.clone())); + + if let Some(parent_label) = parent_label { + if let Some(&parent_node) = nodes.get(parent_label) { + graph.add_edge(parent_node, current_node, "contains".to_string()); + } + } + + for child in &r.children { + build_graph_from_pretty(child, graph, nodes, Some(&label)); + } } } diff --git a/src/frontend/src/optimizer/rule/always_false_filter_rule.rs b/src/frontend/src/optimizer/rule/always_false_filter_rule.rs index ed38364ce72c..fc4de7e404b3 100644 --- a/src/frontend/src/optimizer/rule/always_false_filter_rule.rs +++ b/src/frontend/src/optimizer/rule/always_false_filter_rule.rs @@ -14,7 +14,7 @@ use risingwave_common::types::ScalarImpl; -use super::Rule; +use super::{BoxedRule, Rule}; use crate::optimizer::plan_node::generic::GenericPlanRef; use crate::optimizer::plan_node::{LogicalFilter, LogicalValues}; use crate::PlanRef; @@ -43,7 +43,7 @@ impl Rule for AlwaysFalseFilterRule { } impl AlwaysFalseFilterRule { - pub fn create() -> Box { + pub fn create() -> BoxedRule { Box::new(AlwaysFalseFilterRule) } } diff --git a/src/frontend/src/optimizer/rule/batch/batch_project_merge_rule.rs b/src/frontend/src/optimizer/rule/batch/batch_project_merge_rule.rs index 4193d55b2275..732ac01f3ae7 100644 --- a/src/frontend/src/optimizer/rule/batch/batch_project_merge_rule.rs +++ b/src/frontend/src/optimizer/rule/batch/batch_project_merge_rule.rs @@ -15,7 +15,8 @@ use crate::expr::{ExprImpl, ExprRewriter, ExprVisitor}; use crate::optimizer::plan_expr_visitor::InputRefCounter; use crate::optimizer::plan_node::{generic, BatchProject, PlanTreeNodeUnary}; -use crate::optimizer::{BoxedRule, PlanRef, Rule}; +use crate::optimizer::rule::Rule; +use crate::optimizer::{BoxedRule, PlanRef}; use crate::utils::Substitute; /// Merge contiguous [`BatchProject`] nodes. diff --git a/src/frontend/src/optimizer/rule/join_project_transpose_rule.rs b/src/frontend/src/optimizer/rule/join_project_transpose_rule.rs index be1e4dc55313..82d2722d7765 100644 --- a/src/frontend/src/optimizer/rule/join_project_transpose_rule.rs +++ b/src/frontend/src/optimizer/rule/join_project_transpose_rule.rs @@ -16,7 +16,7 @@ use itertools::Itertools; use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_pb::plan_common::JoinType; -use super::Rule; +use super::{BoxedRule, Rule}; use crate::expr::{ExprRewriter, InputRef}; use crate::optimizer::plan_node::{LogicalJoin, LogicalProject}; use crate::utils::IndexRewriter; @@ -162,7 +162,7 @@ impl Rule for JoinProjectTransposeRule { } impl JoinProjectTransposeRule { - pub fn create() -> Box { + pub fn create() -> BoxedRule { Box::new(JoinProjectTransposeRule {}) } } diff --git a/src/frontend/src/optimizer/rule/mod.rs b/src/frontend/src/optimizer/rule/mod.rs index 7ee0b7ca6afb..df64eb178ff2 100644 --- a/src/frontend/src/optimizer/rule/mod.rs +++ b/src/frontend/src/optimizer/rule/mod.rs @@ -14,20 +14,103 @@ //! Define all [`Rule`] +use std::convert::Infallible; +use std::ops::FromResidual; + +use thiserror_ext::AsReport; + use super::PlanRef; +use crate::error::RwError; + +/// Result when applying a [`Rule`] to a [`PlanNode`](super::plan_node::PlanNode). +pub enum ApplyResult { + /// Successfully applied the rule and returned a new plan. + Ok(T), + /// The current rule is not applicable to the input. + /// The optimizer may try another rule. + NotApplicable, + /// An unrecoverable error occurred while applying the rule. 
+ /// The optimizer should stop applying other rules and report the error to the user. + Err(RwError), +} + +impl ApplyResult { + /// Unwrap the result, panicking if it's not `Ok`. + pub fn unwrap(self) -> PlanRef { + match self { + ApplyResult::Ok(plan) => plan, + ApplyResult::NotApplicable => panic!("unwrap ApplyResult::NotApplicable"), + ApplyResult::Err(e) => panic!("unwrap ApplyResult::Err, error: {:?}", e.as_report()), + } + } +} + +/// Allow calling `?` on an `Option` in a function returning `ApplyResult`. +impl FromResidual> for ApplyResult { + fn from_residual(residual: Option) -> Self { + match residual { + Some(i) => match i {}, + None => Self::NotApplicable, + } + } +} + +/// Allow calling `?` on a `Result` in a function returning `ApplyResult`. +impl FromResidual> for ApplyResult +where + E: Into, +{ + fn from_residual(residual: Result) -> Self { + match residual { + Ok(i) => match i {}, + Err(e) => Self::Err(e.into()), + } + } +} -/// A one-to-one transform for the [`PlanNode`](super::plan_node::PlanNode), every [`Rule`] should -/// downcast and check if the node matches the rule. -pub trait Rule: Send + Sync + Description { - /// return err(()) if not match +/// An one-to-one transform for the [`PlanNode`](super::plan_node::PlanNode). +/// +/// It's a convenient trait to implement [`FallibleRule`], thus made available only within this module. +trait InfallibleRule: Send + Sync + Description { + /// Apply the rule to the plan node. + /// + /// - Returns `Some` if the apply is successful. + /// - Returns `None` if it's not applicable. The optimizer may try other rules. fn apply(&self, plan: PlanRef) -> Option; } +use InfallibleRule as Rule; + +/// An one-to-one transform for the [`PlanNode`](super::plan_node::PlanNode) that may return an +/// unrecoverable error that stops further optimization. +/// +/// An [`InfallibleRule`] is always a [`FallibleRule`]. +pub trait FallibleRule: Send + Sync + Description { + /// Apply the rule to the plan node, which may return an unrecoverable error. + /// + /// - Returns `ApplyResult::Ok` if the apply is successful. + /// - Returns `ApplyResult::NotApplicable` if it's not applicable. The optimizer may try other rules. + /// - Returns `ApplyResult::Err` if an unrecoverable error occurred. The optimizer should stop applying + /// other rules and report the error to the user. + fn apply(&self, plan: PlanRef) -> ApplyResult; +} + +impl FallibleRule for T +where + T: InfallibleRule, +{ + fn apply(&self, plan: PlanRef) -> ApplyResult { + match InfallibleRule::apply(self, plan) { + Some(plan) => ApplyResult::Ok(plan), + None => ApplyResult::NotApplicable, + } + } +} pub trait Description { fn description(&self) -> &str; } -pub(super) type BoxedRule = Box; +pub(super) type BoxedRule = Box; mod logical_filter_expression_simplify_rule; pub use logical_filter_expression_simplify_rule::*; diff --git a/src/frontend/src/optimizer/rule/over_window_merge_rule.rs b/src/frontend/src/optimizer/rule/over_window_merge_rule.rs index 6915ebb2b740..212e7d12922b 100644 --- a/src/frontend/src/optimizer/rule/over_window_merge_rule.rs +++ b/src/frontend/src/optimizer/rule/over_window_merge_rule.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use super::Rule; +use super::{BoxedRule, Rule}; use crate::optimizer::plan_node::{LogicalOverWindow, PlanTreeNodeUnary}; use crate::PlanRef; @@ -21,7 +21,7 @@ use crate::PlanRef; pub struct OverWindowMergeRule; impl OverWindowMergeRule { - pub fn create() -> Box { + pub fn create() -> BoxedRule { Box::new(OverWindowMergeRule) } } diff --git a/src/frontend/src/optimizer/rule/over_window_split_rule.rs b/src/frontend/src/optimizer/rule/over_window_split_rule.rs index 85510a5ff974..b0852a5c8331 100644 --- a/src/frontend/src/optimizer/rule/over_window_split_rule.rs +++ b/src/frontend/src/optimizer/rule/over_window_split_rule.rs @@ -16,13 +16,13 @@ use std::collections::HashMap; use itertools::Itertools; -use super::Rule; +use super::{BoxedRule, Rule}; use crate::PlanRef; pub struct OverWindowSplitRule; impl OverWindowSplitRule { - pub fn create() -> Box { + pub fn create() -> BoxedRule { Box::new(OverWindowSplitRule) } } diff --git a/src/frontend/src/optimizer/rule/over_window_to_agg_and_join_rule.rs b/src/frontend/src/optimizer/rule/over_window_to_agg_and_join_rule.rs index 259d8333aea7..5f96efb7c982 100644 --- a/src/frontend/src/optimizer/rule/over_window_to_agg_and_join_rule.rs +++ b/src/frontend/src/optimizer/rule/over_window_to_agg_and_join_rule.rs @@ -17,7 +17,7 @@ use risingwave_expr::window_function::WindowFuncKind; use risingwave_pb::expr::expr_node::Type; use risingwave_pb::plan_common::JoinType; -use super::Rule; +use super::{BoxedRule, Rule}; use crate::expr::{AggCall, ExprImpl, FunctionCall, InputRef, OrderBy}; use crate::optimizer::plan_node::{ LogicalAgg, LogicalJoin, LogicalProject, LogicalShare, PlanTreeNodeUnary, @@ -27,7 +27,7 @@ use crate::PlanRef; pub struct OverWindowToAggAndJoinRule; impl OverWindowToAggAndJoinRule { - pub fn create() -> Box { + pub fn create() -> BoxedRule { Box::new(OverWindowToAggAndJoinRule) } } diff --git a/src/frontend/src/optimizer/rule/over_window_to_topn_rule.rs b/src/frontend/src/optimizer/rule/over_window_to_topn_rule.rs index 10ab630a64e3..722709ed73c9 100644 --- a/src/frontend/src/optimizer/rule/over_window_to_topn_rule.rs +++ b/src/frontend/src/optimizer/rule/over_window_to_topn_rule.rs @@ -16,7 +16,7 @@ use fixedbitset::FixedBitSet; use risingwave_common::types::DataType; use risingwave_expr::window_function::WindowFuncKind; -use super::Rule; +use super::{BoxedRule, Rule}; use crate::expr::{collect_input_refs, ExprImpl, ExprType}; use crate::optimizer::plan_node::generic::GenericPlanRef; use crate::optimizer::plan_node::{LogicalFilter, LogicalTopN, PlanTreeNodeUnary}; @@ -45,7 +45,7 @@ use crate::PlanRef; pub struct OverWindowToTopNRule; impl OverWindowToTopNRule { - pub fn create() -> Box { + pub fn create() -> BoxedRule { Box::new(OverWindowToTopNRule) } } diff --git a/src/frontend/src/optimizer/rule/stream/stream_project_merge_rule.rs b/src/frontend/src/optimizer/rule/stream/stream_project_merge_rule.rs index 91ab942e3a7f..ae6fb51b7684 100644 --- a/src/frontend/src/optimizer/rule/stream/stream_project_merge_rule.rs +++ b/src/frontend/src/optimizer/rule/stream/stream_project_merge_rule.rs @@ -15,7 +15,8 @@ use crate::expr::{ExprImpl, ExprRewriter, ExprVisitor}; use crate::optimizer::plan_expr_visitor::InputRefCounter; use crate::optimizer::plan_node::{generic, PlanTreeNodeUnary, StreamProject}; -use crate::optimizer::{BoxedRule, PlanRef, Rule}; +use crate::optimizer::rule::Rule; +use crate::optimizer::{BoxedRule, PlanRef}; use crate::utils::Substitute; /// Merge contiguous [`StreamProject`] nodes. 
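The `FallibleRule` / `InfallibleRule` split in `rule/mod.rs` above is the piece everything else hangs off: every `pub fn create() -> BoxedRule` touched above now boxes a `dyn FallibleRule`, while the individual rules stay unchanged thanks to the blanket impl. Below is a condensed, standalone sketch of that pattern under simplified assumptions -- `Plan` stands in for `PlanRef`, a plain `String` stands in for `RwError`, `UppercaseRule` and `RejectEmptyRule` are invented rules, and the nightly `FromResidual` impls (the reason `#![feature(try_trait_v2)]` was added earlier in this diff) are left out.

type Plan = String;

enum ApplyResult {
    Ok(Plan),
    NotApplicable,
    Err(String),
}

// Legacy flavour: rules that cannot fail keep returning `Option`.
trait InfallibleRule {
    fn apply(&self, plan: Plan) -> Option<Plan>;
}

// What the optimizer actually drives; `BoxedRule` boxes this trait.
trait FallibleRule {
    fn apply(&self, plan: Plan) -> ApplyResult;
}

// Blanket impl mirroring the patch: every infallible rule is usable as a
// fallible one, so existing rule implementations need no changes.
impl<T: InfallibleRule> FallibleRule for T {
    fn apply(&self, plan: Plan) -> ApplyResult {
        match InfallibleRule::apply(self, plan) {
            Some(plan) => ApplyResult::Ok(plan),
            None => ApplyResult::NotApplicable,
        }
    }
}

// An infallible rule, written exactly as before the refactor.
struct UppercaseRule;
impl InfallibleRule for UppercaseRule {
    fn apply(&self, plan: Plan) -> Option<Plan> {
        if plan.chars().any(|c| c.is_lowercase()) {
            Some(plan.to_uppercase())
        } else {
            None // nothing left to rewrite: not applicable
        }
    }
}

// A rule that can fail outright implements `FallibleRule` directly.
struct RejectEmptyRule;
impl FallibleRule for RejectEmptyRule {
    fn apply(&self, plan: Plan) -> ApplyResult {
        if plan.is_empty() {
            ApplyResult::Err("empty plan".to_owned())
        } else {
            ApplyResult::NotApplicable
        }
    }
}

fn main() {
    // The optimizer only ever sees boxed fallible rules.
    let rules: Vec<Box<dyn FallibleRule>> =
        vec![Box::new(UppercaseRule), Box::new(RejectEmptyRule)];
    for rule in &rules {
        match rule.apply("select 1".to_owned()) {
            ApplyResult::Ok(p) => println!("rewritten: {p}"),
            ApplyResult::NotApplicable => println!("rule did not fire"),
            ApplyResult::Err(e) => eprintln!("optimization aborted: {e}"),
        }
    }
}

The design carried over from the patch is that the driver only ever deals with the fallible trait, while rule authors opt into fallibility per rule; the `FromResidual` impls in `rule/mod.rs` merely add `?` convenience for `Option` and `Result` on top of this.
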
diff --git a/src/frontend/src/planner/select.rs b/src/frontend/src/planner/select.rs index a9e7dd3526ed..ebed01351f7d 100644 --- a/src/frontend/src/planner/select.rs +++ b/src/frontend/src/planner/select.rs @@ -320,7 +320,7 @@ impl Planner { /// /// The [`InputRef`]s' indexes start from `root.schema().len()`, /// which means they are additional columns beyond the original `root`. - fn substitute_subqueries( + pub(super) fn substitute_subqueries( &mut self, mut root: PlanRef, mut exprs: Vec, @@ -366,10 +366,27 @@ impl Planner { .zip_eq_fast(rewriter.correlated_indices_collection) .zip_eq_fast(rewriter.correlated_ids) { + let return_type = subquery.return_type(); let subroot = self.plan_query(subquery.query)?; let right = match subquery.kind { SubqueryKind::Scalar => subroot.into_unordered_subplan(), + SubqueryKind::UpdateSet => { + let plan = subroot.into_unordered_subplan(); + + // Compose all input columns into a struct with `ROW` function. + let all_input_refs = plan + .schema() + .data_types() + .into_iter() + .enumerate() + .map(|(i, data_type)| InputRef::new(i, data_type).into()) + .collect::>(); + let call = + FunctionCall::new_unchecked(ExprType::Row, all_input_refs, return_type); + + LogicalProject::create(plan, vec![call.into()]) + } SubqueryKind::Existential => { self.create_exists(subroot.into_unordered_subplan())? } diff --git a/src/frontend/src/planner/statement.rs b/src/frontend/src/planner/statement.rs index 91c1b9edfc61..ba607f5096d2 100644 --- a/src/frontend/src/planner/statement.rs +++ b/src/frontend/src/planner/statement.rs @@ -24,6 +24,7 @@ impl Planner { BoundStatement::Delete(d) => self.plan_delete(*d), BoundStatement::Update(u) => self.plan_update(*u), BoundStatement::Query(q) => self.plan_query(*q), + BoundStatement::DeclareCursor(d) => self.plan_query(*d.query), BoundStatement::FetchCursor(_) => unimplemented!(), BoundStatement::CreateView(c) => self.plan_query(*c.query), } diff --git a/src/frontend/src/planner/update.rs b/src/frontend/src/planner/update.rs index ddf9ab0bdf9a..2db18ac0e292 100644 --- a/src/frontend/src/planner/update.rs +++ b/src/frontend/src/planner/update.rs @@ -13,41 +13,92 @@ // limitations under the License. use fixedbitset::FixedBitSet; -use itertools::Itertools; +use risingwave_common::types::{DataType, Scalar}; +use risingwave_pb::expr::expr_node::Type; use super::Planner; -use crate::binder::BoundUpdate; +use crate::binder::{BoundUpdate, UpdateProject}; use crate::error::Result; +use crate::expr::{ExprImpl, FunctionCall, InputRef, Literal}; +use crate::optimizer::plan_node::generic::GenericPlanRef; use crate::optimizer::plan_node::{generic, LogicalProject, LogicalUpdate}; use crate::optimizer::property::{Order, RequiredDist}; use crate::optimizer::{PlanRef, PlanRoot}; impl Planner { pub(super) fn plan_update(&mut self, update: BoundUpdate) -> Result { + let returning = !update.returning_list.is_empty(); + let scan = self.plan_base_table(&update.table)?; let input = if let Some(expr) = update.selection { self.plan_where(scan, expr)? } else { scan }; - let returning = !update.returning_list.is_empty(); - let update_column_indices = update - .table - .table_catalog - .columns() - .iter() - .enumerate() - .filter_map(|(i, c)| c.can_dml().then_some(i)) - .collect_vec(); + let old_schema_len = input.schema().len(); + + // Extend table scan with updated columns. 
+ let with_new: PlanRef = { + let mut plan = input; + + let mut exprs: Vec = plan + .schema() + .data_types() + .into_iter() + .enumerate() + .map(|(index, data_type)| InputRef::new(index, data_type).into()) + .collect(); + + exprs.extend(update.exprs); + + // Substitute subqueries into `LogicalApply`s. + if exprs.iter().any(|e| e.has_subquery()) { + (plan, exprs) = self.substitute_subqueries(plan, exprs)?; + } + + LogicalProject::new(plan, exprs).into() + }; + + let mut olds = Vec::new(); + let mut news = Vec::new(); + + for (i, col) in update.table.table_catalog.columns().iter().enumerate() { + // Skip generated columns and system columns. + if !col.can_dml() { + continue; + } + let data_type = col.data_type(); + + let old: ExprImpl = InputRef::new(i, data_type.clone()).into(); + + let new: ExprImpl = match (update.projects.get(&i)).map(|p| p.offset(old_schema_len)) { + Some(UpdateProject::Simple(j)) => InputRef::new(j, data_type.clone()).into(), + Some(UpdateProject::Composite(j, field)) => FunctionCall::new_unchecked( + Type::Field, + vec![ + InputRef::new(j, with_new.schema().data_types()[j].clone()).into(), // struct + Literal::new(Some((field as i32).to_scalar_value()), DataType::Int32) + .into(), + ], + data_type.clone(), + ) + .into(), + + None => old.clone(), + }; + + olds.push(old); + news.push(new); + } let mut plan: PlanRef = LogicalUpdate::from(generic::Update::new( - input, + with_new, update.table_name.clone(), update.table_id, update.table_version_id, - update.exprs, + olds, + news, returning, - update_column_indices, )) .into(); diff --git a/src/frontend/src/stream_fragmenter/mod.rs b/src/frontend/src/stream_fragmenter/mod.rs index daa48d99969c..f30b0abf5b4c 100644 --- a/src/frontend/src/stream_fragmenter/mod.rs +++ b/src/frontend/src/stream_fragmenter/mod.rs @@ -361,6 +361,10 @@ fn build_fragment( current_fragment.requires_singleton = true; } + NodeBody::StreamFsFetch(_) => { + current_fragment.fragment_type_mask |= FragmentTypeFlag::FsFetch as u32; + } + _ => {} }; diff --git a/src/frontend/src/test_utils.rs b/src/frontend/src/test_utils.rs index d94b1dd2652d..15a5281dec5e 100644 --- a/src/frontend/src/test_utils.rs +++ b/src/frontend/src/test_utils.rs @@ -56,6 +56,7 @@ use risingwave_pb::meta::list_actor_splits_response::ActorSplit; use risingwave_pb::meta::list_actor_states_response::ActorState; use risingwave_pb::meta::list_fragment_distribution_response::FragmentDistribution; use risingwave_pb::meta::list_object_dependencies_response::PbObjectDependencies; +use risingwave_pb::meta::list_rate_limits_response::RateLimitInfo; use risingwave_pb::meta::list_table_fragment_states_response::TableFragmentState; use risingwave_pb::meta::list_table_fragments_response::TableFragmentInfo; use risingwave_pb::meta::{ @@ -1065,6 +1066,10 @@ impl FrontendMetaClient for MockFrontendMetaClient { async fn get_cluster_limits(&self) -> RpcResult> { Ok(vec![]) } + + async fn list_rate_limits(&self) -> RpcResult> { + Ok(vec![]) + } } #[cfg(test)] diff --git a/src/frontend/src/utils/pretty_serde.rs b/src/frontend/src/utils/pretty_serde.rs index 705267c3163b..e92bb37267a1 100644 --- a/src/frontend/src/utils/pretty_serde.rs +++ b/src/frontend/src/utils/pretty_serde.rs @@ -28,7 +28,9 @@ use pretty_xmlish::Pretty; use serde::ser::{SerializeSeq, SerializeStruct}; use serde::{Serialize, Serializer}; -pub struct PrettySerde<'a>(pub Pretty<'a>); +// Second anymous field is include_children. +// If true the children information will be serialized. 
+pub struct PrettySerde<'a>(pub Pretty<'a>, pub bool); impl Serialize for PrettySerde<'_> { fn serialize(&self, serializer: S) -> Result @@ -46,31 +48,33 @@ impl Serialize for PrettySerde<'_> { &node .fields .iter() - .map(|(k, v)| (k.as_ref(), PrettySerde(v.clone()))) + .map(|(k, v)| (k.as_ref(), PrettySerde(v.clone(), self.1))) .collect::>(), )?; - state.serialize_field( - "children", - &node - .children - .iter() - .map(|c| PrettySerde(c.clone())) - .collect::>(), - )?; + if self.1 { + state.serialize_field( + "children", + &node + .children + .iter() + .map(|c| PrettySerde(c.clone(), self.1)) + .collect::>(), + )?; + } state.end() } Pretty::Array(elements) => { let mut seq = serializer.serialize_seq(Some(elements.len()))?; for element in elements { - seq.serialize_element(&PrettySerde((*element).clone()))?; + seq.serialize_element(&PrettySerde((*element).clone(), self.1))?; } seq.end() } Pretty::Linearized(inner, size) => { let mut state = serializer.serialize_struct("Linearized", 2)?; - state.serialize_field("inner", &PrettySerde((**inner).clone()))?; + state.serialize_field("inner", &PrettySerde((**inner).clone(), self.1))?; state.serialize_field("size", size)?; state.end() } @@ -94,7 +98,7 @@ mod tests { #[test] fn test_pretty_serde() { let pretty = Pretty::childless_record("root", vec![("a", Pretty::Text("1".into()))]); - let pretty_serde = PrettySerde(pretty); + let pretty_serde = PrettySerde(pretty, true); let serialized = serde_json::to_string(&pretty_serde).unwrap(); check( serialized, diff --git a/src/meta/model/migration/src/lib.rs b/src/meta/model/migration/src/lib.rs index b84a29891eee..a27258dca1c1 100644 --- a/src/meta/model/migration/src/lib.rs +++ b/src/meta/model/migration/src/lib.rs @@ -24,6 +24,8 @@ mod m20240820_081248_add_time_travel_per_table_epoch; mod m20240911_083152_variable_vnode_count; mod m20241016_065621_hummock_gc_history; mod m20241025_062548_singleton_vnode_count; +mod m20241115_085007_remove_function_type; +mod m20241120_182555_hummock_add_time_travel_sst_index; mod utils; pub struct Migrator; @@ -86,6 +88,8 @@ impl MigratorTrait for Migrator { Box::new(m20240911_083152_variable_vnode_count::Migration), Box::new(m20241016_065621_hummock_gc_history::Migration), Box::new(m20241025_062548_singleton_vnode_count::Migration), + Box::new(m20241115_085007_remove_function_type::Migration), + Box::new(m20241120_182555_hummock_add_time_travel_sst_index::Migration), ] } } diff --git a/src/meta/model/migration/src/m20240701_060504_hummock_time_travel.rs b/src/meta/model/migration/src/m20240701_060504_hummock_time_travel.rs index 619537078e28..587cd2692396 100644 --- a/src/meta/model/migration/src/m20240701_060504_hummock_time_travel.rs +++ b/src/meta/model/migration/src/m20240701_060504_hummock_time_travel.rs @@ -110,7 +110,7 @@ impl MigrationTrait for Migration { } #[derive(DeriveIden)] -enum HummockSstableInfo { +pub(crate) enum HummockSstableInfo { Table, SstId, ObjectId, diff --git a/src/meta/model/migration/src/m20241115_085007_remove_function_type.rs b/src/meta/model/migration/src/m20241115_085007_remove_function_type.rs new file mode 100644 index 000000000000..b74382991c88 --- /dev/null +++ b/src/meta/model/migration/src/m20241115_085007_remove_function_type.rs @@ -0,0 +1,35 @@ +use sea_orm_migration::prelude::*; + +#[derive(DeriveMigrationName)] +pub struct Migration; + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + manager + .alter_table( + Table::alter() + 
.table(Function::Table) + .drop_column(Function::FunctionType) + .to_owned(), + ) + .await + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + manager + .alter_table( + Table::alter() + .table(Function::Table) + .add_column(ColumnDef::new(Function::FunctionType).string()) + .to_owned(), + ) + .await + } +} + +#[derive(DeriveIden)] +enum Function { + Table, + FunctionType, +} diff --git a/src/meta/model/migration/src/m20241120_182555_hummock_add_time_travel_sst_index.rs b/src/meta/model/migration/src/m20241120_182555_hummock_add_time_travel_sst_index.rs new file mode 100644 index 000000000000..b948dd674ec1 --- /dev/null +++ b/src/meta/model/migration/src/m20241120_182555_hummock_add_time_travel_sst_index.rs @@ -0,0 +1,36 @@ +use sea_orm_migration::prelude::*; + +use crate::m20240701_060504_hummock_time_travel::HummockSstableInfo; + +#[derive(DeriveMigrationName)] +pub struct Migration; + +const IDX_HUMMOCK_SSTABLE_INFO_OBJECT_ID: &str = "idx_hummock_sstable_info_object_id"; + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + manager + .create_index( + Index::create() + .table(HummockSstableInfo::Table) + .name(IDX_HUMMOCK_SSTABLE_INFO_OBJECT_ID) + .col(HummockSstableInfo::ObjectId) + .to_owned(), + ) + .await?; + Ok(()) + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + manager + .drop_index( + Index::drop() + .table(HummockSstableInfo::Table) + .name(IDX_HUMMOCK_SSTABLE_INFO_OBJECT_ID) + .to_owned(), + ) + .await?; + Ok(()) + } +} diff --git a/src/meta/model/src/function.rs b/src/meta/model/src/function.rs index 0fea52c6c348..48e9812999d6 100644 --- a/src/meta/model/src/function.rs +++ b/src/meta/model/src/function.rs @@ -42,14 +42,13 @@ pub struct Model { pub arg_types: DataTypeArray, pub return_type: DataType, pub language: String, + pub runtime: Option, pub link: Option, pub identifier: Option, pub body: Option, pub compressed_binary: Option>, pub kind: FunctionKind, pub always_retry_on_network_error: bool, - pub runtime: Option, - pub function_type: Option, } #[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] @@ -101,14 +100,13 @@ impl From for ActiveModel { arg_types: Set(DataTypeArray::from(function.arg_types)), return_type: Set(DataType::from(&function.return_type.unwrap())), language: Set(function.language), + runtime: Set(function.runtime), link: Set(function.link), identifier: Set(function.identifier), body: Set(function.body), compressed_binary: Set(function.compressed_binary), kind: Set(function.kind.unwrap().into()), always_retry_on_network_error: Set(function.always_retry_on_network_error), - runtime: Set(function.runtime), - function_type: Set(function.function_type), } } } diff --git a/src/meta/node/src/server.rs b/src/meta/node/src/server.rs index f64398c209f0..b929e4b32690 100644 --- a/src/meta/node/src/server.rs +++ b/src/meta/node/src/server.rs @@ -578,7 +578,7 @@ pub async fn start_service_as_election_leader( // sub_tasks executed concurrently. 
Can be shutdown via shutdown_all sub_tasks.extend(hummock::start_hummock_workers( hummock_manager.clone(), - // compaction_scheduler, + backup_manager.clone(), &env.opts, )); sub_tasks.push(start_worker_info_monitor( diff --git a/src/meta/service/src/stream_service.rs b/src/meta/service/src/stream_service.rs index 91f73a292025..4bb9bfb2d448 100644 --- a/src/meta/service/src/stream_service.rs +++ b/src/meta/service/src/stream_service.rs @@ -114,12 +114,12 @@ impl StreamManagerService for StreamServiceImpl { } ThrottleTarget::Mv => { self.metadata_manager - .update_mv_rate_limit_by_table_id(TableId::from(request.id), request.rate) + .update_backfill_rate_limit_by_table_id(TableId::from(request.id), request.rate) .await? } ThrottleTarget::CdcTable => { self.metadata_manager - .update_mv_rate_limit_by_table_id(TableId::from(request.id), request.rate) + .update_backfill_rate_limit_by_table_id(TableId::from(request.id), request.rate) .await? } ThrottleTarget::Unspecified => { @@ -433,4 +433,16 @@ impl StreamManagerService for StreamServiceImpl { Ok(Response::new(ListActorSplitsResponse { actor_splits })) } + + async fn list_rate_limits( + &self, + _request: Request, + ) -> Result, Status> { + let rate_limits = self + .metadata_manager + .catalog_controller + .list_rate_limits() + .await?; + Ok(Response::new(ListRateLimitsResponse { rate_limits })) + } } diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index d2dd3058544c..f19feadabed5 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -101,7 +101,7 @@ pub struct ReplaceTablePlan { /// The `StreamingJob` info of the table to be replaced. Must be `StreamingJob::Table` pub streaming_job: StreamingJob, /// The temporary dummy table fragments id of new table fragment - pub dummy_id: u32, + pub tmp_id: u32, } impl ReplaceTablePlan { diff --git a/src/meta/src/barrier/context/recovery.rs b/src/meta/src/barrier/context/recovery.rs index 83e7cd13919a..344d8af0a1e6 100644 --- a/src/meta/src/barrier/context/recovery.rs +++ b/src/meta/src/barrier/context/recovery.rs @@ -142,6 +142,12 @@ impl GlobalBarrierWorkerContextImpl { .list_background_creating_jobs() .await?; + info!( + "background streaming jobs: {:?} total {}", + background_streaming_jobs, + background_streaming_jobs.len() + ); + // Resolve actor info for recovery. If there's no actor to recover, most of the // following steps will be no-op, while the compute nodes will still be reset. // FIXME: Transactions should be used. @@ -149,6 +155,7 @@ impl GlobalBarrierWorkerContextImpl { let mut info = if !self.env.opts.disable_automatic_parallelism_control && background_streaming_jobs.is_empty() { + info!("trigger offline scaling"); self.scale_actors(&active_streaming_nodes) .await .inspect_err(|err| { @@ -159,6 +166,7 @@ impl GlobalBarrierWorkerContextImpl { warn!(error = %err.as_report(), "resolve actor info failed"); })? } else { + info!("trigger actor migration"); // Migrate actors in expired CN to newly joined one. 
self.migrate_actors(&mut active_streaming_nodes) .await @@ -376,7 +384,7 @@ impl GlobalBarrierWorkerContextImpl { mgr.catalog_controller.migrate_actors(plan).await?; - debug!("migrate actors succeed."); + info!("migrate actors succeed."); self.resolve_graph_info().await } @@ -447,6 +455,11 @@ impl GlobalBarrierWorkerContextImpl { result }; + info!( + "target table parallelisms for offline scaling: {:?}", + table_parallelisms + ); + let schedulable_worker_ids = active_nodes .current() .values() @@ -460,6 +473,11 @@ impl GlobalBarrierWorkerContextImpl { .map(|worker| worker.id as WorkerId) .collect(); + info!( + "target worker ids for offline scaling: {:?}", + schedulable_worker_ids + ); + let plan = self .scale_controller .generate_table_resize_plan(TableResizePolicy { @@ -497,6 +515,8 @@ impl GlobalBarrierWorkerContextImpl { // Because custom parallelism doesn't exist, this function won't result in a no-shuffle rewrite for table parallelisms. debug_assert_eq!(compared_table_parallelisms, table_parallelisms); + info!("post applying reschedule for offline scaling"); + if let Err(e) = self .scale_controller .post_apply_reschedule(&reschedule_fragment, &table_parallelisms) @@ -510,7 +530,7 @@ impl GlobalBarrierWorkerContextImpl { return Err(e); } - debug!("scaling actors succeed."); + info!("scaling actors succeed."); Ok(()) } diff --git a/src/meta/src/controller/catalog.rs b/src/meta/src/controller/catalog.rs index 6af435309064..f3dac5c39e2f 100644 --- a/src/meta/src/controller/catalog.rs +++ b/src/meta/src/controller/catalog.rs @@ -2259,7 +2259,6 @@ impl CatalogController { .filter(|obj| { obj.obj_type == ObjectType::Table || obj.obj_type == ObjectType::Sink - || obj.obj_type == ObjectType::Subscription || obj.obj_type == ObjectType::Index }) .map(|obj| obj.oid) diff --git a/src/meta/src/controller/mod.rs b/src/meta/src/controller/mod.rs index c7cf45daad9e..0c64461ab2ee 100644 --- a/src/meta/src/controller/mod.rs +++ b/src/meta/src/controller/mod.rs @@ -357,14 +357,13 @@ impl From> for PbFunction { arg_types: value.0.arg_types.to_protobuf(), return_type: Some(value.0.return_type.to_protobuf()), language: value.0.language, + runtime: value.0.runtime, link: value.0.link, identifier: value.0.identifier, body: value.0.body, compressed_binary: value.0.compressed_binary, kind: Some(value.0.kind.into()), always_retry_on_network_error: value.0.always_retry_on_network_error, - runtime: value.0.runtime, - function_type: value.0.function_type, } } } diff --git a/src/meta/src/controller/streaming_job.rs b/src/meta/src/controller/streaming_job.rs index 2da096268cff..d5ee31efae24 100644 --- a/src/meta/src/controller/streaming_job.rs +++ b/src/meta/src/controller/streaming_job.rs @@ -39,6 +39,7 @@ use risingwave_meta_model::{ use risingwave_pb::catalog::source::PbOptionalAssociatedTableId; use risingwave_pb::catalog::table::{PbOptionalAssociatedSourceId, PbTableVersion}; use risingwave_pb::catalog::{PbCreateType, PbTable}; +use risingwave_pb::meta::list_rate_limits_response::RateLimitInfo; use risingwave_pb::meta::relation::{PbRelationInfo, RelationInfo}; use risingwave_pb::meta::subscribe_response::{ Info as NotificationInfo, Info, Operation as NotificationOperation, Operation, @@ -53,12 +54,12 @@ use risingwave_pb::stream_plan::update_mutation::PbMergeUpdate; use risingwave_pb::stream_plan::{ PbDispatcher, PbDispatcherType, PbFragmentTypeFlag, PbStreamActor, }; -use sea_orm::sea_query::{Expr, Query, SimpleExpr}; +use sea_orm::sea_query::{BinOper, Expr, Query, SimpleExpr}; use sea_orm::ActiveValue::Set; 
use sea_orm::{ ActiveEnum, ActiveModelTrait, ColumnTrait, DatabaseTransaction, EntityTrait, IntoActiveModel, - JoinType, ModelTrait, NotSet, PaginatorTrait, QueryFilter, QuerySelect, RelationTrait, - TransactionTrait, + IntoSimpleExpr, JoinType, ModelTrait, NotSet, PaginatorTrait, QueryFilter, QuerySelect, + RelationTrait, TransactionTrait, }; use crate::barrier::{ReplaceTablePlan, Reschedule}; @@ -909,13 +910,13 @@ impl CatalogController { Some(ReplaceTablePlan { streaming_job, merge_updates, - dummy_id, + tmp_id, .. }) => { let incoming_sink_id = job_id; let (relations, fragment_mapping) = Self::finish_replace_streaming_job_inner( - dummy_id as ObjectId, + tmp_id as ObjectId, merge_updates, None, Some(incoming_sink_id as _), @@ -964,7 +965,7 @@ impl CatalogController { pub async fn finish_replace_streaming_job( &self, - dummy_id: ObjectId, + tmp_id: ObjectId, streaming_job: StreamingJob, merge_updates: Vec, table_col_index_mapping: Option, @@ -976,7 +977,7 @@ impl CatalogController { let txn = inner.db.begin().await?; let (relations, fragment_mapping) = Self::finish_replace_streaming_job_inner( - dummy_id, + tmp_id, merge_updates, table_col_index_mapping, creating_sink_id, @@ -1007,7 +1008,7 @@ impl CatalogController { } pub async fn finish_replace_streaming_job_inner( - dummy_id: ObjectId, + tmp_id: ObjectId, merge_updates: Vec, table_col_index_mapping: Option, creating_sink_id: Option, @@ -1065,7 +1066,7 @@ impl CatalogController { fragment::Column::FragmentId, fragment::Column::StateTableIds, ]) - .filter(fragment::Column::JobId.eq(dummy_id)) + .filter(fragment::Column::JobId.eq(tmp_id)) .into_tuple() .all(txn) .await?; @@ -1090,7 +1091,7 @@ impl CatalogController { .await?; Fragment::update_many() .col_expr(fragment::Column::JobId, SimpleExpr::from(job_id)) - .filter(fragment::Column::JobId.eq(dummy_id)) + .filter(fragment::Column::JobId.eq(tmp_id)) .exec(txn) .await?; @@ -1189,7 +1190,7 @@ impl CatalogController { } // 3. remove dummy object. - Object::delete_by_id(dummy_id).exec(txn).await?; + Object::delete_by_id(tmp_id).exec(txn).await?; // 4. update catalogs and notify. let mut relations = vec![]; @@ -1317,7 +1318,6 @@ impl CatalogController { .map(|(id, mask, stream_node)| (id, mask, stream_node.to_protobuf())) .collect_vec(); - // TODO: limit source backfill? 
fragments.retain_mut(|(_, fragment_type_mask, stream_node)| { let mut found = false; if *fragment_type_mask & PbFragmentTypeFlag::Source as i32 != 0 { @@ -1332,11 +1332,12 @@ impl CatalogController { } }); } - if is_fs_source && *fragment_type_mask == PbFragmentTypeFlag::FragmentUnspecified as i32 - { - // when create table with fs connector, the fragment type is unspecified + if is_fs_source { + // in older versions, there's no fragment type flag for `FsFetch` node, + // so we just scan all fragments for StreamFsFetch node if using fs connector visit_stream_node(stream_node, |node| { if let PbNodeBody::StreamFsFetch(node) = node { + *fragment_type_mask |= PbFragmentTypeFlag::FsFetch as i32; if let Some(node_inner) = &mut node.node_inner && node_inner.source_id == source_id as u32 { @@ -1354,9 +1355,10 @@ impl CatalogController { "source id should be used by at least one fragment" ); let fragment_ids = fragments.iter().map(|(id, _, _)| *id).collect_vec(); - for (id, _, stream_node) in fragments { + for (id, fragment_type_mask, stream_node) in fragments { fragment::ActiveModel { fragment_id: Set(id), + fragment_type_mask: Set(fragment_type_mask), stream_node: Set(StreamNode::from(&stream_node)), ..Default::default() } @@ -1385,7 +1387,7 @@ impl CatalogController { // edit the `rate_limit` of the `Chain` node in given `table_id`'s fragments // return the actor_ids to be applied - pub async fn update_mv_rate_limit_by_job_id( + pub async fn update_backfill_rate_limit_by_job_id( &self, job_id: ObjectId, rate_limit: Option, @@ -1411,9 +1413,7 @@ impl CatalogController { fragments.retain_mut(|(_, fragment_type_mask, stream_node)| { let mut found = false; - if (*fragment_type_mask & PbFragmentTypeFlag::StreamScan as i32 != 0) - || (*fragment_type_mask & PbFragmentTypeFlag::Source as i32 != 0) - { + if *fragment_type_mask & PbFragmentTypeFlag::backfill_rate_limit_fragments() != 0 { visit_stream_node(stream_node, |node| match node { PbNodeBody::StreamCdcScan(node) => { node.rate_limit = rate_limit; @@ -1423,11 +1423,9 @@ impl CatalogController { node.rate_limit = rate_limit; found = true; } - PbNodeBody::Source(node) => { - if let Some(inner) = node.source_inner.as_mut() { - inner.rate_limit = rate_limit; - found = true; - } + PbNodeBody::SourceBackfill(node) => { + node.rate_limit = rate_limit; + found = true; } _ => {} }); @@ -1782,4 +1780,107 @@ impl CatalogController { Ok(()) } + + /// Note: `FsFetch` created in old versions are not included. + /// Since this is only used for debugging, it should be fine. 
+ pub async fn list_rate_limits(&self) -> MetaResult> { + let inner = self.inner.read().await; + let txn = inner.db.begin().await?; + + let fragments: Vec<(FragmentId, ObjectId, i32, StreamNode)> = Fragment::find() + .select_only() + .columns([ + fragment::Column::FragmentId, + fragment::Column::JobId, + fragment::Column::FragmentTypeMask, + fragment::Column::StreamNode, + ]) + .filter(fragment_type_mask_intersects( + PbFragmentTypeFlag::rate_limit_fragments(), + )) + .into_tuple() + .all(&txn) + .await?; + + let mut rate_limits = Vec::new(); + for (fragment_id, job_id, fragment_type_mask, stream_node) in fragments { + let mut stream_node = stream_node.to_protobuf(); + let mut rate_limit = None; + let mut node_name = None; + + visit_stream_node(&mut stream_node, |node| { + match node { + // source rate limit + PbNodeBody::Source(node) => { + if let Some(node_inner) = &mut node.source_inner { + debug_assert!( + rate_limit.is_none(), + "one fragment should only have 1 rate limit node" + ); + rate_limit = node_inner.rate_limit; + node_name = Some("SOURCE"); + } + } + PbNodeBody::StreamFsFetch(node) => { + if let Some(node_inner) = &mut node.node_inner { + debug_assert!( + rate_limit.is_none(), + "one fragment should only have 1 rate limit node" + ); + rate_limit = node_inner.rate_limit; + node_name = Some("FS_FETCH"); + } + } + // backfill rate limit + PbNodeBody::SourceBackfill(node) => { + debug_assert!( + rate_limit.is_none(), + "one fragment should only have 1 rate limit node" + ); + rate_limit = node.rate_limit; + node_name = Some("SOURCE_BACKFILL"); + } + PbNodeBody::StreamScan(node) => { + debug_assert!( + rate_limit.is_none(), + "one fragment should only have 1 rate limit node" + ); + rate_limit = node.rate_limit; + node_name = Some("STREAM_SCAN"); + } + PbNodeBody::StreamCdcScan(node) => { + debug_assert!( + rate_limit.is_none(), + "one fragment should only have 1 rate limit node" + ); + rate_limit = node.rate_limit; + node_name = Some("STREAM_CDC_SCAN"); + } + _ => {} + } + }); + + if let Some(rate_limit) = rate_limit { + rate_limits.push(RateLimitInfo { + fragment_id: fragment_id as u32, + job_id: job_id as u32, + fragment_type_mask: fragment_type_mask as u32, + rate_limit, + node_name: node_name.unwrap().to_string(), + }); + } + } + + Ok(rate_limits) + } +} + +fn bitflag_intersects(column: SimpleExpr, value: i32) -> SimpleExpr { + column + .binary(BinOper::Custom("&"), value) + .binary(BinOper::NotEqual, 0) +} + +fn fragment_type_mask_intersects(value: i32) -> SimpleExpr { + bitflag_intersects(fragment::Column::FragmentTypeMask.into_simple_expr(), value) } diff --git a/src/meta/src/hummock/manager/checkpoint.rs b/src/meta/src/hummock/manager/checkpoint.rs index 394933109844..12f831fca3c6 100644 --- a/src/meta/src/hummock/manager/checkpoint.rs +++ b/src/meta/src/hummock/manager/checkpoint.rs @@ -220,8 +220,17 @@ impl HummockManager { .collect(), }); } - // We can directly discard reference to stale objects that will no longer be used. 
let min_pinned_version_id = self.context_info.read().await.min_pinned_version_id(); + let may_delete_object = stale_objects + .iter() + .filter_map(|(version_id, object_ids)| { + if *version_id >= min_pinned_version_id { + return None; + } + Some(object_ids.id.clone()) + }) + .flatten(); + self.gc_manager.add_may_delete_object_ids(may_delete_object); stale_objects.retain(|version_id, _| *version_id >= min_pinned_version_id); let new_checkpoint = HummockVersionCheckpoint { version: current_version.clone(), diff --git a/src/meta/src/hummock/manager/compaction/mod.rs b/src/meta/src/hummock/manager/compaction/mod.rs index cf2c448f1002..6b8df2012052 100644 --- a/src/meta/src/hummock/manager/compaction/mod.rs +++ b/src/meta/src/hummock/manager/compaction/mod.rs @@ -643,6 +643,11 @@ impl HummockManager { self.env.notification_manager(), &self.metrics, ); + // Apply stats changes. + let mut version_stats = HummockVersionStatsTransaction::new( + &mut versioning.version_stats, + self.env.notification_manager(), + ); if deterministic_mode { version.disable_apply_to_txn(); @@ -808,6 +813,10 @@ impl HummockManager { .sorted_output_ssts .clone_from(&compact_task.input_ssts[0].table_infos); } + update_table_stats_for_vnode_watermark_trivial_reclaim( + &mut version_stats.table_stats, + &compact_task, + ); self.metrics .compact_frequency .with_label_values(&[ @@ -878,7 +887,8 @@ impl HummockManager { self.meta_store_ref(), compaction_statuses, compact_task_assignment, - version + version, + version_stats )?; self.metrics .compact_task_batch_count @@ -1674,3 +1684,34 @@ pub struct CompactionGroupStatistic { pub table_statistic: BTreeMap, pub compaction_group_config: CompactionGroup, } + +/// Updates table stats caused by vnode watermark trivial reclaim compaction. +fn update_table_stats_for_vnode_watermark_trivial_reclaim( + table_stats: &mut PbTableStatsMap, + task: &CompactTask, +) { + if task.task_type != TaskType::VnodeWatermark { + return; + } + let mut deleted_table_keys: HashMap = HashMap::default(); + for s in task.input_ssts.iter().flat_map(|l| l.table_infos.iter()) { + assert_eq!(s.table_ids.len(), 1); + let e = deleted_table_keys.entry(s.table_ids[0]).or_insert(0); + *e += s.total_key_count; + } + for (table_id, delete_count) in deleted_table_keys { + let Some(stats) = table_stats.get_mut(&table_id) else { + continue; + }; + if stats.total_key_count == 0 { + continue; + } + let new_total_key_count = stats.total_key_count.saturating_sub(delete_count as i64); + let ratio = new_total_key_count as f64 / stats.total_key_count as f64; + // total_key_count is updated accurately. + stats.total_key_count = new_total_key_count; + // others are updated approximately. + stats.total_key_size = (stats.total_key_size as f64 * ratio).ceil() as i64; + stats.total_value_size = (stats.total_value_size as f64 * ratio).ceil() as i64; + } +} diff --git a/src/meta/src/hummock/manager/gc.rs b/src/meta/src/hummock/manager/gc.rs index 1cf3ae25d171..082a19545e6d 100644 --- a/src/meta/src/hummock/manager/gc.rs +++ b/src/meta/src/hummock/manager/gc.rs @@ -48,6 +48,8 @@ pub(crate) struct GcManager { store: ObjectStoreRef, path_prefix: String, use_new_object_prefix_strategy: bool, + /// These objects may still be used by backup or time travel. 
+ may_delete_object_ids: parking_lot::Mutex>, } impl GcManager { @@ -60,6 +62,7 @@ impl GcManager { store, path_prefix: path_prefix.to_owned(), use_new_object_prefix_strategy, + may_delete_object_ids: Default::default(), } } @@ -101,7 +104,7 @@ impl GcManager { prefix: Option, start_after: Option, limit: Option, - ) -> Result<(Vec, u64, u64, Option)> { + ) -> Result<(HashSet, u64, u64, Option)> { tracing::debug!( sst_retention_watermark, prefix, @@ -139,7 +142,7 @@ impl GcManager { }; async move { result } }) - .try_collect::>() + .try_collect::>() .await?; Ok(( filtered, @@ -148,6 +151,28 @@ impl GcManager { next_start_after, )) } + + pub fn add_may_delete_object_ids( + &self, + may_delete_object_ids: impl Iterator, + ) { + self.may_delete_object_ids + .lock() + .extend(may_delete_object_ids); + } + + /// Takes and returns the accumulated object ids only if at least `least_count` elements are available. + pub fn try_take_may_delete_object_ids( + &self, + least_count: usize, + ) -> Option> { + let mut guard = self.may_delete_object_ids.lock(); + if guard.len() < least_count { + None + } else { + Some(std::mem::take(&mut *guard)) + } + } } impl HummockManager { @@ -295,6 +320,15 @@ impl HummockManager { tracing::info!(total_object_count, total_object_size, "Finish GC"); self.metrics.total_object_size.set(total_object_size as _); self.metrics.total_object_count.set(total_object_count as _); + match self.time_travel_pinned_object_count().await { + Ok(count) => { + self.metrics.time_travel_object_count.set(count as _); + } + Err(err) => { + use thiserror_ext::AsReport; + tracing::warn!(error = %err.as_report(), "Failed to count time travel objects."); + } + } Ok(()) } @@ -302,7 +336,7 @@ impl HummockManager { /// Returns number of SSTs to delete. pub(crate) async fn complete_gc_batch( &self, - object_ids: Vec, + object_ids: HashSet, backup_manager: Option, ) -> Result { if object_ids.is_empty() { @@ -324,31 +358,23 @@ impl HummockManager { metrics .full_gc_candidate_object_count .observe(candidate_object_number as _); - let pinned_object_ids = self - .all_object_ids_in_time_travel() - .await? - .collect::>(); - self.metrics - .time_travel_object_count - .set(pinned_object_ids.len() as _); - // filter by SST id watermark, i.e. minimum id of uncommitted SSTs reported by compute nodes. + // filter by metadata backup let object_ids = object_ids .into_iter() - .filter(|id| *id < min_sst_id) + .filter(|s| !pinned_by_metadata_backup.contains(s)) .collect_vec(); - let after_min_sst_id = object_ids.len(); + let after_metadata_backup = object_ids.len(); // filter by time travel archive - let object_ids = object_ids - .into_iter() - .filter(|s| !pinned_object_ids.contains(s)) - .collect_vec(); + let object_ids = self + .filter_out_objects_by_time_travel(object_ids.into_iter()) + .await?; let after_time_travel = object_ids.len(); - // filter by metadata backup + // filter by SST id watermark, i.e. minimum id of uncommitted SSTs reported by compute nodes.
let object_ids = object_ids .into_iter() - .filter(|s| !pinned_by_metadata_backup.contains(s)) + .filter(|id| *id < min_sst_id) .collect_vec(); - let after_metadata_backup = object_ids.len(); + let after_min_sst_id = object_ids.len(); // filter by version let after_version = self .finalize_objects_to_delete(object_ids.into_iter()) @@ -359,9 +385,9 @@ impl HummockManager { .observe(after_version_count as _); tracing::info!( candidate_object_number, - after_min_sst_id, - after_time_travel, after_metadata_backup, + after_time_travel, + after_min_sst_id, after_version_count, "complete gc batch" ); @@ -501,6 +527,27 @@ impl HummockManager { } Ok(total) } + + /// Minor GC attempts to delete objects that were part of Hummock version but are no longer in use. + pub async fn try_start_minor_gc(&self, backup_manager: BackupManagerRef) -> Result<()> { + const MIN_MINOR_GC_OBJECT_COUNT: usize = 1000; + let Some(object_ids) = self + .gc_manager + .try_take_may_delete_object_ids(MIN_MINOR_GC_OBJECT_COUNT) + else { + return Ok(()); + }; + // Objects pinned by either meta backup or time travel should be filtered out. + let backup_pinned: HashSet<_> = backup_manager.list_pinned_ssts(); + let object_ids = object_ids + .into_iter() + .filter(|s| !backup_pinned.contains(s)); + let object_ids = self.filter_out_objects_by_time_travel(object_ids).await?; + // Retry is not necessary. Full GC will handle these objects eventually. + self.delete_objects(object_ids.into_iter().collect()) + .await?; + Ok(()) + } } async fn collect_min_uncommitted_sst_id( @@ -580,7 +627,7 @@ mod tests { // Empty input results immediate return, without waiting heartbeat. hummock_manager - .complete_gc_batch(vec![], None) + .complete_gc_batch(vec![].into_iter().collect(), None) .await .unwrap(); @@ -590,7 +637,9 @@ mod tests { 3, hummock_manager .complete_gc_batch( - vec![i64::MAX as u64 - 2, i64::MAX as u64 - 1, i64::MAX as u64], + vec![i64::MAX as u64 - 2, i64::MAX as u64 - 1, i64::MAX as u64] + .into_iter() + .collect(), None, ) .await @@ -616,7 +665,10 @@ mod tests { 1, hummock_manager .complete_gc_batch( - [committed_object_ids, vec![max_committed_object_id + 1]].concat(), + [committed_object_ids, vec![max_committed_object_id + 1]] + .concat() + .into_iter() + .collect(), None, ) .await diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index 1ceaad5cfd39..9391169f00a2 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -15,7 +15,6 @@ use std::collections::{HashMap, HashSet, VecDeque}; use anyhow::anyhow; -use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::util::epoch::Epoch; use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; @@ -35,8 +34,8 @@ use risingwave_meta_model::{ use risingwave_pb::hummock::{PbHummockVersion, PbHummockVersionDelta}; use sea_orm::ActiveValue::Set; use sea_orm::{ - ColumnTrait, Condition, DatabaseTransaction, EntityTrait, QueryFilter, QueryOrder, QuerySelect, - TransactionTrait, + ColumnTrait, Condition, DatabaseTransaction, EntityTrait, PaginatorTrait, QueryFilter, + QueryOrder, QuerySelect, TransactionTrait, }; use crate::hummock::error::{Error, Result}; @@ -54,7 +53,7 @@ impl HummockManager { .order_by_desc(hummock_time_travel_version::Column::VersionId) .one(&sql_store.conn) .await? 
- .map(|v| HummockVersion::from_persisted_protobuf(&v.version.to_protobuf())) + .map(|v| IncompleteHummockVersion::from_persisted_protobuf(&v.version.to_protobuf())) else { return Ok(()); }; @@ -66,9 +65,9 @@ impl HummockManager { &self, epoch_watermark: HummockEpoch, ) -> Result<()> { + let min_pinned_version_id = self.context_info.read().await.min_pinned_version_id(); let sql_store = self.env.meta_store_ref(); let txn = sql_store.conn.begin().await?; - let version_watermark = hummock_epoch_to_version::Entity::find() .filter( hummock_epoch_to_version::Column::Epoch @@ -82,6 +81,10 @@ impl HummockManager { txn.commit().await?; return Ok(()); }; + let watermark_version_id = std::cmp::min( + version_watermark.version_id, + min_pinned_version_id.to_u64().try_into().unwrap(), + ); let res = hummock_epoch_to_version::Entity::delete_many() .filter( hummock_epoch_to_version::Column::Epoch @@ -94,31 +97,35 @@ impl HummockManager { "delete {} rows from hummock_epoch_to_version", res.rows_affected ); - let earliest_valid_version = hummock_time_travel_version::Entity::find() - .filter( - hummock_time_travel_version::Column::VersionId.lte(version_watermark.version_id), - ) + let latest_valid_version = hummock_time_travel_version::Entity::find() + .filter(hummock_time_travel_version::Column::VersionId.lte(watermark_version_id)) .order_by_desc(hummock_time_travel_version::Column::VersionId) .one(&txn) .await? - .map(|m| HummockVersion::from_persisted_protobuf(&m.version.to_protobuf())); - let Some(earliest_valid_version) = earliest_valid_version else { + .map(|m| IncompleteHummockVersion::from_persisted_protobuf(&m.version.to_protobuf())); + let Some(latest_valid_version) = latest_valid_version else { txn.commit().await?; return Ok(()); }; - let (earliest_valid_version_id, earliest_valid_version_sst_ids) = { + let ( + latest_valid_version_id, + latest_valid_version_sst_ids, + latest_valid_version_object_ids, + ) = { ( - earliest_valid_version.id, - earliest_valid_version.get_sst_ids(), + latest_valid_version.id, + latest_valid_version.get_sst_ids(), + latest_valid_version.get_object_ids(), ) }; + let mut object_ids_to_delete: HashSet<_> = HashSet::default(); let version_ids_to_delete: Vec = hummock_time_travel_version::Entity::find() .select_only() .column(hummock_time_travel_version::Column::VersionId) .filter( hummock_time_travel_version::Column::VersionId - .lt(earliest_valid_version_id.to_u64()), + .lt(latest_valid_version_id.to_u64()), ) .order_by_desc(hummock_time_travel_version::Column::VersionId) .into_tuple() @@ -130,7 +137,7 @@ impl HummockManager { .column(hummock_time_travel_delta::Column::VersionId) .filter( hummock_time_travel_delta::Column::VersionId - .lt(earliest_valid_version_id.to_u64()), + .lt(latest_valid_version_id.to_u64()), ) .into_tuple() .all(&txn) @@ -145,25 +152,27 @@ impl HummockManager { delta_id_to_delete ))) })?; - let new_sst_ids = HummockVersionDelta::from_persisted_protobuf( + let delta_to_delete = IncompleteHummockVersionDelta::from_persisted_protobuf( &delta_to_delete.version_delta.to_protobuf(), - ) - .newly_added_sst_ids(); + ); + let new_sst_ids = delta_to_delete.newly_added_sst_ids(); // The SST ids added and then deleted by compaction between the 2 versions. 
- let sst_ids_to_delete = &new_sst_ids - &earliest_valid_version_sst_ids; + let sst_ids_to_delete = &new_sst_ids - &latest_valid_version_sst_ids; let res = hummock_sstable_info::Entity::delete_many() .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; + let new_object_ids = delta_to_delete.newly_added_object_ids(); + object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( - delta_id = delta_to_delete.version_id, + delta_id = delta_to_delete.id.to_u64(), "delete {} rows from hummock_sstable_info", res.rows_affected ); } - let mut next_version_sst_ids = earliest_valid_version_sst_ids; + let mut next_version_sst_ids = latest_valid_version_sst_ids; for prev_version_id in version_ids_to_delete { - let sst_ids = { + let prev_version = { let prev_version = hummock_time_travel_version::Entity::find_by_id(prev_version_id) .one(&txn) .await? @@ -173,15 +182,19 @@ impl HummockManager { prev_version_id ))) })?; - HummockVersion::from_persisted_protobuf(&prev_version.version.to_protobuf()) - .get_sst_ids() + IncompleteHummockVersion::from_persisted_protobuf( + &prev_version.version.to_protobuf(), + ) }; + let sst_ids = prev_version.get_sst_ids(); // The SST ids deleted by compaction between the 2 versions. let sst_ids_to_delete = &sst_ids - &next_version_sst_ids; let res = hummock_sstable_info::Entity::delete_many() .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; + let new_object_ids = prev_version.get_object_ids(); + object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( prev_version_id, "delete {} rows from hummock_sstable_info", @@ -189,30 +202,33 @@ impl HummockManager { ); next_version_sst_ids = sst_ids; } + if !object_ids_to_delete.is_empty() { + self.gc_manager + .add_may_delete_object_ids(object_ids_to_delete.into_iter()); + } let res = hummock_time_travel_version::Entity::delete_many() .filter( - hummock_time_travel_version::Column::VersionId - .lt(earliest_valid_version_id.to_u64()), + hummock_time_travel_version::Column::VersionId.lt(latest_valid_version_id.to_u64()), ) .exec(&txn) .await?; tracing::debug!( - epoch_watermark_version_id = ?version_watermark.version_id, - ?earliest_valid_version_id, + epoch_watermark_version_id = ?watermark_version_id, + ?latest_valid_version_id, "delete {} rows from hummock_time_travel_version", res.rows_affected ); let res = hummock_time_travel_delta::Entity::delete_many() .filter( - hummock_time_travel_delta::Column::VersionId.lt(earliest_valid_version_id.to_u64()), + hummock_time_travel_delta::Column::VersionId.lt(latest_valid_version_id.to_u64()), ) .exec(&txn) .await?; tracing::debug!( - epoch_watermark_version_id = ?version_watermark.version_id, - ?earliest_valid_version_id, + epoch_watermark_version_id = ?watermark_version_id, + ?latest_valid_version_id, "delete {} rows from hummock_time_travel_delta", res.rows_affected ); @@ -221,21 +237,38 @@ impl HummockManager { Ok(()) } - pub(crate) async fn all_object_ids_in_time_travel( + pub(crate) async fn filter_out_objects_by_time_travel( &self, - ) -> Result> { - let object_ids: Vec = - hummock_sstable_info::Entity::find() - .select_only() - .column(hummock_sstable_info::Column::ObjectId) - .into_tuple() - .all(&self.env.meta_store_ref().conn) - .await?; - let object_ids = object_ids - .into_iter() - .unique() - .map(|object_id| HummockSstableObjectId::try_from(object_id).unwrap()); - Ok(object_ids) + objects: impl Iterator, + ) -> Result> { + 
// The input object count is much smaller than time travel pinned object count in meta store. + // So we search the input objects in the meta store. + let mut result: HashSet<_> = objects.collect(); + let mut remain: VecDeque<_> = result.iter().copied().collect(); + const FILTER_BATCH_SIZE: usize = 1000; + while !remain.is_empty() { + let batch = remain.drain(..std::cmp::min(remain.len(), FILTER_BATCH_SIZE)); + let reject_object_ids: Vec = + hummock_sstable_info::Entity::find() + .filter(hummock_sstable_info::Column::ObjectId.is_in(batch)) + .select_only() + .column(hummock_sstable_info::Column::ObjectId) + .into_tuple() + .all(&self.env.meta_store_ref().conn) + .await?; + for reject in reject_object_ids { + let object_id = HummockSstableObjectId::try_from(reject).unwrap(); + result.remove(&object_id); + } + } + Ok(result) + } + + pub(crate) async fn time_travel_pinned_object_count(&self) -> Result { + let count = hummock_sstable_info::Entity::find() + .count(&self.env.meta_store_ref().conn) + .await?; + Ok(count) + } /// Attempt to locate the version corresponding to `query_epoch`. @@ -457,10 +490,13 @@ impl HummockManager { } } +/// The `HummockVersion` is actually `InHummockVersion`. It requires `refill_version`. fn replay_archive( version: PbHummockVersion, deltas: impl Iterator, ) -> HummockVersion { + // The pb version and pb version delta are actually written by InHummockVersion and InHummockVersionDelta, respectively. + // Using HummockVersion makes it easier for `refill_version` later. let mut last_version = HummockVersion::from_persisted_protobuf(&version); for d in deltas { let d = HummockVersionDelta::from_persisted_protobuf(&d); diff --git a/src/meta/src/hummock/mod.rs b/src/meta/src/hummock/mod.rs index 7b8af79ae960..cb4be256a455 100644 --- a/src/meta/src/hummock/mod.rs +++ b/src/meta/src/hummock/mod.rs @@ -15,6 +15,7 @@ pub mod compaction; pub mod compactor_manager; pub mod error; mod manager; + pub use manager::*; use thiserror_ext::AsReport; @@ -33,11 +34,13 @@ pub use mock_hummock_meta_client::MockHummockMetaClient; use tokio::sync::oneshot::Sender; use tokio::task::JoinHandle; +use crate::backup_restore::BackupManagerRef; use crate::MetaOpts; /// Start hummock's asynchronous tasks.
pub fn start_hummock_workers( hummock_manager: HummockManagerRef, + backup_manager: BackupManagerRef, meta_opts: &MetaOpts, ) -> Vec<(JoinHandle<()>, Sender<()>)> { // These critical tasks are put in their own timer loop deliberately, to avoid long-running ones @@ -45,6 +48,7 @@ pub fn start_hummock_workers( let workers = vec![ start_checkpoint_loop( hummock_manager.clone(), + backup_manager, Duration::from_secs(meta_opts.hummock_version_checkpoint_interval_sec), meta_opts.min_delta_log_num_for_hummock_version_checkpoint, ), @@ -85,6 +89,7 @@ pub fn start_vacuum_metadata_loop( pub fn start_checkpoint_loop( hummock_manager: HummockManagerRef, + backup_manager: BackupManagerRef, interval: Duration, min_delta_log_num: u64, ) -> (JoinHandle<()>, Sender<()>) { @@ -111,7 +116,18 @@ pub fn start_checkpoint_loop( .create_version_checkpoint(min_delta_log_num) .await { - tracing::warn!(error = %err.as_report(), "Hummock version checkpoint error"); + tracing::warn!(error = %err.as_report(), "Hummock version checkpoint error."); + } else { + let backup_manager_2 = backup_manager.clone(); + let hummock_manager_2 = hummock_manager.clone(); + tokio::task::spawn(async move { + let _ = hummock_manager_2 + .try_start_minor_gc(backup_manager_2) + .await + .inspect_err(|err| { + tracing::warn!(error = %err.as_report(), "Hummock minor GC error."); + }); + }); } } }); diff --git a/src/meta/src/manager/metadata.rs b/src/meta/src/manager/metadata.rs index 40d3c025c0c8..b974ad82b053 100644 --- a/src/meta/src/manager/metadata.rs +++ b/src/meta/src/manager/metadata.rs @@ -25,6 +25,7 @@ use risingwave_pb::catalog::{PbSink, PbSource, PbTable}; use risingwave_pb::common::worker_node::{PbResource, State}; use risingwave_pb::common::{HostAddress, PbWorkerNode, PbWorkerType, WorkerNode, WorkerType}; use risingwave_pb::meta::add_worker_node_request::Property as AddNodeProperty; +use risingwave_pb::meta::list_rate_limits_response::RateLimitInfo; use risingwave_pb::meta::table_fragments::{Fragment, PbFragment}; use risingwave_pb::stream_plan::{PbDispatchStrategy, StreamActor}; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver}; @@ -657,14 +658,14 @@ impl MetadataManager { .collect()) } - pub async fn update_mv_rate_limit_by_table_id( + pub async fn update_backfill_rate_limit_by_table_id( &self, table_id: TableId, rate_limit: Option, ) -> MetaResult>> { let fragment_actors = self .catalog_controller - .update_mv_rate_limit_by_job_id(table_id.table_id as _, rate_limit) + .update_backfill_rate_limit_by_job_id(table_id.table_id as _, rate_limit) .await?; Ok(fragment_actors .into_iter() @@ -720,6 +721,11 @@ impl MetadataManager { pub fn cluster_id(&self) -> &ClusterId { self.cluster_controller.cluster_id() } + + pub async fn list_rate_limits(&self) -> MetaResult> { + let rate_limits = self.catalog_controller.list_rate_limits().await?; + Ok(rate_limits) + } } impl MetadataManager { diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index 8ef2ec8dc4c9..5b6b7033719c 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -659,7 +659,7 @@ impl DdlController { // Meanwhile, the Dispatcher corresponding to the upstream of the merge will also be added to the replace table context here. 
pub(crate) async fn inject_replace_table_job_for_table_sink( &self, - dummy_id: u32, + tmp_id: u32, mgr: &MetadataManager, stream_ctx: StreamContext, sink: Option<&Sink>, @@ -669,13 +669,7 @@ impl DdlController { fragment_graph: StreamFragmentGraph, ) -> MetaResult<(ReplaceTableContext, TableFragments)> { let (mut replace_table_ctx, mut table_fragments) = self - .build_replace_table( - stream_ctx, - streaming_job, - fragment_graph, - None, - dummy_id as _, - ) + .build_replace_table(stream_ctx, streaming_job, fragment_graph, None, tmp_id as _) .await?; let mut union_fragment_id = None; @@ -1192,7 +1186,7 @@ impl DdlController { let table = streaming_job.table().unwrap(); tracing::debug!(id = streaming_job.id(), "replacing table for dropped sink"); - let dummy_id = self + let tmp_id = self .metadata_manager .catalog_controller .create_job_catalog_for_replace( @@ -1206,7 +1200,7 @@ impl DdlController { let (ctx, table_fragments) = self .inject_replace_table_job_for_table_sink( - dummy_id, + tmp_id, &self.metadata_manager, stream_ctx, None, @@ -1238,7 +1232,7 @@ impl DdlController { .metadata_manager .catalog_controller .finish_replace_streaming_job( - dummy_id as _, + tmp_id as _, streaming_job, merge_updates, None, @@ -1253,7 +1247,7 @@ impl DdlController { tracing::error!(id = object_id, error = ?err.as_report(), "failed to replace table"); let _ = self.metadata_manager .catalog_controller - .try_abort_replacing_streaming_job(dummy_id as _) + .try_abort_replacing_streaming_job(tmp_id as _) .await .inspect_err(|err| { tracing::error!(id = object_id, error = ?err.as_report(), "failed to abort replacing table"); @@ -1340,7 +1334,7 @@ impl DdlController { let StreamingJob::Table(_, table, ..) = &streaming_job else { unreachable!("unexpected job: {streaming_job:?}") }; - let dummy_id = self + let tmp_id = self .metadata_manager .catalog_controller .create_job_catalog_for_replace( @@ -1362,7 +1356,7 @@ impl DdlController { &streaming_job, fragment_graph, table_col_index_mapping.clone(), - dummy_id as _, + tmp_id as _, ) .await?; @@ -1437,7 +1431,7 @@ impl DdlController { .metadata_manager .catalog_controller .finish_replace_streaming_job( - dummy_id, + tmp_id, streaming_job, merge_updates, table_col_index_mapping, @@ -1452,7 +1446,7 @@ impl DdlController { tracing::error!(id = job_id, error = ?err.as_report(), "failed to replace table"); let _ = self.metadata_manager .catalog_controller - .try_abort_replacing_streaming_job(dummy_id) + .try_abort_replacing_streaming_job(tmp_id) .await.inspect_err(|err| { tracing::error!(id = job_id, error = ?err.as_report(), "failed to abort replacing table"); }); @@ -1651,7 +1645,7 @@ impl DdlController { }; let table = streaming_job.table().unwrap(); - let dummy_id = self + let tmp_id = self .metadata_manager .catalog_controller .create_job_catalog_for_replace( @@ -1665,7 +1659,7 @@ impl DdlController { let (context, table_fragments) = self .inject_replace_table_job_for_table_sink( - dummy_id, + tmp_id, &self.metadata_manager, stream_ctx, Some(s), @@ -1718,7 +1712,7 @@ impl DdlController { stream_job: &StreamingJob, mut fragment_graph: StreamFragmentGraph, table_col_index_mapping: Option, - dummy_table_id: TableId, + tmp_table_id: TableId, ) -> MetaResult<(ReplaceTableContext, TableFragments)> { let id = stream_job.id(); let expr_context = stream_ctx.to_expr_context(); @@ -1828,7 +1822,7 @@ impl DdlController { // the context that contains all information needed for building the actors on the compute // nodes. 
let table_fragments = TableFragments::new( - (dummy_table_id as u32).into(), + (tmp_table_id as u32).into(), graph, &building_locations.actor_locations, stream_ctx, @@ -1846,7 +1840,7 @@ impl DdlController { building_locations, existing_locations, streaming_job: stream_job.clone(), - dummy_id: dummy_table_id as _, + tmp_id: tmp_table_id as _, }; Ok((ctx, table_fragments)) diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index d15a73ecfa9c..2d7ab47a7c78 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -185,7 +185,7 @@ pub struct ReplaceTableContext { pub streaming_job: StreamingJob, - pub dummy_id: u32, + pub tmp_id: u32, } /// `GlobalStreamManager` manages all the streams in the system. @@ -354,11 +354,10 @@ impl GlobalStreamManager { .prepare_streaming_job(&table_fragments, &streaming_job, true) .await?; - let dummy_table_id = table_fragments.table_id(); - let init_split_assignment = - self.source_manager.allocate_splits(&dummy_table_id).await?; + let tmp_table_id = table_fragments.table_id(); + let init_split_assignment = self.source_manager.allocate_splits(&tmp_table_id).await?; - replace_table_id = Some(dummy_table_id); + replace_table_id = Some(tmp_table_id); replace_table_command = Some(ReplaceTablePlan { old_table_fragments: context.old_table_fragments, @@ -367,7 +366,7 @@ impl GlobalStreamManager { dispatchers: context.dispatchers, init_split_assignment, streaming_job, - dummy_id: dummy_table_id.table_id, + tmp_id: tmp_table_id.table_id, }); } @@ -447,8 +446,8 @@ impl GlobalStreamManager { if create_type == CreateType::Foreground || err.is_cancelled() { let mut table_ids: HashSet = HashSet::from_iter(std::iter::once(table_id)); - if let Some(dummy_table_id) = replace_table_id { - table_ids.insert(dummy_table_id); + if let Some(tmp_table_id) = replace_table_id { + table_ids.insert(tmp_table_id); } } @@ -465,13 +464,13 @@ impl GlobalStreamManager { old_table_fragments, merge_updates, dispatchers, - dummy_id, + tmp_id, streaming_job, .. }: ReplaceTableContext, ) -> MetaResult<()> { - let dummy_table_id = table_fragments.table_id(); - let init_split_assignment = self.source_manager.allocate_splits(&dummy_table_id).await?; + let tmp_table_id = table_fragments.table_id(); + let init_split_assignment = self.source_manager.allocate_splits(&tmp_table_id).await?; self.barrier_scheduler .run_config_change_command_with_pause( @@ -482,7 +481,7 @@ impl GlobalStreamManager { merge_updates, dispatchers, init_split_assignment, - dummy_id, + tmp_id, streaming_job, }), ) diff --git a/src/prost/src/lib.rs b/src/prost/src/lib.rs index 5974a0566472..a4678df09127 100644 --- a/src/prost/src/lib.rs +++ b/src/prost/src/lib.rs @@ -302,6 +302,25 @@ impl stream_plan::StreamNode { } } +impl stream_plan::FragmentTypeFlag { + /// Fragments that may be affected by `BACKFILL_RATE_LIMIT`. + pub fn backfill_rate_limit_fragments() -> i32 { + stream_plan::FragmentTypeFlag::SourceScan as i32 + | stream_plan::FragmentTypeFlag::StreamScan as i32 + } + + /// Fragments that may be affected by `SOURCE_RATE_LIMIT`. + /// Note: for `FsFetch`, old fragments don't have this flag set, so don't use this to check. + pub fn source_rate_limit_fragments() -> i32 { + stream_plan::FragmentTypeFlag::Source as i32 | stream_plan::FragmentTypeFlag::FsFetch as i32 + } + + /// Note: this doesn't include `FsFetch` created in old versions. 
+ pub fn rate_limit_fragments() -> i32 { + Self::backfill_rate_limit_fragments() | Self::source_rate_limit_fragments() + } +} + impl catalog::StreamSourceInfo { /// Refer to [`Self::cdc_source_job`] for details. pub fn is_shared(&self) -> bool { diff --git a/src/rpc_client/src/meta_client.rs b/src/rpc_client/src/meta_client.rs index be733e8d4ec1..80213d0deda6 100644 --- a/src/rpc_client/src/meta_client.rs +++ b/src/rpc_client/src/meta_client.rs @@ -25,6 +25,7 @@ use async_trait::async_trait; use cluster_limit_service_client::ClusterLimitServiceClient; use either::Either; use futures::stream::BoxStream; +use list_rate_limits_response::RateLimitInfo; use lru::LruCache; use risingwave_common::catalog::{FunctionId, IndexId, SecretId, TableId}; use risingwave_common::config::{MetaConfig, MAX_CONNECTION_WINDOW_SIZE}; @@ -1494,6 +1495,13 @@ impl MetaClient { self.inner.merge_compaction_group(req).await?; Ok(()) } + + /// List all rate limits for sources and backfills + pub async fn list_rate_limits(&self) -> Result> { + let request = ListRateLimitsRequest {}; + let resp = self.inner.list_rate_limits(request).await?; + Ok(resp.rate_limits) + } } #[async_trait] @@ -2044,6 +2052,7 @@ macro_rules! for_all_meta_rpc { ,{ stream_client, list_actor_splits, ListActorSplitsRequest, ListActorSplitsResponse } ,{ stream_client, list_object_dependencies, ListObjectDependenciesRequest, ListObjectDependenciesResponse } ,{ stream_client, recover, RecoverRequest, RecoverResponse } + ,{ stream_client, list_rate_limits, ListRateLimitsRequest, ListRateLimitsResponse } ,{ ddl_client, create_table, CreateTableRequest, CreateTableResponse } ,{ ddl_client, alter_name, AlterNameRequest, AlterNameResponse } ,{ ddl_client, alter_owner, AlterOwnerRequest, AlterOwnerResponse } diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs index 9c989db50e97..be9d4ae489f5 100644 --- a/src/sqlparser/src/ast/mod.rs +++ b/src/sqlparser/src/ast/mod.rs @@ -1141,6 +1141,7 @@ pub enum ExplainFormat { Json, Xml, Yaml, + Dot, } impl fmt::Display for ExplainFormat { @@ -1150,6 +1151,7 @@ impl fmt::Display for ExplainFormat { ExplainFormat::Json => f.write_str("JSON"), ExplainFormat::Xml => f.write_str("XML"), ExplainFormat::Yaml => f.write_str("YAML"), + ExplainFormat::Dot => f.write_str("DOT"), } } } @@ -3098,8 +3100,6 @@ pub struct CreateFunctionBody { pub return_: Option, /// USING ... 
pub using: Option, - - pub function_type: Option, } impl fmt::Display for CreateFunctionBody { @@ -3122,9 +3122,6 @@ impl fmt::Display for CreateFunctionBody { if let Some(using) = &self.using { write!(f, " {using}")?; } - if let Some(function_type) = &self.function_type { - write!(f, " {function_type}")?; - } Ok(()) } } @@ -3197,26 +3194,6 @@ impl fmt::Display for CreateFunctionUsing { } } -#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum CreateFunctionType { - Sync, - Async, - Generator, - AsyncGenerator, -} - -impl fmt::Display for CreateFunctionType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - CreateFunctionType::Sync => write!(f, "SYNC"), - CreateFunctionType::Async => write!(f, "ASYNC"), - CreateFunctionType::Generator => write!(f, "SYNC GENERATOR"), - CreateFunctionType::AsyncGenerator => write!(f, "ASYNC GENERATOR"), - } - } -} - #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum SetVariableValue { @@ -3460,12 +3437,11 @@ mod tests { returns: Some(CreateFunctionReturns::Value(DataType::Int)), params: CreateFunctionBody { language: Some(Ident::new_unchecked("python")), + runtime: None, behavior: Some(FunctionBehavior::Immutable), as_: Some(FunctionDefinition::SingleQuotedDef("SELECT 1".to_string())), return_: None, using: None, - runtime: None, - function_type: None, }, with_options: CreateFunctionWithOptions { always_retry_on_network_error: None, @@ -3483,12 +3459,11 @@ mod tests { returns: Some(CreateFunctionReturns::Value(DataType::Int)), params: CreateFunctionBody { language: Some(Ident::new_unchecked("python")), + runtime: None, behavior: Some(FunctionBehavior::Immutable), as_: Some(FunctionDefinition::SingleQuotedDef("SELECT 1".to_string())), return_: None, using: None, - runtime: None, - function_type: None, }, with_options: CreateFunctionWithOptions { always_retry_on_network_error: Some(true), @@ -3498,29 +3473,5 @@ mod tests { "CREATE FUNCTION foo(INT) RETURNS INT LANGUAGE python IMMUTABLE AS 'SELECT 1' WITH ( ALWAYS_RETRY_NETWORK_ERRORS = true )", format!("{}", create_function) ); - - let create_function = Statement::CreateFunction { - temporary: false, - or_replace: false, - name: ObjectName(vec![Ident::new_unchecked("foo")]), - args: Some(vec![OperateFunctionArg::unnamed(DataType::Int)]), - returns: Some(CreateFunctionReturns::Value(DataType::Int)), - params: CreateFunctionBody { - language: Some(Ident::new_unchecked("javascript")), - behavior: None, - as_: Some(FunctionDefinition::SingleQuotedDef("SELECT 1".to_string())), - return_: None, - using: None, - runtime: Some(Ident::new_unchecked("deno")), - function_type: Some(CreateFunctionType::AsyncGenerator), - }, - with_options: CreateFunctionWithOptions { - always_retry_on_network_error: None, - }, - }; - assert_eq!( - "CREATE FUNCTION foo(INT) RETURNS INT LANGUAGE javascript RUNTIME deno AS 'SELECT 1' ASYNC GENERATOR", - format!("{}", create_function) - ); } } diff --git a/src/sqlparser/src/keywords.rs b/src/sqlparser/src/keywords.rs index ead0bec453f0..ccc0ef23502b 100644 --- a/src/sqlparser/src/keywords.rs +++ b/src/sqlparser/src/keywords.rs @@ -90,7 +90,6 @@ define_keywords!( ASENSITIVE, ASOF, ASYMMETRIC, - ASYNC, AT, ATOMIC, AUTHORIZATION, @@ -203,6 +202,7 @@ define_keywords!( DISTRIBUTED, DISTSQL, DO, + DOT, DOUBLE, DROP, DYNAMIC, @@ -252,7 +252,6 @@ define_keywords!( FUNCTIONS, FUSION, GAP, - GENERATOR, GET, GLOBAL, GRANT, diff 
--git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index d39a2c1a8f64..cf7d8e439457 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -2365,15 +2365,6 @@ impl Parser<'_> { } else if self.parse_keyword(Keyword::USING) { ensure_not_set(&body.using, "USING")?; body.using = Some(self.parse_create_function_using()?); - } else if self.parse_keyword(Keyword::SYNC) { - ensure_not_set(&body.function_type, "SYNC | ASYNC")?; - body.function_type = Some(self.parse_function_type(false, false)?); - } else if self.parse_keyword(Keyword::ASYNC) { - ensure_not_set(&body.function_type, "SYNC | ASYNC")?; - body.function_type = Some(self.parse_function_type(true, false)?); - } else if self.parse_keyword(Keyword::GENERATOR) { - ensure_not_set(&body.function_type, "SYNC | ASYNC")?; - body.function_type = Some(self.parse_function_type(false, true)?); } else { return Ok(body); } @@ -2396,25 +2387,6 @@ impl Parser<'_> { } } - fn parse_function_type( - &mut self, - is_async: bool, - is_generator: bool, - ) -> PResult { - let is_generator = if is_generator { - true - } else { - self.parse_keyword(Keyword::GENERATOR) - }; - - match (is_async, is_generator) { - (false, false) => Ok(CreateFunctionType::Sync), - (true, false) => Ok(CreateFunctionType::Async), - (false, true) => Ok(CreateFunctionType::Generator), - (true, true) => Ok(CreateFunctionType::AsyncGenerator), - } - } - // CREATE USER name [ [ WITH ] option [ ... ] ] // where option can be: // SUPERUSER | NOSUPERUSER @@ -3490,7 +3462,7 @@ impl Parser<'_> { } else if let Some(rate_limit) = self.parse_alter_source_rate_limit(false)? { AlterSourceOperation::SetSourceRateLimit { rate_limit } } else { - return self.expected("SCHEMA after SET"); + return self.expected("SCHEMA or SOURCE_RATE_LIMIT after SET"); } } else if self.peek_nth_any_of_keywords(0, &[Keyword::FORMAT]) { let format_encode = self.parse_schema()?.unwrap(); @@ -4067,11 +4039,13 @@ impl Parser<'_> { Keyword::JSON, Keyword::XML, Keyword::YAML, + Keyword::DOT, ])? 
{ Keyword::TEXT => ExplainFormat::Text, Keyword::JSON => ExplainFormat::Json, Keyword::XML => ExplainFormat::Xml, Keyword::YAML => ExplainFormat::Yaml, + Keyword::DOT => ExplainFormat::Dot, _ => unreachable!("{}", keyword), } } diff --git a/src/sqlparser/tests/sqlparser_postgres.rs b/src/sqlparser/tests/sqlparser_postgres.rs index 311b2ba213c4..549920d1c758 100644 --- a/src/sqlparser/tests/sqlparser_postgres.rs +++ b/src/sqlparser/tests/sqlparser_postgres.rs @@ -874,31 +874,6 @@ fn parse_create_function() { with_options: Default::default(), } ); - - let sql = "CREATE FUNCTION add(INT, INT) RETURNS INT LANGUAGE SQL IMMUTABLE AS 'select $1 + $2;' ASYNC"; - assert_eq!( - verified_stmt(sql), - Statement::CreateFunction { - or_replace: false, - temporary: false, - name: ObjectName(vec![Ident::new_unchecked("add")]), - args: Some(vec![ - OperateFunctionArg::unnamed(DataType::Int), - OperateFunctionArg::unnamed(DataType::Int), - ]), - returns: Some(CreateFunctionReturns::Value(DataType::Int)), - params: CreateFunctionBody { - language: Some("SQL".into()), - behavior: Some(FunctionBehavior::Immutable), - as_: Some(FunctionDefinition::SingleQuotedDef( - "select $1 + $2;".into() - )), - function_type: Some(CreateFunctionType::Async), - ..Default::default() - }, - with_options: Default::default(), - } - ); } #[test] diff --git a/src/storage/backup/src/lib.rs b/src/storage/backup/src/lib.rs index 37fa28f4fe26..847efba5f26b 100644 --- a/src/storage/backup/src/lib.rs +++ b/src/storage/backup/src/lib.rs @@ -34,7 +34,6 @@ use std::collections::{HashMap, HashSet}; use std::hash::Hasher; use itertools::Itertools; -use risingwave_common::catalog::TableId; use risingwave_common::RW_VERSION; use risingwave_hummock_sdk::state_table_info::StateTableInfo; use risingwave_hummock_sdk::version::HummockVersion; @@ -56,8 +55,8 @@ pub struct MetaSnapshotMetadata { #[serde(default)] pub format_version: u32, pub remarks: Option, - #[serde(default, with = "table_id_key_map")] - pub state_table_info: HashMap, + #[serde(default)] + pub state_table_info: HashMap, pub rw_version: Option, } @@ -78,7 +77,7 @@ impl MetaSnapshotMetadata { .state_table_info .info() .iter() - .map(|(id, info)| (*id, info.into())) + .map(|(id, info)| (id.table_id, info.into())) .collect(), rw_version: Some(RW_VERSION.to_owned()), } @@ -119,7 +118,7 @@ impl From<&MetaSnapshotMetadata> for PbMetaSnapshotMetadata { state_table_info: m .state_table_info .iter() - .map(|(t, i)| (t.table_id, i.into())) + .map(|(t, i)| (*t, i.into())) .collect(), rw_version: m.rw_version.clone(), } @@ -134,42 +133,3 @@ impl From<&MetaSnapshotManifest> for PbMetaSnapshotManifest { } } } - -mod table_id_key_map { - use std::collections::HashMap; - use std::str::FromStr; - - use risingwave_common::catalog::TableId; - use serde::{Deserialize, Deserializer, Serialize, Serializer}; - - use crate::StateTableInfo; - - pub fn serialize( - map: &HashMap, - serializer: S, - ) -> Result - where - S: Serializer, - { - let map_as_str: HashMap = - map.iter().map(|(k, v)| (k.to_string(), v)).collect(); - map_as_str.serialize(serializer) - } - - pub fn deserialize<'de, D>( - deserializer: D, - ) -> Result, D::Error> - where - D: Deserializer<'de>, - { - let map_as_str: HashMap = - HashMap::deserialize(deserializer).unwrap_or_else(|_| HashMap::new()); - map_as_str - .into_iter() - .map(|(k, v)| { - let key = u32::from_str(&k).map_err(serde::de::Error::custom)?; - Ok((TableId::new(key), v)) - }) - .collect() - } -} diff --git a/src/storage/compactor/src/server.rs 
b/src/storage/compactor/src/server.rs index bc787e5b18ca..ded76b2bdd94 100644 --- a/src/storage/compactor/src/server.rs +++ b/src/storage/compactor/src/server.rs @@ -362,7 +362,9 @@ pub async fn shared_compactor_serve( await_tree_reg, }; - risingwave_storage::hummock::compactor::start_shared_compactor( + // TODO(shutdown): don't collect these handles, since there's no need to gracefully shut them down. + // Hold the join handle and tx to keep the compactor running. + let _compactor_handle = risingwave_storage::hummock::compactor::start_shared_compactor( grpc_proxy_client, receiver, compactor_context, diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index 7a1f0851d3d4..ac9521b0f527 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -31,12 +31,13 @@ use super::{group_split, StateTableId}; use crate::change_log::{ChangeLogDeltaCommon, TableChangeLogCommon}; use crate::compaction_group::StaticCompactionGroupId; use crate::key_range::KeyRangeCommon; -use crate::level::{Level, Levels, OverlappingLevel}; +use crate::level::{Level, LevelCommon, Levels, OverlappingLevel}; use crate::sstable_info::SstableInfo; use crate::table_watermark::{ReadTableWatermark, TableWatermarks}; use crate::version::{ GroupDelta, GroupDeltaCommon, HummockVersion, HummockVersionCommon, HummockVersionDelta, - HummockVersionStateTableInfo, IntraLevelDelta, IntraLevelDeltaCommon, + HummockVersionStateTableInfo, IntraLevelDelta, IntraLevelDeltaCommon, ObjectIdReader, + SstableIdReader, }; use crate::{can_concat, CompactionGroupId, HummockSstableId, HummockSstableObjectId}; #[derive(Debug, Clone, Default)] @@ -64,33 +65,6 @@ impl HummockVersion { .unwrap_or_else(|| panic!("compaction group {} does not exist", compaction_group_id)) } - pub fn get_combined_levels(&self) -> impl Iterator + '_ { - self.levels - .values() - .flat_map(|level| level.l0.sub_levels.iter().rev().chain(level.levels.iter())) - } - - pub fn get_object_ids(&self) -> HashSet { - self.get_sst_infos().map(|s| s.object_id).collect() - } - - pub fn get_sst_ids(&self) -> HashSet { - self.get_sst_infos().map(|s| s.sst_id).collect() - } - - pub fn get_sst_infos(&self) -> impl Iterator { - self.get_combined_levels() - .flat_map(|level| level.table_infos.iter()) - .chain(self.table_change_log.values().flat_map(|change_log| { - change_log.0.iter().flat_map(|epoch_change_log| { - epoch_change_log - .old_value - .iter() - .chain(epoch_change_log.new_value.iter()) - }) - })) - } - // only scan the sst infos from levels in the specified compaction group (without table change log) pub fn get_sst_ids_by_group_id( &self, @@ -859,9 +833,7 @@ impl HummockVersion { group_split::merge_levels(left_levels, right_levels); } -} -impl HummockVersionCommon { pub fn init_with_parent_group_v2( &mut self, parent_group_id: CompactionGroupId, @@ -993,6 +965,38 @@ impl HummockVersionCommon { } } +impl HummockVersionCommon +where + T: SstableIdReader + ObjectIdReader, +{ + pub fn get_combined_levels(&self) -> impl Iterator> + '_ { + self.levels + .values() + .flat_map(|level| level.l0.sub_levels.iter().rev().chain(level.levels.iter())) + } + + pub fn get_object_ids(&self) -> HashSet { + self.get_sst_infos().map(|s| s.object_id()).collect() + } + + pub fn get_sst_ids(&self) -> HashSet { + self.get_sst_infos().map(|s| s.sst_id()).collect() + } + + pub fn get_sst_infos(&self) -> impl Iterator {
self.get_combined_levels() + .flat_map(|level| level.table_infos.iter()) + .chain(self.table_change_log.values().flat_map(|change_log| { + change_log.0.iter().flat_map(|epoch_change_log| { + epoch_change_log + .old_value + .iter() + .chain(epoch_change_log.new_value.iter()) + }) + })) + } +} + impl Levels { pub(crate) fn apply_compact_ssts( &mut self, diff --git a/src/storage/hummock_sdk/src/sstable_info.rs b/src/storage/hummock_sdk/src/sstable_info.rs index 22a3cd8f31fc..10afe52ab9ad 100644 --- a/src/storage/hummock_sdk/src/sstable_info.rs +++ b/src/storage/hummock_sdk/src/sstable_info.rs @@ -17,6 +17,8 @@ use std::mem::size_of; use risingwave_pb::hummock::{PbBloomFilterType, PbKeyRange, PbSstableInfo}; use crate::key_range::KeyRange; +use crate::version::{ObjectIdReader, SstableIdReader}; +use crate::{HummockSstableId, HummockSstableObjectId}; #[derive(Debug, PartialEq, Clone, Default)] pub struct SstableInfo { @@ -216,3 +218,15 @@ impl SstableInfo { self.key_range = KeyRange::default(); } } + +impl SstableIdReader for SstableInfo { + fn sst_id(&self) -> HummockSstableId { + self.sst_id + } +} + +impl ObjectIdReader for SstableInfo { + fn object_id(&self) -> HummockSstableObjectId { + self.object_id + } +} diff --git a/src/storage/hummock_sdk/src/state_table_info.rs b/src/storage/hummock_sdk/src/state_table_info.rs index b15919fb2b06..3a0978b4f8f5 100644 --- a/src/storage/hummock_sdk/src/state_table_info.rs +++ b/src/storage/hummock_sdk/src/state_table_info.rs @@ -17,7 +17,9 @@ use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Clone)] pub struct StateTableInfo { + #[serde(default)] pub committed_epoch: u64, + #[serde(default)] pub compaction_group_id: u64, } diff --git a/src/storage/hummock_sdk/src/time_travel.rs b/src/storage/hummock_sdk/src/time_travel.rs index 3a58a7daa760..4f2508a5772b 100644 --- a/src/storage/hummock_sdk/src/time_travel.rs +++ b/src/storage/hummock_sdk/src/time_travel.rs @@ -23,8 +23,9 @@ use crate::level::Level; use crate::sstable_info::SstableInfo; use crate::version::{ HummockVersion, HummockVersionCommon, HummockVersionDelta, HummockVersionDeltaCommon, + ObjectIdReader, SstableIdReader, }; -use crate::{CompactionGroupId, HummockSstableId}; +use crate::{CompactionGroupId, HummockSstableId, HummockSstableObjectId}; pub type IncompleteHummockVersion = HummockVersionCommon; @@ -167,12 +168,28 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHum } } -pub struct SstableIdInVersion(HummockSstableId); +pub struct SstableIdInVersion { + sst_id: HummockSstableId, + object_id: HummockSstableObjectId, +} + +impl SstableIdReader for SstableIdInVersion { + fn sst_id(&self) -> HummockSstableId { + self.sst_id + } +} + +impl ObjectIdReader for SstableIdInVersion { + fn object_id(&self) -> HummockSstableObjectId { + self.object_id + } +} impl From<&SstableIdInVersion> for PbSstableInfo { fn from(sst_id: &SstableIdInVersion) -> Self { Self { - sst_id: sst_id.0, + sst_id: sst_id.sst_id, + object_id: sst_id.object_id, ..Default::default() } } @@ -185,8 +202,11 @@ impl From for PbSstableInfo { } impl From<&PbSstableInfo> for SstableIdInVersion { - fn from(value: &PbSstableInfo) -> Self { - SstableIdInVersion(value.sst_id) + fn from(s: &PbSstableInfo) -> Self { + SstableIdInVersion { + sst_id: s.sst_id, + object_id: s.object_id, + } } } diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index 80276b09ffdf..6266ee84474b 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ 
b/src/storage/hummock_sdk/src/version.rs @@ -37,7 +37,8 @@ use crate::level::LevelsCommon; use crate::sstable_info::SstableInfo; use crate::table_watermark::TableWatermarks; use crate::{ - CompactionGroupId, HummockEpoch, HummockSstableObjectId, HummockVersionId, FIRST_VERSION_ID, + CompactionGroupId, HummockEpoch, HummockSstableId, HummockSstableObjectId, HummockVersionId, + FIRST_VERSION_ID, }; #[derive(Debug, Clone, PartialEq)] @@ -506,27 +507,38 @@ where } } -impl HummockVersionDelta { +pub trait SstableIdReader { + fn sst_id(&self) -> HummockSstableId; +} + +pub trait ObjectIdReader { + fn object_id(&self) -> HummockSstableObjectId; +} + +impl HummockVersionDeltaCommon +where + T: SstableIdReader + ObjectIdReader, +{ /// Get the newly added object ids from the version delta. /// /// Note: the result can be false positive because we only collect the set of sst object ids in the `inserted_table_infos`, /// but it is possible that the object is moved or split from other compaction groups or levels. pub fn newly_added_object_ids(&self) -> HashSet { self.newly_added_sst_infos(None) - .map(|sst| sst.object_id) + .map(|sst| sst.object_id()) .collect() } pub fn newly_added_sst_ids(&self) -> HashSet { self.newly_added_sst_infos(None) - .map(|sst| sst.sst_id) + .map(|sst| sst.sst_id()) .collect() } pub fn newly_added_sst_infos<'a>( &'a self, select_group: Option<&'a HashSet>, - ) -> impl Iterator + 'a { + ) -> impl Iterator + 'a { self.group_deltas .iter() .filter_map(move |(cg_id, group_deltas)| { @@ -559,7 +571,9 @@ impl HummockVersionDelta { new_log.new_value.iter().chain(new_log.old_value.iter()) })) } +} +impl HummockVersionDelta { #[expect(deprecated)] pub fn max_committed_epoch_for_migration(&self) -> HummockEpoch { self.max_committed_epoch diff --git a/src/storage/hummock_test/src/bin/replay/replay_impl.rs b/src/storage/hummock_test/src/bin/replay/replay_impl.rs index 6653db94e054..8b264b6fb186 100644 --- a/src/storage/hummock_test/src/bin/replay/replay_impl.rs +++ b/src/storage/hummock_test/src/bin/replay/replay_impl.rs @@ -138,10 +138,17 @@ impl ReplayRead for GlobalReplayImpl { #[async_trait::async_trait] impl ReplayStateStore for GlobalReplayImpl { - async fn sync(&self, id: u64, table_ids: Vec) -> Result { + async fn sync(&self, sync_table_epochs: Vec<(u64, Vec)>) -> Result { let result: SyncResult = self .store - .sync(id, table_ids.into_iter().map(TableId::new).collect()) + .sync( + sync_table_epochs + .into_iter() + .map(|(epoch, table_ids)| { + (epoch, table_ids.into_iter().map(TableId::new).collect()) + }) + .collect(), + ) .await .map_err(|e| TraceError::SyncFailed(format!("{e}")))?; Ok(result.sync_size) diff --git a/src/storage/hummock_trace/src/collector.rs b/src/storage/hummock_trace/src/collector.rs index 068cbdcee45e..a3962c1bf883 100644 --- a/src/storage/hummock_trace/src/collector.rs +++ b/src/storage/hummock_trace/src/collector.rs @@ -25,7 +25,7 @@ use bincode::{Decode, Encode}; use bytes::Bytes; use parking_lot::Mutex; use risingwave_common::catalog::TableId; -use risingwave_hummock_sdk::HummockReadEpoch; +use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch}; use risingwave_pb::meta::SubscribeResponse; use tokio::runtime::Runtime; use tokio::sync::mpsc::{ @@ -281,14 +281,20 @@ impl TraceSpan { } pub fn new_sync_span( - epoch: u64, - table_ids: &HashSet, + sync_table_epochs: &Vec<(HummockEpoch, HashSet)>, storage_type: StorageType, ) -> MayTraceSpan { Self::new_global_op( Operation::Sync( - epoch, - table_ids.iter().map(|table_id| table_id.table_id).collect(), 
+ sync_table_epochs + .iter() + .map(|(epoch, table_ids)| { + ( + *epoch, + table_ids.iter().map(|table_id| table_id.table_id).collect(), + ) + }) + .collect(), ), storage_type, ) diff --git a/src/storage/hummock_trace/src/record.rs b/src/storage/hummock_trace/src/record.rs index a9ae562f02b4..e740ce3158ba 100644 --- a/src/storage/hummock_trace/src/record.rs +++ b/src/storage/hummock_trace/src/record.rs @@ -146,7 +146,7 @@ pub enum Operation { IterNext(RecordId), /// Sync operation of Hummock. - Sync(u64, Vec), + Sync(Vec<(u64, Vec)>), /// `MetaMessage` operation of Hummock. MetaMessage(Box), diff --git a/src/storage/hummock_trace/src/replay/mod.rs b/src/storage/hummock_trace/src/replay/mod.rs index 347ef3070457..d5262a69f762 100644 --- a/src/storage/hummock_trace/src/replay/mod.rs +++ b/src/storage/hummock_trace/src/replay/mod.rs @@ -115,7 +115,7 @@ pub trait ReplayWrite { #[cfg_attr(test, automock)] #[async_trait::async_trait] pub trait ReplayStateStore { - async fn sync(&self, id: u64, table_ids: Vec) -> Result; + async fn sync(&self, sync_table_epochs: Vec<(u64, Vec)>) -> Result; async fn notify_hummock(&self, info: Info, op: RespOperation, version: u64) -> Result; async fn new_local(&self, opts: TracedNewLocalOptions) -> Box; async fn try_wait_epoch( @@ -147,7 +147,7 @@ mock! { } #[async_trait::async_trait] impl ReplayStateStore for GlobalReplayInterface{ - async fn sync(&self, id: u64, table_ids: Vec) -> Result; + async fn sync(&self, sync_table_epochs: Vec<(u64, Vec)>) -> Result; async fn notify_hummock(&self, info: Info, op: RespOperation, version: u64, ) -> Result; async fn new_local(&self, opts: TracedNewLocalOptions) -> Box; diff --git a/src/storage/hummock_trace/src/replay/runner.rs b/src/storage/hummock_trace/src/replay/runner.rs index 3794671ace2a..911e1f5ddfd8 100644 --- a/src/storage/hummock_trace/src/replay/runner.rs +++ b/src/storage/hummock_trace/src/replay/runner.rs @@ -196,7 +196,7 @@ mod tests { let mut non_local: Vec> = vec![ (12, Operation::Finish), - (13, Operation::Sync(sync_id, vec![1, 2, 3])), + (13, Operation::Sync(vec![(sync_id, vec![1, 2, 3])])), ( 13, Operation::Result(OperationResult::Sync(TraceResult::Ok(0))), @@ -244,9 +244,9 @@ mod tests { mock_replay .expect_sync() - .with(predicate::eq(sync_id), predicate::eq(vec![1, 2, 3])) + .with(predicate::eq(vec![(sync_id, vec![1, 2, 3])])) .times(1) - .returning(|_, _| Ok(0)); + .returning(|_| Ok(0)); let mut replay = HummockReplay::new(mock_reader, mock_replay); diff --git a/src/storage/hummock_trace/src/replay/worker.rs b/src/storage/hummock_trace/src/replay/worker.rs index 08d877cadf3a..65a9faf4d812 100644 --- a/src/storage/hummock_trace/src/replay/worker.rs +++ b/src/storage/hummock_trace/src/replay/worker.rs @@ -257,9 +257,9 @@ impl ReplayWorker { panic!("expect iter result, but got {:?}", res); } } - Operation::Sync(epoch_id, table_ids) => { + Operation::Sync(sync_table_epochs) => { assert_eq!(storage_type, StorageType::Global); - let sync_result = replay.sync(epoch_id, table_ids).await.unwrap(); + let sync_result = replay.sync(sync_table_epochs).await.unwrap(); let res = res_rx.recv().await.expect("recv result failed"); if let OperationResult::Sync(expected) = res { assert_eq!(TraceResult::Ok(sync_result), expected, "sync failed"); diff --git a/src/storage/src/hummock/backup_reader.rs b/src/storage/src/hummock/backup_reader.rs index 7fa2ee9dd905..a09e2ca2bed7 100644 --- a/src/storage/src/hummock/backup_reader.rs +++ b/src/storage/src/hummock/backup_reader.rs @@ -195,7 +195,7 @@ impl BackupReader { 
.snapshot_metadata .iter() .find(|v| { - if let Some(m) = v.state_table_info.get(&table_id) { + if let Some(m) = v.state_table_info.get(&table_id.table_id()) { return epoch == m.committed_epoch; } false diff --git a/src/storage/src/hummock/compactor/mod.rs b/src/storage/src/hummock/compactor/mod.rs index e687b60e0e65..996ff1c0de59 100644 --- a/src/storage/src/hummock/compactor/mod.rs +++ b/src/storage/src/hummock/compactor/mod.rs @@ -276,6 +276,7 @@ impl Compactor { /// The background compaction thread that receives compaction tasks from hummock compaction /// manager and runs compaction tasks. #[cfg_attr(coverage, coverage(off))] +#[must_use] pub fn start_compactor( compactor_context: CompactorContext, hummock_meta_client: Arc, @@ -611,6 +612,7 @@ pub fn start_compactor( /// The background compaction thread that receives compaction tasks from hummock compaction /// manager and runs compaction tasks. #[cfg_attr(coverage, coverage(off))] +#[must_use] pub fn start_shared_compactor( grpc_proxy_client: GrpcCompactorProxyClient, mut receiver: mpsc::UnboundedReceiver>, diff --git a/src/storage/src/hummock/event_handler/hummock_event_handler.rs b/src/storage/src/hummock/event_handler/hummock_event_handler.rs index 0b8cadaf4c97..a2da11bee715 100644 --- a/src/storage/src/hummock/event_handler/hummock_event_handler.rs +++ b/src/storage/src/hummock/event_handler/hummock_event_handler.rs @@ -463,17 +463,12 @@ impl HummockEventHandler { fn handle_sync_epoch( &mut self, - new_sync_epoch: HummockEpoch, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, sync_result_sender: oneshot::Sender>, - table_ids: HashSet, ) { - debug!( - new_sync_epoch, - ?table_ids, - "awaiting for epoch to be synced", - ); + debug!(?sync_table_epochs, "awaiting for epoch to be synced",); self.uploader - .start_sync_epoch(new_sync_epoch, sync_result_sender, table_ids); + .start_sync_epoch(sync_result_sender, sync_table_epochs); } fn handle_clear(&mut self, notifier: oneshot::Sender<()>, table_ids: Option>) { @@ -641,11 +636,10 @@ impl HummockEventHandler { self.uploader.may_flush(); } HummockEvent::SyncEpoch { - new_sync_epoch, sync_result_sender, - table_ids, + sync_table_epochs, } => { - self.handle_sync_epoch(new_sync_epoch, sync_result_sender, table_ids); + self.handle_sync_epoch(sync_table_epochs, sync_result_sender); } HummockEvent::Clear(notifier, table_ids) => { self.handle_clear(notifier, table_ids); @@ -1013,16 +1007,14 @@ mod tests { let (tx1, mut rx1) = oneshot::channel(); send_event(HummockEvent::SyncEpoch { - new_sync_epoch: epoch1, sync_result_sender: tx1, - table_ids: HashSet::from_iter([TEST_TABLE_ID]), + sync_table_epochs: vec![(epoch1, HashSet::from_iter([TEST_TABLE_ID]))], }); assert!(poll_fn(|cx| Poll::Ready(rx1.poll_unpin(cx).is_pending())).await); let (tx2, mut rx2) = oneshot::channel(); send_event(HummockEvent::SyncEpoch { - new_sync_epoch: epoch2, sync_result_sender: tx2, - table_ids: HashSet::from_iter([TEST_TABLE_ID]), + sync_table_epochs: vec![(epoch2, HashSet::from_iter([TEST_TABLE_ID]))], }); assert!(poll_fn(|cx| Poll::Ready(rx2.poll_unpin(cx).is_pending())).await); @@ -1144,9 +1136,8 @@ mod tests { let sync_epoch = |table_id, new_sync_epoch| { let (tx, rx) = oneshot::channel(); send_event(HummockEvent::SyncEpoch { - new_sync_epoch, sync_result_sender: tx, - table_ids: HashSet::from_iter([table_id]), + sync_table_epochs: vec![(new_sync_epoch, HashSet::from_iter([table_id]))], }); rx }; @@ -1281,9 +1272,8 @@ mod tests { vec![imm1_2_2.batch_id()], )])); send_event(HummockEvent::SyncEpoch { - 
new_sync_epoch: epoch2, sync_result_sender: tx2, - table_ids: HashSet::from_iter([table_id1]), + sync_table_epochs: vec![(epoch2, HashSet::from_iter([table_id1]))], }); wait_task_start.await; assert!(poll_fn(|cx| Poll::Ready(sync_rx2.poll_unpin(cx).is_pending())).await); diff --git a/src/storage/src/hummock/event_handler/mod.rs b/src/storage/src/hummock/event_handler/mod.rs index 46b44c051fdf..a5253c7ac656 100644 --- a/src/storage/src/hummock/event_handler/mod.rs +++ b/src/storage/src/hummock/event_handler/mod.rs @@ -59,9 +59,8 @@ pub enum HummockEvent { /// task on this epoch. Previous concurrent flush task join handle will be returned by the join /// handle sender. SyncEpoch { - new_sync_epoch: HummockEpoch, sync_result_sender: oneshot::Sender>, - table_ids: HashSet, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, }, /// Clear shared buffer and reset all states @@ -117,10 +116,9 @@ impl HummockEvent { HummockEvent::BufferMayFlush => "BufferMayFlush".to_string(), HummockEvent::SyncEpoch { - new_sync_epoch, sync_result_sender: _, - table_ids, - } => format!("AwaitSyncEpoch epoch {} {:?}", new_sync_epoch, table_ids), + sync_table_epochs, + } => format!("AwaitSyncEpoch epoch {:?}", sync_table_epochs), HummockEvent::Clear(_, table_ids) => { format!("Clear {:?}", table_ids) diff --git a/src/storage/src/hummock/event_handler/uploader/mod.rs b/src/storage/src/hummock/event_handler/uploader/mod.rs index 96b565c00ef4..3160c4526e00 100644 --- a/src/storage/src/hummock/event_handler/uploader/mod.rs +++ b/src/storage/src/hummock/event_handler/uploader/mod.rs @@ -945,60 +945,72 @@ impl UnsyncData { impl UploaderData { fn sync( &mut self, - epoch: HummockEpoch, context: &UploaderContext, - table_ids: HashSet, sync_result_sender: oneshot::Sender>, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, ) { let mut all_table_watermarks = HashMap::new(); let mut uploading_tasks = HashSet::new(); let mut spilled_tasks = BTreeSet::new(); + let mut all_table_ids = HashSet::new(); let mut flush_payload = HashMap::new(); - if let Some(UnsyncEpochId(_, min_table_id)) = get_unsync_epoch_id(epoch, &table_ids) { - let min_table_id_data = self - .unsync_data - .table_data - .get_mut(&min_table_id) - .expect("should exist"); - let epochs = take_before_epoch(&mut min_table_id_data.unsync_epochs.clone(), epoch); - for epoch in epochs.keys() { - assert_eq!( - self.unsync_data - .unsync_epochs - .remove(&UnsyncEpochId(*epoch, min_table_id)) - .expect("should exist"), - table_ids + for (epoch, table_ids) in &sync_table_epochs { + let epoch = *epoch; + for table_id in table_ids { + assert!( + all_table_ids.insert(*table_id), + "duplicate sync table epoch: {:?} {:?}", + all_table_ids, + sync_table_epochs ); } - for table_id in &table_ids { - let table_data = self + if let Some(UnsyncEpochId(_, min_table_id)) = get_unsync_epoch_id(epoch, table_ids) { + let min_table_id_data = self .unsync_data .table_data - .get_mut(table_id) + .get_mut(&min_table_id) .expect("should exist"); - let (unflushed_payload, table_watermarks, task_ids, table_unsync_epochs) = - table_data.sync(epoch); - assert_eq!(table_unsync_epochs, epochs); - for (instance_id, payload) in unflushed_payload { - if !payload.is_empty() { - flush_payload.insert(instance_id, payload); - } - } - if let Some((direction, watermarks)) = table_watermarks { - Self::add_table_watermarks( - &mut all_table_watermarks, - *table_id, - direction, - watermarks, + let epochs = take_before_epoch(&mut min_table_id_data.unsync_epochs.clone(), epoch); + for epoch in epochs.keys() { + 
assert_eq!( + &self + .unsync_data + .unsync_epochs + .remove(&UnsyncEpochId(*epoch, min_table_id)) + .expect("should exist"), + table_ids ); } - for task_id in task_ids { - if self.unsync_data.spilled_data.contains_key(&task_id) { - spilled_tasks.insert(task_id); - } else { - uploading_tasks.insert(task_id); + for table_id in table_ids { + let table_data = self + .unsync_data + .table_data + .get_mut(table_id) + .expect("should exist"); + let (unflushed_payload, table_watermarks, task_ids, table_unsync_epochs) = + table_data.sync(epoch); + assert_eq!(table_unsync_epochs, epochs); + for (instance_id, payload) in unflushed_payload { + if !payload.is_empty() { + flush_payload.insert(instance_id, payload); + } + } + if let Some((direction, watermarks)) = table_watermarks { + Self::add_table_watermarks( + &mut all_table_watermarks, + *table_id, + direction, + watermarks, + ); + } + for task_id in task_ids { + if self.unsync_data.spilled_data.contains_key(&task_id) { + spilled_tasks.insert(task_id); + } else { + uploading_tasks.insert(task_id); + } } } } @@ -1015,7 +1027,7 @@ impl UploaderData { sync_id, flush_payload, uploading_tasks.iter().cloned(), - &table_ids, + &all_table_ids, ) { uploading_tasks.insert(extra_flush_task_id); } @@ -1031,10 +1043,10 @@ impl UploaderData { .remove(task_id) .expect("should exist"); assert!( - spill_table_ids.is_subset(&table_ids), + spill_table_ids.is_subset(&all_table_ids), "spilled tabled ids {:?} not a subset of sync table id {:?}", spill_table_ids, - table_ids + all_table_ids ); sst }) @@ -1043,8 +1055,7 @@ impl UploaderData { self.syncing_data.insert( sync_id, SyncingData { - sync_epoch: epoch, - table_ids, + sync_table_epochs, remaining_uploading_tasks: uploading_tasks, uploaded, table_watermarks: all_table_watermarks, @@ -1068,8 +1079,7 @@ impl UnsyncData { } struct SyncingData { - sync_epoch: HummockEpoch, - table_ids: HashSet, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, remaining_uploading_tasks: HashSet, // newer data at the front uploaded: VecDeque>, @@ -1136,8 +1146,15 @@ impl UploaderData { self.unsync_data .clear_tables(&table_ids, &mut self.task_manager); self.syncing_data.retain(|sync_id, syncing_data| { - if !syncing_data.table_ids.is_disjoint(&table_ids) { - assert!(syncing_data.table_ids.is_subset(&table_ids)); + if syncing_data + .sync_table_epochs + .iter() + .any(|(_, sync_table_ids)| !sync_table_ids.is_disjoint(&table_ids)) + { + assert!(syncing_data + .sync_table_epochs + .iter() + .all(|(_, sync_table_ids)| sync_table_ids.is_subset(&table_ids))); for task_id in &syncing_data.remaining_uploading_tasks { match self .task_manager @@ -1179,7 +1196,7 @@ impl UploaderData { } struct ErrState { - failed_epoch: HummockEpoch, + failed_sync_table_epochs: Vec<(HummockEpoch, HashSet)>, reason: String, } @@ -1295,27 +1312,26 @@ impl HummockUploader { pub(super) fn start_sync_epoch( &mut self, - epoch: HummockEpoch, sync_result_sender: oneshot::Sender>, - table_ids: HashSet, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, ) { let data = match &mut self.state { UploaderState::Working(data) => data, UploaderState::Err(ErrState { - failed_epoch, + failed_sync_table_epochs, reason, }) => { let result = Err(HummockError::other(format!( - "previous epoch {} failed due to [{}]", - failed_epoch, reason + "previous sync epoch {:?} failed due to [{}]", + failed_sync_table_epochs, reason ))); send_sync_result(sync_result_sender, result); return; } }; - debug!(epoch, ?table_ids, "start sync epoch"); + debug!(?sync_table_epochs, "start sync epoch"); - 
data.sync(epoch, &self.context, table_ids, sync_result_sender); + data.sync(&self.context, sync_result_sender, sync_table_epochs); data.may_notify_sync_task(&self.context); @@ -1438,8 +1454,7 @@ impl UploaderData { { let (_, syncing_data) = self.syncing_data.pop_first().expect("non-empty"); let SyncingData { - sync_epoch, - table_ids, + sync_table_epochs, remaining_uploading_tasks: _, uploaded, table_watermarks, @@ -1450,11 +1465,13 @@ impl UploaderData { .uploader_syncing_epoch_count .set(self.syncing_data.len() as _); - for table_id in table_ids { - if let Some(table_data) = self.unsync_data.table_data.get_mut(&table_id) { - table_data.ack_synced(sync_epoch); - if table_data.is_empty() { - self.unsync_data.table_data.remove(&table_id); + for (sync_epoch, table_ids) in sync_table_epochs { + for table_id in table_ids { + if let Some(table_data) = self.unsync_data.table_data.get_mut(&table_id) { + table_data.ack_synced(sync_epoch); + if table_data.is_empty() { + self.unsync_data.table_data.remove(&table_id); + } } } } @@ -1560,11 +1577,11 @@ impl HummockUploader { Err((sync_id, e)) => { let syncing_data = data.syncing_data.remove(&sync_id).expect("should exist"); - let failed_epoch = syncing_data.sync_epoch; + let failed_epochs = syncing_data.sync_table_epochs.clone(); let data = must_match!(replace( &mut self.state, UploaderState::Err(ErrState { - failed_epoch, + failed_sync_table_epochs: syncing_data.sync_table_epochs, reason: e.as_report().to_string(), }), ), UploaderState::Working(data) => data); @@ -1578,8 +1595,8 @@ impl HummockUploader { data.abort(|| { HummockError::other(format!( - "previous epoch {} failed to sync", - failed_epoch + "previous epoch {:?} failed to sync", + failed_epochs )) }); Poll::Pending @@ -1604,16 +1621,30 @@ pub(crate) mod tests { use std::task::Poll; use futures::FutureExt; + use risingwave_common::catalog::TableId; use risingwave_common::util::epoch::EpochExt; use risingwave_hummock_sdk::HummockEpoch; use tokio::sync::oneshot; use super::test_utils::*; - use crate::hummock::event_handler::uploader::{get_payload_imm_ids, SyncedData, UploadingTask}; + use crate::hummock::event_handler::uploader::{ + get_payload_imm_ids, HummockUploader, SyncedData, UploadingTask, + }; use crate::hummock::event_handler::TEST_LOCAL_INSTANCE_ID; - use crate::hummock::HummockError; + use crate::hummock::{HummockError, HummockResult}; use crate::opts::StorageOpts; + impl HummockUploader { + pub(super) fn start_single_epoch_sync( + &mut self, + epoch: HummockEpoch, + sync_result_sender: oneshot::Sender>, + table_ids: HashSet, + ) { + self.start_sync_epoch(sync_result_sender, vec![(epoch, table_ids)]); + } + } + #[tokio::test] pub async fn test_uploading_task_future() { let uploader_context = test_uploader_context(dummy_success_upload_future); @@ -1696,11 +1727,11 @@ pub(crate) mod tests { uploader.local_seal_epoch_for_test(TEST_LOCAL_INSTANCE_ID, epoch1); let (sync_tx, sync_rx) = oneshot::channel(); - uploader.start_sync_epoch(epoch1, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch1, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); assert_eq!(epoch1 as HummockEpoch, uploader.test_max_syncing_epoch()); assert_eq!(1, uploader.data().syncing_data.len()); let (_, syncing_data) = uploader.data().syncing_data.first_key_value().unwrap(); - assert_eq!(epoch1 as HummockEpoch, syncing_data.sync_epoch); + assert_eq!(epoch1 as HummockEpoch, syncing_data.sync_table_epochs[0].0); assert!(syncing_data.uploaded.is_empty()); 
assert!(!syncing_data.remaining_uploading_tasks.is_empty()); @@ -1763,7 +1794,7 @@ pub(crate) mod tests { uploader.start_epochs_for_test([epoch1]); uploader.init_instance(TEST_LOCAL_INSTANCE_ID, TEST_TABLE_ID, epoch1); uploader.local_seal_epoch_for_test(TEST_LOCAL_INSTANCE_ID, epoch1); - uploader.start_sync_epoch(epoch1, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch1, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); assert_eq!(epoch1, uploader.test_max_syncing_epoch()); assert_uploader_pending(&mut uploader).await; @@ -1805,7 +1836,7 @@ pub(crate) mod tests { uploader.add_imm(TEST_LOCAL_INSTANCE_ID, imm); let (sync_tx, sync_rx) = oneshot::channel(); - uploader.start_sync_epoch(epoch1, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch1, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); assert_eq!(epoch1, uploader.test_max_syncing_epoch()); assert_uploader_pending(&mut uploader).await; @@ -1887,7 +1918,7 @@ pub(crate) mod tests { assert_eq!(epoch3, uploader.test_max_syncing_epoch()); let (sync_tx, sync_rx) = oneshot::channel(); - uploader.start_sync_epoch(epoch6, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch6, sync_tx, HashSet::from_iter([TEST_TABLE_ID])); assert_eq!(epoch6, uploader.test_max_syncing_epoch()); uploader.update_pinned_version(version4); assert_eq!(epoch4, uploader.test_max_synced_epoch()); @@ -1982,7 +2013,7 @@ pub(crate) mod tests { new_task_notifier(get_payload_imm_ids(&epoch1_sync_payload)); uploader.local_seal_epoch_for_test(instance_id1, epoch1); let (sync_tx1, mut sync_rx1) = oneshot::channel(); - uploader.start_sync_epoch(epoch1, sync_tx1, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch1, sync_tx1, HashSet::from_iter([TEST_TABLE_ID])); await_start1_4.await; uploader.local_seal_epoch_for_test(instance_id1, epoch2); @@ -2066,7 +2097,7 @@ pub(crate) mod tests { // synced: epoch1: sst([imm1_4]), sst([imm1_3]), sst([imm1_2, imm1_1]) let (sync_tx2, sync_rx2) = oneshot::channel(); - uploader.start_sync_epoch(epoch2, sync_tx2, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch2, sync_tx2, HashSet::from_iter([TEST_TABLE_ID])); uploader.local_seal_epoch_for_test(instance_id2, epoch3); let sst = uploader.next_uploaded_sst().await; assert_eq!(&get_payload_imm_ids(&epoch3_spill_payload1), sst.imm_ids()); @@ -2095,7 +2126,7 @@ pub(crate) mod tests { let (await_start4_with_3_3, finish_tx4_with_3_3) = new_task_notifier(get_payload_imm_ids(&epoch4_sync_payload)); let (sync_tx4, mut sync_rx4) = oneshot::channel(); - uploader.start_sync_epoch(epoch4, sync_tx4, HashSet::from_iter([TEST_TABLE_ID])); + uploader.start_single_epoch_sync(epoch4, sync_tx4, HashSet::from_iter([TEST_TABLE_ID])); await_start4_with_3_3.await; // current uploader state: diff --git a/src/storage/src/hummock/event_handler/uploader/spiller.rs b/src/storage/src/hummock/event_handler/uploader/spiller.rs index 4e560c36eacf..6c39e9d17eea 100644 --- a/src/storage/src/hummock/event_handler/uploader/spiller.rs +++ b/src/storage/src/hummock/event_handler/uploader/spiller.rs @@ -336,17 +336,17 @@ mod tests { // epoch4 spill(imm1_1_4, imm1_2_4, size 2) spill(imm2_4_1, size 1), imm2_4_2 | let (sync_tx1_1, sync_rx1_1) = oneshot::channel(); - uploader.start_sync_epoch(epoch1, sync_tx1_1, HashSet::from_iter([table_id1])); + uploader.start_single_epoch_sync(epoch1, sync_tx1_1, HashSet::from_iter([table_id1])); let (sync_tx2_1, sync_rx2_1) = oneshot::channel(); - 
uploader.start_sync_epoch(epoch2, sync_tx2_1, HashSet::from_iter([table_id1])); + uploader.start_single_epoch_sync(epoch2, sync_tx2_1, HashSet::from_iter([table_id1])); let (sync_tx3_1, sync_rx3_1) = oneshot::channel(); - uploader.start_sync_epoch(epoch3, sync_tx3_1, HashSet::from_iter([table_id1])); + uploader.start_single_epoch_sync(epoch3, sync_tx3_1, HashSet::from_iter([table_id1])); let (sync_tx1_2, sync_rx1_2) = oneshot::channel(); - uploader.start_sync_epoch(epoch1, sync_tx1_2, HashSet::from_iter([table_id2])); + uploader.start_single_epoch_sync(epoch1, sync_tx1_2, HashSet::from_iter([table_id2])); let (sync_tx2_2, sync_rx2_2) = oneshot::channel(); - uploader.start_sync_epoch(epoch2, sync_tx2_2, HashSet::from_iter([table_id2])); + uploader.start_single_epoch_sync(epoch2, sync_tx2_2, HashSet::from_iter([table_id2])); let (sync_tx3_2, sync_rx3_2) = oneshot::channel(); - uploader.start_sync_epoch(epoch3, sync_tx3_2, HashSet::from_iter([table_id2])); + uploader.start_single_epoch_sync(epoch3, sync_tx3_2, HashSet::from_iter([table_id2])); let (await_start2_4_2, finish_tx2_4_2) = new_task_notifier(HashMap::from_iter([( instance_id2, @@ -412,7 +412,11 @@ mod tests { // trigger the sync after the spill task is finished and acked to cover the case let (sync_tx4, mut sync_rx4) = oneshot::channel(); - uploader.start_sync_epoch(epoch4, sync_tx4, HashSet::from_iter([table_id1, table_id2])); + uploader.start_single_epoch_sync( + epoch4, + sync_tx4, + HashSet::from_iter([table_id1, table_id2]), + ); await_start2_4_2.await; let sst = uploader.next_uploaded_sst().await; diff --git a/src/storage/src/hummock/store/hummock_storage.rs b/src/storage/src/hummock/store/hummock_storage.rs index efc4989b8e6e..f2691c73ddd1 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -19,7 +19,6 @@ use std::sync::Arc; use arc_swap::ArcSwap; use bytes::Bytes; -use futures::FutureExt; use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::util::epoch::is_max_epoch; @@ -30,7 +29,7 @@ use risingwave_hummock_sdk::key::{ use risingwave_hummock_sdk::sstable_info::SstableInfo; use risingwave_hummock_sdk::table_watermark::TableWatermarksIndex; use risingwave_hummock_sdk::version::HummockVersion; -use risingwave_hummock_sdk::{HummockReadEpoch, HummockSstableObjectId}; +use risingwave_hummock_sdk::{HummockReadEpoch, HummockSstableObjectId, SyncResult}; use risingwave_rpc_client::HummockMetaClient; use thiserror_ext::AsReport; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; @@ -572,6 +571,21 @@ impl HummockStorage { .expect("should send success"); rx.await.expect("should await success") } + + pub async fn sync( + &self, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, + ) -> StorageResult { + let (tx, rx) = oneshot::channel(); + let _ = self.hummock_event_sender.send(HummockEvent::SyncEpoch { + sync_result_sender: tx, + sync_table_epochs, + }); + let synced_data = rx + .await + .map_err(|_| HummockError::other("failed to receive sync result"))??; + Ok(synced_data.into_sync_result()) + } } impl StateStoreRead for HummockStorage { @@ -704,20 +718,6 @@ impl StateStore for HummockStorage { Ok(()) } - fn sync(&self, epoch: u64, table_ids: HashSet) -> impl SyncFuture { - let (tx, rx) = oneshot::channel(); - let _ = self.hummock_event_sender.send(HummockEvent::SyncEpoch { - new_sync_epoch: epoch, - sync_result_sender: tx, - table_ids, - }); - rx.map(|recv_result| { - Ok(recv_result - .map_err(|_| 
HummockError::other("failed to receive sync result"))?? - .into_sync_result()) - }) - } - fn new_local(&self, option: NewLocalOptions) -> impl Future + Send + '_ { self.new_local_inner(option) } @@ -730,7 +730,7 @@ impl HummockStorage { epoch: u64, table_ids: HashSet, ) -> StorageResult { - self.sync(epoch, table_ids).await + self.sync(vec![(epoch, table_ids)]).await } /// Used in the compaction test tool diff --git a/src/storage/src/memory.rs b/src/storage/src/memory.rs index 6f6de5a47dd0..d78e956123ce 100644 --- a/src/storage/src/memory.rs +++ b/src/storage/src/memory.rs @@ -13,7 +13,7 @@ // limitations under the License. use std::cmp::Ordering; -use std::collections::{BTreeMap, BTreeSet, HashSet, VecDeque}; +use std::collections::{BTreeMap, BTreeSet, VecDeque}; use std::ops::Bound::{Excluded, Included, Unbounded}; use std::ops::{Bound, RangeBounds}; use std::sync::{Arc, LazyLock}; @@ -22,7 +22,7 @@ use bytes::Bytes; use parking_lot::RwLock; use risingwave_common::catalog::TableId; use risingwave_hummock_sdk::key::{FullKey, TableKey, TableKeyRange, UserKey}; -use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch, SyncResult}; +use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch}; use crate::error::StorageResult; use crate::mem_table::MemtableLocalStateStore; @@ -747,16 +747,6 @@ impl StateStore for RangeKvStateStore { Ok(()) } - #[allow(clippy::unused_async)] - fn sync(&self, _epoch: u64, _table_ids: HashSet) -> impl SyncFuture { - let result = self.inner.flush(); - // memory backend doesn't need to push to S3, so this is a no-op - async move { - result?; - Ok(SyncResult::default()) - } - } - async fn new_local(&self, option: NewLocalOptions) -> Self::Local { MemtableLocalStateStore::new(self.clone(), option) } diff --git a/src/storage/src/monitor/monitored_store.rs b/src/storage/src/monitor/monitored_store.rs index e5e3ceaaca01..fc62c010ddc4 100644 --- a/src/storage/src/monitor/monitored_store.rs +++ b/src/storage/src/monitor/monitored_store.rs @@ -23,7 +23,7 @@ use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; -use risingwave_hummock_sdk::HummockReadEpoch; +use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch, SyncResult}; use thiserror_ext::AsReport; use tokio::time::Instant; use tracing::{error, Instrument}; @@ -37,6 +37,7 @@ use crate::hummock::{HummockStorage, SstableObjectIdManagerRef}; use crate::monitor::monitored_storage_metrics::StateStoreIterStats; use crate::monitor::{StateStoreIterLogStats, StateStoreIterStatsTrait}; use crate::store::*; + /// A state store wrapper for monitoring metrics. 
#[derive(Clone)] pub struct MonitoredStateStore { @@ -334,25 +335,6 @@ impl StateStore for MonitoredStateStore { .inspect_err(|e| error!(error = %e.as_report(), "Failed in wait_epoch")) } - fn sync(&self, epoch: u64, table_ids: HashSet) -> impl SyncFuture { - let future = self - .inner - .sync(epoch, table_ids) - .instrument_await("store_sync"); - let timer = self.storage_metrics.sync_duration.start_timer(); - let sync_size = self.storage_metrics.sync_size.clone(); - async move { - let sync_result = future - .await - .inspect_err(|e| error!(error = %e.as_report(), "Failed in sync"))?; - timer.observe_duration(); - if sync_result.sync_size != 0 { - sync_size.observe(sync_result.sync_size as _); - } - Ok(sync_result) - } - } - fn monitored( self, _storage_metrics: Arc, @@ -379,6 +361,26 @@ impl MonitoredStateStore { pub fn sstable_object_id_manager(&self) -> SstableObjectIdManagerRef { self.inner.sstable_object_id_manager().clone() } + + pub async fn sync( + &self, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, + ) -> StorageResult { + let future = self + .inner + .sync(sync_table_epochs) + .instrument_await("store_sync"); + let timer = self.storage_metrics.sync_duration.start_timer(); + let sync_size = self.storage_metrics.sync_size.clone(); + let sync_result = future + .await + .inspect_err(|e| error!(error = %e.as_report(), "Failed in sync"))?; + timer.observe_duration(); + if sync_result.sync_size != 0 { + sync_size.observe(sync_result.sync_size as _); + } + Ok(sync_result) + } } /// A state store iterator wrapper for monitoring metrics. diff --git a/src/storage/src/monitor/traced_store.rs b/src/storage/src/monitor/traced_store.rs index f06c5634a522..bd308081da11 100644 --- a/src/storage/src/monitor/traced_store.rs +++ b/src/storage/src/monitor/traced_store.rs @@ -21,7 +21,7 @@ use risingwave_common::bitmap::Bitmap; use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; -use risingwave_hummock_sdk::HummockReadEpoch; +use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch, SyncResult}; use risingwave_hummock_trace::{ init_collector, should_use_trace, ConcurrentId, MayTraceSpan, OperationResult, StorageType, TraceResult, TraceSpan, TracedBytes, TracedSealCurrentEpochOptions, LOCAL_ID, @@ -279,19 +279,6 @@ impl StateStore for TracedStateStore { res } - fn sync(&self, epoch: u64, table_ids: HashSet) -> impl SyncFuture { - let span: MayTraceSpan = TraceSpan::new_sync_span(epoch, &table_ids, self.storage_type); - - let future = self.inner.sync(epoch, table_ids); - - future.map(move |sync_result| { - span.may_send_result(OperationResult::Sync( - sync_result.as_ref().map(|res| res.sync_size).into(), - )); - sync_result - }) - } - async fn new_local(&self, options: NewLocalOptions) -> Self::Local { TracedStateStore::new_local(self.inner.new_local(options.clone()).await, options) } @@ -368,6 +355,24 @@ impl TracedStateStore { pub fn sstable_object_id_manager(&self) -> &SstableObjectIdManagerRef { self.inner.sstable_object_id_manager() } + + pub async fn sync( + &self, + sync_table_epochs: Vec<(HummockEpoch, HashSet)>, + ) -> StorageResult { + let span: MayTraceSpan = TraceSpan::new_sync_span(&sync_table_epochs, self.storage_type); + + let future = self.inner.sync(sync_table_epochs); + + future + .map(move |sync_result| { + span.may_send_result(OperationResult::Sync( + sync_result.as_ref().map(|res| res.sync_size).into(), + )); + sync_result + }) + .await + } } impl TracedStateStore { diff --git 
a/src/storage/src/panic_store.rs b/src/storage/src/panic_store.rs index ee1f8ebfbbf4..804c7c97334b 100644 --- a/src/storage/src/panic_store.rs +++ b/src/storage/src/panic_store.rs @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; use std::marker::PhantomData; use std::ops::Bound; use std::sync::Arc; use bytes::Bytes; use risingwave_common::bitmap::Bitmap; -use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; use risingwave_hummock_sdk::HummockReadEpoch; @@ -181,11 +179,6 @@ impl StateStore for PanicStateStore { panic!("should not wait epoch from the panic state store!"); } - #[allow(clippy::unused_async)] - fn sync(&self, _epoch: u64, _table_ids: HashSet) -> impl SyncFuture { - async { panic!("should not await sync epoch from the panic state store!") } - } - #[allow(clippy::unused_async)] async fn new_local(&self, _option: NewLocalOptions) -> Self::Local { panic!("should not call new local from the panic state store"); diff --git a/src/storage/src/store.rs b/src/storage/src/store.rs index bf93639b3d70..2a5405b9f9a1 100644 --- a/src/storage/src/store.rs +++ b/src/storage/src/store.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::cmp::min; -use std::collections::HashSet; use std::default::Default; use std::fmt::{Debug, Formatter}; use std::future::Future; @@ -32,7 +31,7 @@ use risingwave_common::hash::VirtualNode; use risingwave_common::util::epoch::{Epoch, EpochPair}; use risingwave_hummock_sdk::key::{FullKey, TableKey, TableKeyRange}; use risingwave_hummock_sdk::table_watermark::{VnodeWatermark, WatermarkDirection}; -use risingwave_hummock_sdk::{HummockReadEpoch, SyncResult}; +use risingwave_hummock_sdk::HummockReadEpoch; use risingwave_hummock_trace::{ TracedInitOptions, TracedNewLocalOptions, TracedOpConsistencyLevel, TracedPrefetchOptions, TracedReadOptions, TracedSealCurrentEpochOptions, TracedTryWaitEpochOptions, @@ -357,8 +356,6 @@ pub trait StateStoreWrite: StaticSendSync { ) -> StorageResult; } -pub trait SyncFuture = Future> + Send + 'static; - #[derive(Clone)] pub struct TryWaitEpochOptions { pub table_id: TableId, @@ -398,8 +395,6 @@ pub trait StateStore: StateStoreRead + StaticSendSync + Clone { options: TryWaitEpochOptions, ) -> impl Future> + Send + '_; - fn sync(&self, epoch: u64, table_ids: HashSet) -> impl SyncFuture; - /// Creates a [`MonitoredStateStore`] from this state store, with given `stats`. fn monitored(self, storage_metrics: Arc) -> MonitoredStateStore { MonitoredStateStore::new(self, storage_metrics) diff --git a/src/storage/src/store_impl.rs b/src/storage/src/store_impl.rs index f59395d26db7..8ce2906ea3e7 100644 --- a/src/storage/src/store_impl.rs +++ b/src/storage/src/store_impl.rs @@ -222,7 +222,6 @@ macro_rules! 
dispatch_state_store { #[cfg(any(debug_assertions, test, feature = "test"))] pub mod verify { - use std::collections::HashSet; use std::fmt::Debug; use std::future::Future; use std::marker::PhantomData; @@ -231,7 +230,6 @@ pub mod verify { use bytes::Bytes; use risingwave_common::bitmap::Bitmap; - use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; use risingwave_hummock_sdk::HummockReadEpoch; @@ -575,20 +573,6 @@ pub mod verify { self.actual.try_wait_epoch(epoch, options) } - fn sync(&self, epoch: u64, table_ids: HashSet) -> impl SyncFuture { - let expected_future = self - .expected - .as_ref() - .map(|expected| expected.sync(epoch, table_ids.clone())); - let actual_future = self.actual.sync(epoch, table_ids); - async move { - if let Some(expected_future) = expected_future { - expected_future.await?; - } - actual_future.await - } - } - async fn new_local(&self, option: NewLocalOptions) -> Self::Local { let expected = if let Some(expected) = &self.expected { Some(expected.new_local(option.clone()).await) @@ -826,20 +810,16 @@ impl AsHummock for SledStateStore { #[cfg(debug_assertions)] pub mod boxed_state_store { - use std::collections::HashSet; use std::future::Future; use std::ops::{Deref, DerefMut}; use std::sync::Arc; use bytes::Bytes; use dyn_clone::{clone_trait_object, DynClone}; - use futures::future::BoxFuture; - use futures::FutureExt; use risingwave_common::bitmap::Bitmap; - use risingwave_common::catalog::TableId; use risingwave_common::hash::VirtualNode; use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; - use risingwave_hummock_sdk::{HummockReadEpoch, SyncResult}; + use risingwave_hummock_sdk::HummockReadEpoch; use crate::error::StorageResult; use crate::hummock::HummockStorage; @@ -1161,12 +1141,6 @@ pub mod boxed_state_store { options: TryWaitEpochOptions, ) -> StorageResult<()>; - fn sync( - &self, - epoch: u64, - table_ids: HashSet, - ) -> BoxFuture<'static, StorageResult>; - async fn new_local(&self, option: NewLocalOptions) -> BoxDynamicDispatchedLocalStateStore; } @@ -1180,14 +1154,6 @@ pub mod boxed_state_store { self.try_wait_epoch(epoch, options).await } - fn sync( - &self, - epoch: u64, - table_ids: HashSet, - ) -> BoxFuture<'static, StorageResult> { - self.sync(epoch, table_ids).boxed() - } - async fn new_local(&self, option: NewLocalOptions) -> BoxDynamicDispatchedLocalStateStore { Box::new(self.new_local(option).await) } @@ -1267,14 +1233,6 @@ pub mod boxed_state_store { self.deref().try_wait_epoch(epoch, options) } - fn sync( - &self, - epoch: u64, - table_ids: HashSet, - ) -> impl Future> + Send + 'static { - self.deref().sync(epoch, table_ids) - } - fn new_local( &self, option: NewLocalOptions, diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index 66f926bf8213..65e6d2cc95d3 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -370,8 +370,6 @@ impl StorageTableInner { pk: impl Row, wait_epoch: HummockReadEpoch, ) -> StorageResult> { - // `get_row` doesn't support select `_rw_timestamp` yet. 
- assert!(self.epoch_idx.is_none()); let epoch = wait_epoch.get_epoch(); let read_backup = matches!(wait_epoch, HummockReadEpoch::Backup(_)); let read_committed = wait_epoch.is_read_committed(); @@ -406,7 +404,11 @@ impl StorageTableInner { cache_policy: CachePolicy::Fill(CacheContext::Default), ..Default::default() }; - if let Some(value) = self.store.get(serialized_pk, epoch, read_options).await? { + if let Some((full_key, value)) = self + .store + .get_keyed_row(serialized_pk, epoch, read_options) + .await? + { let row = self.row_serde.deserialize(&value)?; let result_row_in_value = self.mapping.project(OwnedRow::new(row)); @@ -416,7 +418,13 @@ impl StorageTableInner { pk.project(&self.output_row_in_key_indices).into_owned_row(); let mut result_row_vec = vec![]; for idx in &self.output_indices { - if self.value_output_indices.contains(idx) { + if let Some(epoch_idx) = self.epoch_idx + && *idx == epoch_idx + { + let epoch = Epoch::from(full_key.epoch_with_gap.pure_epoch()); + result_row_vec + .push(risingwave_common::types::Datum::from(epoch.as_scalar())); + } else if self.value_output_indices.contains(idx) { let item_position_in_value_indices = &self .value_output_indices .iter() @@ -440,7 +448,32 @@ impl StorageTableInner { let result_row = OwnedRow::new(result_row_vec); Ok(Some(result_row)) } - None => Ok(Some(result_row_in_value.into_owned_row())), + None => match &self.epoch_idx { + Some(epoch_idx) => { + let mut result_row_vec = vec![]; + for idx in &self.output_indices { + if idx == epoch_idx { + let epoch = Epoch::from(full_key.epoch_with_gap.pure_epoch()); + result_row_vec + .push(risingwave_common::types::Datum::from(epoch.as_scalar())); + } else { + let item_position_in_value_indices = &self + .value_output_indices + .iter() + .position(|p| idx == p) + .unwrap(); + result_row_vec.push( + result_row_in_value + .datum_at(*item_position_in_value_indices) + .to_owned_datum(), + ); + } + } + let result_row = OwnedRow::new(result_row_vec); + Ok(Some(result_row)) + } + None => Ok(Some(result_row_in_value.into_owned_row())), + }, } } else { Ok(None) @@ -452,10 +485,6 @@ impl StorageTableInner { pub fn update_vnode_bitmap(&mut self, new_vnodes: Arc) -> Arc { self.distribution.update_vnode_bitmap(new_vnodes) } - - pub fn has_epoch_idx(&self) -> bool { - self.epoch_idx.is_some() - } } pub trait PkAndRowStream = Stream>> + Send; diff --git a/src/stream/src/executor/source/fetch_executor.rs b/src/stream/src/executor/source/fetch_executor.rs index 13bbac436d36..8964eaecff45 100644 --- a/src/stream/src/executor/source/fetch_executor.rs +++ b/src/stream/src/executor/source/fetch_executor.rs @@ -160,9 +160,9 @@ impl FsFetchExecutor { batch: SplitBatch, rate_limit_rps: Option, ) -> StreamExecutorResult { - let stream = source_desc + let (stream, _) = source_desc .source - .build_stream(batch, column_ids, Arc::new(source_ctx)) + .build_stream(batch, column_ids, Arc::new(source_ctx), false) .await .map_err(StreamExecutorError::connector_error)?; Ok(apply_rate_limit(stream, rate_limit_rps).boxed()) diff --git a/src/stream/src/executor/source/source_backfill_executor.rs b/src/stream/src/executor/source/source_backfill_executor.rs index 9df74a719d46..bbf71b281d3e 100644 --- a/src/stream/src/executor/source/source_backfill_executor.rs +++ b/src/stream/src/executor/source/source_backfill_executor.rs @@ -570,6 +570,33 @@ impl SourceBackfillExecutorInner { ) .await?; } + Mutation::Throttle(actor_to_apply) => { + if let Some(new_rate_limit) = + actor_to_apply.get(&self.actor_ctx.id) + && *new_rate_limit 
!= self.rate_limit_rps + { + tracing::info!( + "updating rate limit from {:?} to {:?}", + self.rate_limit_rps, + *new_rate_limit + ); + self.rate_limit_rps = *new_rate_limit; + // rebuild reader + let (reader, _backfill_info) = self + .build_stream_source_reader( + &source_desc, + backfill_stage + .get_latest_unfinished_splits()?, + ) + .await?; + + backfill_stream = select_with_strategy( + input.by_ref().map(Either::Left), + reader.map(Either::Right), + select_strategy, + ); + } + } _ => {} } } @@ -609,7 +636,6 @@ impl SourceBackfillExecutorInner { .await?; if self.should_report_finished(&backfill_stage.states) { - tracing::debug!("progress finish"); self.progress.finish( barrier.epoch, backfill_stage.total_backfilled_rows(), diff --git a/src/stream/src/executor/source/source_executor.rs b/src/stream/src/executor/source/source_executor.rs index e0bbe3d1f6d9..80b252014d28 100644 --- a/src/stream/src/executor/source/source_executor.rs +++ b/src/stream/src/executor/source/source_executor.rs @@ -71,7 +71,7 @@ pub struct SourceExecutor { /// Rate limit in rows/s. rate_limit_rps: Option, - is_shared: bool, + is_shared_non_cdc: bool, } impl SourceExecutor { @@ -82,7 +82,7 @@ impl SourceExecutor { barrier_receiver: UnboundedReceiver, system_params: SystemParamsReaderRef, rate_limit_rps: Option, - is_shared: bool, + is_shared_non_cdc: bool, ) -> Self { Self { actor_ctx, @@ -91,7 +91,7 @@ impl SourceExecutor { barrier_receiver: Some(barrier_receiver), system_params, rate_limit_rps, - is_shared, + is_shared_non_cdc, } } @@ -116,11 +116,13 @@ impl SourceExecutor { })) } + /// If `seek_to_latest` is true, will also return the latest splits after seek. pub async fn build_stream_source_reader( &self, source_desc: &SourceDesc, state: ConnectorState, - ) -> StreamExecutorResult { + seek_to_latest: bool, + ) -> StreamExecutorResult<(BoxChunkSourceStream, Option>)> { let column_ids = source_desc .columns .iter() @@ -183,13 +185,16 @@ impl SourceExecutor { source_desc.source.config.clone(), schema_change_tx, ); - let stream = source_desc + let (stream, latest_splits) = source_desc .source - .build_stream(state, column_ids, Arc::new(source_ctx)) + .build_stream(state, column_ids, Arc::new(source_ctx), seek_to_latest) .await - .map_err(StreamExecutorError::connector_error); + .map_err(StreamExecutorError::connector_error)?; - Ok(apply_rate_limit(stream?, self.rate_limit_rps).boxed()) + Ok(( + apply_rate_limit(stream, self.rate_limit_rps).boxed(), + latest_splits, + )) } fn is_auto_schema_change_enable(&self) -> bool { @@ -367,10 +372,10 @@ impl SourceExecutor { ); // Replace the source reader with a new one of the new state. - let reader = self - .build_stream_source_reader(source_desc, Some(target_state.clone())) - .await? - .map_err(StreamExecutorError::connector_error); + let (reader, _) = self + .build_stream_source_reader(source_desc, Some(target_state.clone()), false) + .await?; + let reader = reader.map_err(StreamExecutorError::connector_error); stream.replace_data_stream(reader); @@ -459,7 +464,7 @@ impl SourceExecutor { }; core.split_state_store.init_epoch(first_epoch).await?; - + let mut is_uninitialized = self.actor_ctx.initial_dispatch_num == 0; for ele in &mut boot_state { if let Some(recover_state) = core .split_state_store @@ -467,42 +472,47 @@ impl SourceExecutor { .await? { *ele = recover_state; + // if state store is non-empty, we consider it's initialized. + is_uninitialized = false; } else { // This is a new split, not in state table. 
- if self.is_shared { - // For shared source, we start from latest and let the downstream SourceBackfillExecutors to read historical data. - // It's highly probable that the work of scanning historical data cannot be shared, - // so don't waste work on it. - // For more details, see https://github.com/risingwavelabs/risingwave/issues/16576#issuecomment-2095413297 - if ele.is_cdc_split() { - // shared CDC source already starts from latest. - continue; - } - match ele { - SplitImpl::Kafka(split) => { - split.seek_to_latest_offset(); - } - _ => unreachable!("only kafka source can be shared, got {:?}", ele), - } - } + // make sure it is written to state table later. + // Then even it receives no messages, we can observe it in state table. + core.updated_splits_in_epoch.insert(ele.id(), ele.clone()); } } // init in-memory split states with persisted state if any core.init_split_state(boot_state.clone()); - let mut is_uninitialized = self.actor_ctx.initial_dispatch_num == 0; // Return the ownership of `stream_source_core` to the source executor. self.stream_source_core = Some(core); let recover_state: ConnectorState = (!boot_state.is_empty()).then_some(boot_state); tracing::debug!(state = ?recover_state, "start with state"); - let source_chunk_reader = self - .build_stream_source_reader(&source_desc, recover_state) + let (source_chunk_reader, latest_splits) = self + .build_stream_source_reader( + &source_desc, + recover_state, + // For shared source, we start from latest and let the downstream SourceBackfillExecutors to read historical data. + // It's highly probable that the work of scanning historical data cannot be shared, + // so don't waste work on it. + // For more details, see https://github.com/risingwavelabs/risingwave/issues/16576#issuecomment-2095413297 + // Note that shared CDC source is special. It already starts from latest. + self.is_shared_non_cdc && is_uninitialized, + ) .instrument_await("source_build_reader") - .await? - .map_err(StreamExecutorError::connector_error); - + .await?; + let source_chunk_reader = source_chunk_reader.map_err(StreamExecutorError::connector_error); + if let Some(latest_splits) = latest_splits { + // make sure it is written to state table later. + // Then even it receives no messages, we can observe it in state table. + self.stream_source_core + .as_mut() + .unwrap() + .updated_splits_in_epoch + .extend(latest_splits.into_iter().map(|s| (s.id(), s))); + } // Merge the chunks from source and the barriers into a single stream. We prioritize // barriers over source data chunks here. let barrier_stream = barrier_to_message_stream(barrier_receiver).boxed(); @@ -510,14 +520,9 @@ impl SourceExecutor { StreamReaderWithPause::::new(barrier_stream, source_chunk_reader); let mut command_paused = false; - // - For shared source, pause until there's a MV. // - If the first barrier requires us to pause on startup, pause the stream. 
- if (self.is_shared && is_uninitialized) || is_pause_on_startup { - tracing::info!( - is_shared = self.is_shared, - is_uninitialized = is_uninitialized, - "source paused on startup" - ); + if is_pause_on_startup { + tracing::info!("source paused on startup"); stream.pause_stream(); command_paused = true; } @@ -562,14 +567,6 @@ impl SourceExecutor { let epoch = barrier.epoch; - if self.is_shared - && is_uninitialized - && barrier.has_more_downstream_fragments(self.actor_ctx.id) - { - stream.resume_stream(); - is_uninitialized = false; - } - if let Some(mutation) = barrier.mutation.as_deref() { match mutation { Mutation::Pause => { @@ -611,7 +608,7 @@ impl SourceExecutor { if let Some(new_rate_limit) = actor_to_apply.get(&self.actor_ctx.id) && *new_rate_limit != self.rate_limit_rps { - tracing::debug!( + tracing::info!( "updating rate limit from {:?} to {:?}", self.rate_limit_rps, *new_rate_limit diff --git a/src/stream/src/executor/top_n/group_top_n.rs b/src/stream/src/executor/top_n/group_top_n.rs index 9c7d037c40ac..9bf1fecc3f3a 100644 --- a/src/stream/src/executor/top_n/group_top_n.rs +++ b/src/stream/src/executor/top_n/group_top_n.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::ops::{Deref, DerefMut}; use risingwave_common::array::Op; use risingwave_common::bitmap::Bitmap; use risingwave_common::hash::HashKey; -use risingwave_common::row::RowExt; +use risingwave_common::row::{RowDeserializer, RowExt}; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_common::util::sort_util::ColumnOrder; @@ -157,14 +158,18 @@ impl TopNExecutorBase where TopNCache: TopNCacheTrait, { - async fn apply_chunk(&mut self, chunk: StreamChunk) -> StreamExecutorResult { - let mut res_ops = Vec::with_capacity(self.limit); - let mut res_rows = Vec::with_capacity(self.limit); + async fn apply_chunk( + &mut self, + chunk: StreamChunk, + ) -> StreamExecutorResult> { let keys = K::build_many(&self.group_by, chunk.data_chunk()); + let mut stagings = HashMap::new(); // K -> `TopNStaging` + for (r, group_cache_key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { let Some((op, row_ref)) = r else { continue; }; + // The pk without group by let pk_row = row_ref.project(&self.storage_key_indices[self.group_by.len()..]); let cache_key = serialize_pk_to_cache_key(pk_row, &self.cache_key_serde); @@ -184,12 +189,13 @@ where } let mut cache = self.caches.get_mut(group_cache_key).unwrap(); + let staging = stagings.entry(group_cache_key.clone()).or_default(); // apply the chunk to state table match op { Op::Insert | Op::UpdateInsert => { self.managed_state.insert(row_ref); - cache.insert(cache_key, row_ref, &mut res_ops, &mut res_rows); + cache.insert(cache_key, row_ref, staging); } Op::Delete | Op::UpdateDelete => { @@ -200,17 +206,27 @@ where &mut self.managed_state, cache_key, row_ref, - &mut res_ops, - &mut res_rows, + staging, ) .await?; } } } + self.metrics .group_top_n_cached_entry_count .set(self.caches.len() as i64); - generate_output(res_rows, res_ops, &self.schema) + + let data_types = self.schema.data_types(); + let deserializer = RowDeserializer::new(data_types.clone()); + let mut chunk_builder = StreamChunkBuilder::unlimited(data_types, Some(chunk.capacity())); + for staging in stagings.into_values() { + for res in staging.into_deserialized_changes(&deserializer) { + let (op, row) = res?; + let _none = 
chunk_builder.append_row(op, row); + } + } + Ok(chunk_builder.take()) } async fn flush_data(&mut self, epoch: EpochPair) -> StreamExecutorResult<()> { @@ -250,7 +266,6 @@ where mod tests { use std::sync::atomic::AtomicU64; - use assert_matches::assert_matches; use risingwave_common::array::stream_chunk::StreamChunkTestExt; use risingwave_common::catalog::Field; use risingwave_common::hash::SerializedKey; @@ -260,7 +275,7 @@ mod tests { use super::*; use crate::executor::test_utils::top_n_executor::create_in_memory_state_table; - use crate::executor::test_utils::MockSource; + use crate::executor::test_utils::{MockSource, StreamExecutorTestExt}; fn create_schema() -> Schema { Schema { @@ -357,7 +372,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = GroupTopNExecutor::::new( + let top_n = GroupTopNExecutor::::new( source, ActorContext::for_test(0), schema, @@ -369,14 +384,13 @@ mod tests { Arc::new(AtomicU64::new(0)), ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I + 10 9 1 + 8 8 2 @@ -384,58 +398,50 @@ mod tests { + 9 1 1 + 10 1 1 ", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I - 10 9 1 - 8 8 2 - 10 1 1 + 8 1 3 ", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I - 7 8 2 - 8 1 3 - 9 1 1 ", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I + 5 1 1 + 2 1 1 ", - ), + ) + .sort_rows(), ); } @@ -453,7 +459,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = GroupTopNExecutor::::new( + let top_n = GroupTopNExecutor::::new( source, ActorContext::for_test(0), schema, @@ -465,66 +471,57 @@ mod tests { Arc::new(AtomicU64::new(0)), ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I + 8 8 2 + 10 1 1 + 8 1 3 ", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - 
Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I - 8 8 2 - 10 1 1 ", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I - 8 1 3", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I + 5 1 1 + 3 1 2 ", - ), + ) + .sort_rows(), ); } @@ -542,7 +539,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = GroupTopNExecutor::::new( + let top_n = GroupTopNExecutor::::new( source, ActorContext::for_test(0), schema, @@ -554,14 +551,13 @@ mod tests { Arc::new(AtomicU64::new(0)), ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I + 10 9 1 + 8 8 2 @@ -569,56 +565,148 @@ mod tests { + 9 1 1 + 10 1 1 + 8 1 3", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I - 10 9 1 - 8 8 2 - 10 1 1", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I - 7 8 2 - 8 1 3 - 9 1 1", - ), + ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - res.as_chunk().unwrap(), - &StreamChunk::from_pretty( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( " I I I + 5 1 1 + 2 1 1 + 3 1 2 + 4 1 3", - ), + ) + .sort_rows(), ); } + + #[tokio::test] + async fn test_compact_changes() { + let schema = create_schema(); + let source = MockSource::with_messages(vec![ + Message::Barrier(Barrier::new_test_barrier(test_epoch(1))), + Message::Chunk(StreamChunk::from_pretty( + " I I I + + 0 0 9 + + 0 0 8 + + 0 0 7 + + 0 0 6 + + 0 1 15 + + 0 1 14", + )), + Message::Barrier(Barrier::new_test_barrier(test_epoch(2))), + Message::Chunk(StreamChunk::from_pretty( + " I I I + - 0 0 6 + - 0 0 8 + 
+ 0 0 4 + + 0 0 3 + + 0 1 12 + + 0 2 26 + - 0 1 12 + + 0 1 11", + )), + Message::Barrier(Barrier::new_test_barrier(test_epoch(3))), + Message::Chunk(StreamChunk::from_pretty( + " I I I + + 0 0 11", // this should result in no chunk output + )), + Message::Barrier(Barrier::new_test_barrier(test_epoch(4))), + ]) + .into_executor(schema.clone(), vec![2]); + + let state_table = create_in_memory_state_table( + &schema.data_types(), + &[ + OrderType::ascending(), + OrderType::ascending(), + OrderType::ascending(), + ], + &[0, 1, 2], // table pk = group key (0, 1) + order key (2) + additional pk (empty) + ) + .await; + + let top_n = GroupTopNExecutor::::new( + source, + ActorContext::for_test(0), + schema, + vec![ + ColumnOrder::new(0, OrderType::ascending()), + ColumnOrder::new(1, OrderType::ascending()), + ColumnOrder::new(2, OrderType::ascending()), + ], + (0, 2), // (offset, limit) + vec![ColumnOrder::new(2, OrderType::ascending())], + vec![0, 1], + state_table, + Arc::new(AtomicU64::new(0)), + ) + .unwrap(); + let mut top_n = top_n.boxed().execute(); + + // initial barrier + top_n.expect_barrier().await; + + assert_eq!( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( + " I I I + + 0 0 7 + + 0 0 6 + + 0 1 15 + + 0 1 14", + ) + .sort_rows(), + ); + top_n.expect_barrier().await; + + assert_eq!( + top_n.expect_chunk().await.sort_rows(), + StreamChunk::from_pretty( + " I I I + - 0 0 6 + - 0 0 7 + + 0 0 4 + + 0 0 3 + - 0 1 15 + + 0 1 11 + + 0 2 26", + ) + .sort_rows(), + ); + top_n.expect_barrier().await; + + // no output chunk for the last input chunk + top_n.expect_barrier().await; + } } diff --git a/src/stream/src/executor/top_n/group_top_n_appendonly.rs b/src/stream/src/executor/top_n/group_top_n_appendonly.rs index 2cf1741169ce..edabb31c3296 100644 --- a/src/stream/src/executor/top_n/group_top_n_appendonly.rs +++ b/src/stream/src/executor/top_n/group_top_n_appendonly.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::HashMap; + use risingwave_common::array::Op; use risingwave_common::bitmap::Bitmap; use risingwave_common::hash::HashKey; @@ -137,17 +139,20 @@ impl TopNExecutorBase where TopNCache: AppendOnlyTopNCacheTrait, { - async fn apply_chunk(&mut self, chunk: StreamChunk) -> StreamExecutorResult { - let mut res_ops = Vec::with_capacity(self.limit); - let mut res_rows = Vec::with_capacity(self.limit); + async fn apply_chunk( + &mut self, + chunk: StreamChunk, + ) -> StreamExecutorResult> { let keys = K::build_many(&self.group_by, chunk.data_chunk()); + let mut stagings = HashMap::new(); // K -> `TopNStaging` let data_types = self.schema.data_types(); - let row_deserializer = RowDeserializer::new(data_types.clone()); + let deserializer = RowDeserializer::new(data_types.clone()); for (r, group_cache_key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { let Some((op, row_ref)) = r else { continue; }; + // The pk without group by let pk_row = row_ref.project(&self.storage_key_indices[self.group_by.len()..]); let cache_key = serialize_pk_to_cache_key(pk_row, &self.cache_key_serde); @@ -164,22 +169,33 @@ where .await?; self.caches.push(group_cache_key.clone(), topn_cache); } + let mut cache = self.caches.get_mut(group_cache_key).unwrap(); + let staging = stagings.entry(group_cache_key.clone()).or_default(); debug_assert_eq!(op, Op::Insert); cache.insert( cache_key, row_ref, - &mut res_ops, - &mut res_rows, + staging, &mut self.managed_state, - &row_deserializer, + &deserializer, )?; } + self.metrics .group_top_n_cached_entry_count .set(self.caches.len() as i64); - generate_output(res_rows, res_ops, &self.schema) + + let mut chunk_builder = StreamChunkBuilder::unlimited(data_types, Some(chunk.capacity())); + for staging in stagings.into_values() { + for res in staging.into_deserialized_changes(&deserializer) { + let (op, row) = res?; + let _none = chunk_builder.append_row(op, row); + } + } + + Ok(chunk_builder.take()) } async fn flush_data(&mut self, epoch: EpochPair) -> StreamExecutorResult<()> { diff --git a/src/stream/src/executor/top_n/top_n_appendonly.rs b/src/stream/src/executor/top_n/top_n_appendonly.rs index 2dcf36b2250b..9d4e8127c2a9 100644 --- a/src/stream/src/executor/top_n/top_n_appendonly.rs +++ b/src/stream/src/executor/top_n/top_n_appendonly.rs @@ -17,7 +17,7 @@ use risingwave_common::row::{RowDeserializer, RowExt}; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::sort_util::ColumnOrder; -use super::top_n_cache::AppendOnlyTopNCacheTrait; +use super::top_n_cache::{AppendOnlyTopNCacheTrait, TopNStaging}; use super::utils::*; use super::{ManagedTopNState, TopNCache}; use crate::executor::prelude::*; @@ -104,11 +104,13 @@ impl TopNExecutorBase where TopNCache: AppendOnlyTopNCacheTrait, { - async fn apply_chunk(&mut self, chunk: StreamChunk) -> StreamExecutorResult { - let mut res_ops = Vec::with_capacity(self.cache.limit); - let mut res_rows = Vec::with_capacity(self.cache.limit); + async fn apply_chunk( + &mut self, + chunk: StreamChunk, + ) -> StreamExecutorResult> { + let mut staging = TopNStaging::new(); let data_types = self.schema.data_types(); - let row_deserializer = RowDeserializer::new(data_types); + let deserializer = RowDeserializer::new(data_types.clone()); // apply the chunk to state table for (op, row_ref) in chunk.rows() { debug_assert_eq!(op, Op::Insert); @@ -117,14 +119,21 @@ where self.cache.insert( cache_key, row_ref, - &mut res_ops, - &mut res_rows, + &mut staging, &mut self.managed_state, - &row_deserializer, + 
&deserializer, )?; } - generate_output(res_rows, res_ops, &self.schema) + if staging.is_empty() { + return Ok(None); + } + let mut chunk_builder = StreamChunkBuilder::unlimited(data_types, Some(staging.len())); + for res in staging.into_deserialized_changes(&deserializer) { + let (op, row) = res?; + let _none = chunk_builder.append_row(op, row); + } + Ok(chunk_builder.take()) } async fn flush_data(&mut self, epoch: EpochPair) -> StreamExecutorResult<()> { @@ -151,8 +160,6 @@ where #[cfg(test)] mod tests { - use assert_matches::assert_matches; - use futures::StreamExt; use risingwave_common::array::stream_chunk::StreamChunkTestExt; use risingwave_common::array::StreamChunk; use risingwave_common::catalog::{Field, Schema}; @@ -162,7 +169,7 @@ mod tests { use super::AppendOnlyTopNExecutor; use crate::executor::test_utils::top_n_executor::create_in_memory_state_table; - use crate::executor::test_utils::MockSource; + use crate::executor::test_utils::{MockSource, StreamExecutorTestExt}; use crate::executor::{ActorContext, Barrier, Execute, Executor, Message, PkIndices}; fn create_stream_chunks() -> Vec { @@ -241,7 +248,7 @@ mod tests { .await; let schema = source.schema().clone(); - let top_n_executor = AppendOnlyTopNExecutor::<_, false>::new( + let top_n = AppendOnlyTopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -251,54 +258,43 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init epoch - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 1 0 + 2 1 + 3 2 - + 10 3 + 9 4 - - 10 3 + 8 5" ) + .sort_rows(), ); // We added (1, 2, 3, 10, 9, 8). // Now (1, 2, 3, 8, 9) // Barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 9 4 - + 7 6 - 8 5 + 3 7 - - 7 6 + 1 8" ) + .sort_rows(), ); // We added (7, 3, 1, 9). // Now (1, 1, 2, 3, 3) // Barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 3 7 @@ -306,6 +302,7 @@ mod tests { - 3 2 + 1 13" ) + .sort_rows(), ); // We added (1, 1, 2, 3). // Now (1, 1, 1, 1, 2) @@ -322,7 +319,7 @@ mod tests { .await; let schema = source.schema().clone(); - let top_n_executor = AppendOnlyTopNExecutor::<_, false>::new( + let top_n = AppendOnlyTopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -332,30 +329,26 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init epoch - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 10 3 + 9 4 + 8 5" ) + .sort_rows(), ); // We added (1, 2, 3, 10, 9, 8). 
// Now (1, 2, 3) -> (8, 9, 10) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 7 6 @@ -364,17 +357,14 @@ mod tests { - 9 4 + 3 2" ) + .sort_rows(), ); // We added (7, 3, 1, 9). // Now (1, 1, 2) -> (3, 3, 7, 8) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 8 5 @@ -384,6 +374,7 @@ mod tests { - 3 7 + 2 14" ) + .sort_rows(), ); // We added (1, 1, 2, 3). // Now (1, 1, 1) -> (1, 2, 2, 3) diff --git a/src/stream/src/executor/top_n/top_n_cache.rs b/src/stream/src/executor/top_n/top_n_cache.rs index d8211b4ad076..c7536bf98bbb 100644 --- a/src/stream/src/executor/top_n/top_n_cache.rs +++ b/src/stream/src/executor/top_n/top_n_cache.rs @@ -13,12 +13,13 @@ // limitations under the License. use std::cmp::Ordering; +use std::collections::BTreeMap; use std::fmt::Debug; use std::future::Future; use itertools::Itertools; use risingwave_common::array::{Op, RowRef}; -use risingwave_common::row::{CompactedRow, Row, RowDeserializer, RowExt}; +use risingwave_common::row::{CompactedRow, OwnedRow, Row, RowDeserializer, RowExt}; use risingwave_common::types::DataType; use risingwave_common_estimate_size::collections::EstimatedBTreeMap; use risingwave_common_estimate_size::EstimateSize; @@ -149,14 +150,7 @@ pub trait TopNCacheTrait { /// /// Changes in `self.middle` is recorded to `res_ops` and `res_rows`, which will be /// used to generate messages to be sent to downstream operators. - #[allow(clippy::too_many_arguments)] - fn insert( - &mut self, - cache_key: CacheKey, - row: impl Row + Send, - res_ops: &mut Vec, - res_rows: &mut Vec, - ); + fn insert(&mut self, cache_key: CacheKey, row: impl Row + Send, staging: &mut TopNStaging); /// Delete input row from the cache. /// @@ -166,15 +160,13 @@ pub trait TopNCacheTrait { /// Because we may need to refill data from the state table to `self.high` during the delete /// operation, we need to pass in `group_key`, `epoch` and `managed_state` to do a prefix /// scan of the state table. 
- #[allow(clippy::too_many_arguments)] fn delete( &mut self, group_key: Option, managed_state: &mut ManagedTopNState, cache_key: CacheKey, row: impl Row + Send, - res_ops: &mut Vec, - res_rows: &mut Vec, + staging: &mut TopNStaging, ) -> impl Future> + Send; } @@ -286,22 +278,11 @@ impl TopNCache { } impl TopNCacheTrait for TopNCache { - fn insert( - &mut self, - cache_key: CacheKey, - row: impl Row + Send, - res_ops: &mut Vec, - res_rows: &mut Vec, - ) { + fn insert(&mut self, cache_key: CacheKey, row: impl Row + Send, staging: &mut TopNStaging) { if let Some(row_count) = self.table_row_count.as_mut() { *row_count += 1; } - let mut append_res = |op: Op, row: CompactedRow| { - res_ops.push(op); - res_rows.push(row); - }; - let mut to_insert = (cache_key, (&row).into()); let mut is_last_of_lower_cache = false; // for saving one key comparison @@ -328,8 +309,8 @@ impl TopNCacheTrait for TopNCache { // try insert into middle cache if !self.middle_is_full() { - self.middle.insert(to_insert.0, to_insert.1.clone()); - append_res(Op::Insert, to_insert.1); + self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); + staging.insert(to_insert.0, to_insert.1); return; } @@ -338,10 +319,10 @@ impl TopNCacheTrait for TopNCache { if is_last_of_lower_cache || &to_insert.0 < middle_last.key() { // make space for the new entry let middle_last = middle_last.remove_entry(); - self.middle.insert(to_insert.0, to_insert.1.clone()); + self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); - append_res(Op::Delete, middle_last.1.clone()); - append_res(Op::Insert, to_insert.1); + staging.delete(middle_last.0.clone(), middle_last.1.clone()); + staging.insert(to_insert.0, to_insert.1); to_insert = middle_last; // move the last entry to the high cache is_last_of_lower_cache = true; @@ -382,8 +363,7 @@ impl TopNCacheTrait for TopNCache { managed_state: &mut ManagedTopNState, cache_key: CacheKey, row: impl Row + Send, - res_ops: &mut Vec, - res_rows: &mut Vec, + staging: &mut TopNStaging, ) -> StreamExecutorResult<()> { if !enable_strict_consistency() && self.table_row_count == Some(0) { // If strict consistency is disabled, and we receive a `DELETE` but the row count is 0, we @@ -395,11 +375,6 @@ impl TopNCacheTrait for TopNCache { *row_count -= 1; } - let mut append_res = |op: Op, row: CompactedRow| { - res_ops.push(op); - res_rows.push(row); - }; - if self.middle_is_full() && &cache_key > self.middle.last_key_value().unwrap().0 { // the row is in high self.high.remove(&cache_key); @@ -414,7 +389,7 @@ impl TopNCacheTrait for TopNCache { { // the row is in middle let removed = self.middle.remove(&cache_key); - append_res(Op::Delete, (&row).into()); + staging.delete(cache_key.clone(), (&row).into()); if removed.is_none() { // the middle cache should always be synced, if the key is not found, then it also doesn't @@ -443,8 +418,9 @@ impl TopNCacheTrait for TopNCache { // bring one element, if any, from high cache to middle cache if !self.high.is_empty() { let high_first = self.high.pop_first().unwrap(); - append_res(Op::Insert, high_first.1.clone()); - self.middle.insert(high_first.0, high_first.1); + self.middle + .insert(high_first.0.clone(), high_first.1.clone()); + staging.insert(high_first.0, high_first.1); } assert!(self.high.is_empty() || self.middle.len() == self.limit); @@ -463,7 +439,7 @@ impl TopNCacheTrait for TopNCache { // bring one element, if any, from middle cache to low cache if !self.middle.is_empty() { let middle_first = self.middle.pop_first().unwrap(); - append_res(Op::Delete, 
middle_first.1.clone()); + staging.delete(middle_first.0.clone(), middle_first.1.clone()); low.insert(middle_first.0, middle_first.1); // fill the high cache if it's not synced @@ -482,8 +458,9 @@ impl TopNCacheTrait for TopNCache { // bring one element, if any, from high cache to middle cache if !self.high.is_empty() { let high_first = self.high.pop_first().unwrap(); - append_res(Op::Insert, high_first.1.clone()); - self.middle.insert(high_first.0, high_first.1); + self.middle + .insert(high_first.0.clone(), high_first.1.clone()); + staging.insert(high_first.0, high_first.1); } } } @@ -493,13 +470,7 @@ impl TopNCacheTrait for TopNCache { } impl TopNCacheTrait for TopNCache { - fn insert( - &mut self, - cache_key: CacheKey, - row: impl Row + Send, - res_ops: &mut Vec, - res_rows: &mut Vec, - ) { + fn insert(&mut self, cache_key: CacheKey, row: impl Row + Send, staging: &mut TopNStaging) { if let Some(row_count) = self.table_row_count.as_mut() { *row_count += 1; } @@ -509,18 +480,13 @@ impl TopNCacheTrait for TopNCache { "Offset is not supported yet for WITH TIES, so low cache should be None" ); - let mut append_res = |op: Op, row: CompactedRow| { - res_ops.push(op); - res_rows.push(row); - }; - let to_insert: (CacheKey, CompactedRow) = (cache_key, (&row).into()); // try insert into middle cache if !self.middle_is_full() { self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); - append_res(Op::Insert, to_insert.1); + staging.insert(to_insert.0.clone(), to_insert.1); return; } @@ -550,7 +516,7 @@ impl TopNCacheTrait for TopNCache { && middle_last.key().0 == middle_last_sort_key { let middle_last = middle_last.remove_entry(); - append_res(Op::Delete, middle_last.1.clone()); + staging.delete(middle_last.0.clone(), middle_last.1.clone()); // we can blindly move entries from middle cache to high cache no matter high cache is synced or not self.high.insert(middle_last.0, middle_last.1); } @@ -564,13 +530,13 @@ impl TopNCacheTrait for TopNCache { self.high.retain(|k, _| k.0 != high_last_sort_key); } - append_res(Op::Insert, to_insert.1.clone()); - self.middle.insert(to_insert.0, to_insert.1); + self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); + staging.insert(to_insert.0, to_insert.1); } Ordering::Equal => { // the row is in middle and is a tie of the last row - append_res(Op::Insert, to_insert.1.clone()); - self.middle.insert(to_insert.0, to_insert.1); + self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); + staging.insert(to_insert.0, to_insert.1); } Ordering::Greater => { // the row is in high @@ -610,8 +576,7 @@ impl TopNCacheTrait for TopNCache { managed_state: &mut ManagedTopNState, cache_key: CacheKey, row: impl Row + Send, - res_ops: &mut Vec, - res_rows: &mut Vec, + staging: &mut TopNStaging, ) -> StreamExecutorResult<()> { if !enable_strict_consistency() && self.table_row_count == Some(0) { // If strict consistency is disabled, and we receive a `DELETE` but the row count is 0, we @@ -627,18 +592,13 @@ impl TopNCacheTrait for TopNCache { "Offset is not supported yet for WITH TIES, so low cache should be None" ); - let mut append_res = |op: Op, row: CompactedRow| { - res_ops.push(op); - res_rows.push(row); - }; - if self.middle.is_empty() { consistency_error!( ?group_key, ?cache_key, "middle cache is empty, but we receive a DELETE operation" ); - append_res(Op::Delete, (&row).into()); + staging.delete(cache_key, (&row).into()); return Ok(()); } @@ -651,7 +611,7 @@ impl TopNCacheTrait for TopNCache { } else { // the row is in middle 
self.middle.remove(&cache_key); - append_res(Op::Delete, (&row).into()); + staging.delete(cache_key.clone(), (&row).into()); if self.middle.len() >= self.limit { // this can happen when there are ties return Ok(()); @@ -675,12 +635,13 @@ impl TopNCacheTrait for TopNCache { let high_first_sort_key = (high_first.0).0.clone(); assert!(high_first_sort_key > middle_last_sort_key); - append_res(Op::Insert, high_first.1.clone()); - self.middle.insert(high_first.0, high_first.1); + self.middle + .insert(high_first.0.clone(), high_first.1.clone()); + staging.insert(high_first.0, high_first.1); for (cache_key, row) in self.high.extract_if(|k, _| k.0 == high_first_sort_key) { - append_res(Op::Insert, row.clone()); - self.middle.insert(cache_key, row); + self.middle.insert(cache_key.clone(), row.clone()); + staging.insert(cache_key, row); } } } @@ -702,8 +663,7 @@ pub trait AppendOnlyTopNCacheTrait { &mut self, cache_key: CacheKey, row_ref: RowRef<'_>, - res_ops: &mut Vec, - res_rows: &mut Vec, + staging: &mut TopNStaging, managed_state: &mut ManagedTopNState, row_deserializer: &RowDeserializer, ) -> StreamExecutorResult<()>; @@ -714,8 +674,7 @@ impl AppendOnlyTopNCacheTrait for TopNCache { &mut self, cache_key: CacheKey, row_ref: RowRef<'_>, - res_ops: &mut Vec, - res_rows: &mut Vec, + staging: &mut TopNStaging, managed_state: &mut ManagedTopNState, row_deserializer: &RowDeserializer, ) -> StreamExecutorResult<()> { @@ -724,11 +683,6 @@ impl AppendOnlyTopNCacheTrait for TopNCache { } managed_state.insert(row_ref); - let mut append_res = |op: Op, row: CompactedRow| { - res_ops.push(op); - res_rows.push(row); - }; - // insert input row into corresponding cache according to its sort key let mut to_insert = (cache_key, row_ref.into()); @@ -754,8 +708,8 @@ impl AppendOnlyTopNCacheTrait for TopNCache { // try insert into middle cache if !self.middle_is_full() { - self.middle.insert(to_insert.0, to_insert.1.clone()); - append_res(Op::Insert, to_insert.1); + self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); + staging.insert(to_insert.0, to_insert.1); return Ok(()); } @@ -763,12 +717,11 @@ impl AppendOnlyTopNCacheTrait for TopNCache { // the largest row in `cache.middle` needs to be removed. let middle_last = self.middle.pop_last().unwrap(); debug_assert!(to_insert.0 < middle_last.0); - - append_res(Op::Delete, middle_last.1.clone()); managed_state.delete(row_deserializer.deserialize(middle_last.1.row.as_ref())?); + staging.delete(middle_last.0, middle_last.1); - append_res(Op::Insert, to_insert.1.clone()); - self.middle.insert(to_insert.0, to_insert.1); + self.middle.insert(to_insert.0.clone(), to_insert.1.clone()); + staging.insert(to_insert.0, to_insert.1); // Unlike normal topN, append only topN does not use the high part of the cache. 
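For context, the staging structure introduced by this patch folds a Delete followed by an Insert of the same cache key into a single update pair, and drops changes that cancel out within one input chunk. The following is a minimal standalone sketch of that folding logic only, using plain u64/String stand-ins for `CacheKey` and `CompactedRow` (those stand-ins are assumptions for illustration, not the RisingWave types); it mirrors the `TopNStaging::insert`/`delete`/`into_changes` behavior defined later in this file's diff but is not the actual implementation.

use std::collections::BTreeMap;

#[derive(Debug, PartialEq)]
enum Change {
    Insert(String),
    Delete(String),
    Update { old: String, new: String },
}

#[derive(Default)]
struct Staging {
    to_delete: BTreeMap<u64, String>,
    to_insert: BTreeMap<u64, String>,
    to_update: BTreeMap<u64, (String, String)>,
}

impl Staging {
    // Called when a row is added to the middle cache.
    fn insert(&mut self, key: u64, row: String) {
        if let Some(old) = self.to_delete.remove(&key) {
            if old != row {
                self.to_update.insert(key, (old, row));
            }
            // If the identical row comes back, the delete and insert cancel out.
        } else {
            self.to_insert.insert(key, row);
        }
    }

    // Called when a row is removed from the middle cache.
    fn delete(&mut self, key: u64, row: String) {
        if self.to_insert.remove(&key).is_some() {
            // An insert staged earlier in the same chunk is simply dropped.
        } else if let Some((old, _new)) = self.to_update.remove(&key) {
            self.to_delete.insert(key, old);
        } else {
            self.to_delete.insert(key, row);
        }
    }

    // Emit updates first, then deletes, then inserts.
    fn into_changes(self) -> Vec<Change> {
        self.to_update
            .into_values()
            .map(|(old, new)| Change::Update { old, new })
            .chain(self.to_delete.into_values().map(Change::Delete))
            .chain(self.to_insert.into_values().map(Change::Insert))
            .collect()
    }
}

fn main() {
    let mut staging = Staging::default();
    // Key 1: evicted from the middle cache, then re-filled with a different row.
    staging.delete(1, "old".to_string());
    staging.insert(1, "new".to_string());
    // Key 2: inserted and removed within the same chunk -> no output at all.
    staging.insert(2, "tmp".to_string());
    staging.delete(2, "tmp".to_string());
    assert_eq!(
        staging.into_changes(),
        vec![Change::Update { old: "old".to_string(), new: "new".to_string() }]
    );
}

The sketch keeps the same emission order as the patch (updates, then deletes, then inserts), so deletes always precede inserts and the LIMIT constraint is never transiently violated downstream.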
@@ -781,8 +734,7 @@ impl AppendOnlyTopNCacheTrait for TopNCache { &mut self, cache_key: CacheKey, row_ref: RowRef<'_>, - res_ops: &mut Vec, - res_rows: &mut Vec, + staging: &mut TopNStaging, managed_state: &mut ManagedTopNState, row_deserializer: &RowDeserializer, ) -> StreamExecutorResult<()> { @@ -791,11 +743,6 @@ impl AppendOnlyTopNCacheTrait for TopNCache { "Offset is not supported yet for WITH TIES, so low cache should be empty" ); - let mut append_res = |op: Op, row: CompactedRow| { - res_ops.push(op); - res_rows.push(row); - }; - let to_insert = (cache_key, row_ref); // try insert into middle cache @@ -803,8 +750,8 @@ impl AppendOnlyTopNCacheTrait for TopNCache { if !self.middle_is_full() { managed_state.insert(to_insert.1); let row: CompactedRow = to_insert.1.into(); - self.middle.insert(to_insert.0, row.clone()); - append_res(Op::Insert, row); + self.middle.insert(to_insert.0.clone(), row.clone()); + staging.insert(to_insert.0, row); return Ok(()); } @@ -833,25 +780,24 @@ impl AppendOnlyTopNCacheTrait for TopNCache { && middle_last.key().0 == middle_last_sort_key { let middle_last = middle_last.remove_entry(); - append_res(Op::Delete, middle_last.1.clone()); - // we don't need to maintain the high part so just delete it from state table managed_state .delete(row_deserializer.deserialize(middle_last.1.row.as_ref())?); + staging.delete(middle_last.0, middle_last.1); } } managed_state.insert(to_insert.1); let row: CompactedRow = to_insert.1.into(); - append_res(Op::Insert, row.clone()); - self.middle.insert(to_insert.0, row); + self.middle.insert(to_insert.0.clone(), row.clone()); + staging.insert(to_insert.0, row); } Ordering::Equal => { // the row is in middle and is a tie of the last row managed_state.insert(to_insert.1); let row: CompactedRow = to_insert.1.into(); - append_res(Op::Insert, row.clone()); - self.middle.insert(to_insert.0, row); + self.middle.insert(to_insert.0.clone(), row.clone()); + staging.insert(to_insert.0, row); } Ordering::Greater => { // the row is in high, do nothing @@ -861,3 +807,92 @@ impl AppendOnlyTopNCacheTrait for TopNCache { Ok(()) } } + +/// Used to build diff between before and after applying an input chunk, for `TopNCache` (of one group). +/// It should be maintained when an entry is inserted or deleted from the `middle` cache. +#[derive(Debug, Default)] +pub struct TopNStaging { + to_delete: BTreeMap, + to_insert: BTreeMap, + to_update: BTreeMap, +} + +impl TopNStaging { + pub fn new() -> Self { + Self::default() + } + + /// Insert a row into the staging changes. This method must be called when a row is + /// added to the `middle` cache. + fn insert(&mut self, cache_key: CacheKey, row: CompactedRow) { + if let Some(old_row) = self.to_delete.remove(&cache_key) { + if old_row != row { + self.to_update.insert(cache_key, (old_row, row)); + } + } else { + self.to_insert.insert(cache_key, row); + } + } + + /// Delete a row from the staging changes. This method must be called when a row is + /// removed from the `middle` cache. + fn delete(&mut self, cache_key: CacheKey, row: CompactedRow) { + if self.to_insert.remove(&cache_key).is_some() { + // do nothing more + } else if let Some((old_row, _)) = self.to_update.remove(&cache_key) { + self.to_delete.insert(cache_key, old_row); + } else { + self.to_delete.insert(cache_key, row); + } + } + + /// Get the count of effective changes in the staging. + pub fn len(&self) -> usize { + self.to_delete.len() + self.to_insert.len() + self.to_update.len() + } + + /// Check if the staging is empty. 
+ pub fn is_empty(&self) -> bool { + self.to_delete.is_empty() && self.to_insert.is_empty() && self.to_update.is_empty() + } + + /// Iterate over the changes in the staging. + pub fn into_changes(self) -> impl Iterator { + #[cfg(debug_assertions)] + { + let keys = self + .to_delete + .keys() + .chain(self.to_insert.keys()) + .chain(self.to_update.keys()) + .unique() + .count(); + assert_eq!( + keys, + self.to_delete.len() + self.to_insert.len() + self.to_update.len(), + "should not have duplicate keys with different operations", + ); + } + + // We expect one `CacheKey` to appear at most once in the staging, and, the order of + // the outputs of `TopN` doesn't really matter, so we can simply chain the three maps. + // Although the output order is not important, we still ensure that `Delete`s are emitted + // before `Insert`s, so that we can avoid temporary violation of the `LIMIT` constraint. + self.to_update + .into_values() + .flat_map(|(old_row, new_row)| { + [(Op::UpdateDelete, old_row), (Op::UpdateInsert, new_row)] + }) + .chain(self.to_delete.into_values().map(|row| (Op::Delete, row))) + .chain(self.to_insert.into_values().map(|row| (Op::Insert, row))) + } + + /// Iterate over the changes in the staging, and deserialize the rows. + pub fn into_deserialized_changes( + self, + deserializer: &RowDeserializer, + ) -> impl Iterator> + '_ { + self.into_changes() + .map(|(op, row)| Ok((op, deserializer.deserialize(row.row.as_ref())?))) + } +} diff --git a/src/stream/src/executor/top_n/top_n_plain.rs b/src/stream/src/executor/top_n/top_n_plain.rs index ebddd801a579..73468cd7dbe6 100644 --- a/src/stream/src/executor/top_n/top_n_plain.rs +++ b/src/stream/src/executor/top_n/top_n_plain.rs @@ -13,10 +13,11 @@ // limitations under the License. use risingwave_common::array::Op; -use risingwave_common::row::RowExt; +use risingwave_common::row::{RowDeserializer, RowExt}; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::sort_util::ColumnOrder; +use super::top_n_cache::TopNStaging; use super::utils::*; use super::{ManagedTopNState, TopNCache, TopNCacheTrait}; use crate::executor::prelude::*; @@ -126,9 +127,11 @@ impl TopNExecutorBase for InnerTopNExecuto where TopNCache: TopNCacheTrait, { - async fn apply_chunk(&mut self, chunk: StreamChunk) -> StreamExecutorResult { - let mut res_ops = Vec::with_capacity(self.cache.limit); - let mut res_rows = Vec::with_capacity(self.cache.limit); + async fn apply_chunk( + &mut self, + chunk: StreamChunk, + ) -> StreamExecutorResult> { + let mut staging = TopNStaging::new(); // apply the chunk to state table for (op, row_ref) in chunk.rows() { @@ -138,8 +141,7 @@ where Op::Insert | Op::UpdateInsert => { // First insert input row to state store self.managed_state.insert(row_ref); - self.cache - .insert(cache_key, row_ref, &mut res_ops, &mut res_rows) + self.cache.insert(cache_key, row_ref, &mut staging) } Op::Delete | Op::UpdateDelete => { @@ -151,14 +153,24 @@ where &mut self.managed_state, cache_key, row_ref, - &mut res_ops, - &mut res_rows, + &mut staging, ) .await? 
} } } - generate_output(res_rows, res_ops, &self.schema) + + let data_types = self.schema.data_types(); + let deserializer = RowDeserializer::new(data_types.clone()); + if staging.is_empty() { + return Ok(None); + } + let mut chunk_builder = StreamChunkBuilder::unlimited(data_types, Some(staging.len())); + for res in staging.into_deserialized_changes(&deserializer) { + let (op, row) = res?; + let _none = chunk_builder.append_row(op, row); + } + Ok(chunk_builder.take()) } async fn flush_data(&mut self, epoch: EpochPair) -> StreamExecutorResult<()> { @@ -184,7 +196,6 @@ where #[cfg(test)] mod tests { - use assert_matches::assert_matches; use risingwave_common::array::stream_chunk::StreamChunkTestExt; use risingwave_common::catalog::{Field, Schema}; use risingwave_common::types::DataType; @@ -200,6 +211,7 @@ mod tests { use risingwave_common::util::epoch::test_epoch; use super::*; + use crate::executor::test_utils::StreamExecutorTestExt; fn create_stream_chunks() -> Vec { let chunk1 = StreamChunk::from_pretty( @@ -287,7 +299,7 @@ mod tests { .await; let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::<_, false>::new( + let top_n = TopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -297,49 +309,38 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 10 3 + 9 4 + 8 5" ) + .sort_rows(), ); // Barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - + 7 6 - - 7 6 - - 8 5 - + 8 5 - 8 5 + 11 8" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); // (8, 9, 10, 11, 12, 13, 14) assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 8 5 @@ -347,29 +348,24 @@ mod tests { + 13 11 + 14 12" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; // (10, 12, 13, 14) - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 8 5 - 9 4 - 11 8" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } #[tokio::test] @@ -382,7 +378,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::<_, false>::new( + let top_n = TopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -392,76 +388,58 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = 
top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 1 0 + 2 1 + 3 2 - + 10 3 - - 10 3 - + 9 4 - - 9 4 + 8 5" ) + .sort_rows(), ); // now () -> (1, 2, 3, 8) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - - 8 5 + 7 6 - 3 2 - + 8 5 - 1 0 - + 9 4 - - 9 4 + 5 7 - 2 1 + 9 4" ) + .sort_rows(), ); // (5, 7, 8, 9) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 9 4 + 6 9" ) + .sort_rows(), ); // (5, 6, 7, 8) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 5 7 @@ -469,13 +447,11 @@ mod tests { - 6 9 + 10 3" ) + .sort_rows(), ); // (7, 8, 9, 10) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } // Should have the same result as above, since there are no duplicate sort keys. @@ -489,7 +465,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::<_, true>::new( + let top_n = TopNExecutor::<_, true>::new( source, ActorContext::for_test(0), schema, @@ -499,76 +475,58 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 1 0 + 2 1 + 3 2 - + 10 3 - - 10 3 - + 9 4 - - 9 4 + 8 5" ) + .sort_rows(), ); // now () -> (1, 2, 3, 8) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - - 8 5 + 7 6 - 3 2 - + 8 5 - 1 0 - + 9 4 - - 9 4 + 5 7 - 2 1 + 9 4" ) + .sort_rows(), ); // (5, 7, 8, 9) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 9 4 + 6 9" ) + .sort_rows(), ); // (5, 6, 7, 8) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), 
StreamChunk::from_pretty( " I I - 5 7 @@ -576,13 +534,11 @@ mod tests { - 6 9 + 10 3" ) + .sort_rows(), ); // (7, 8, 9, 10) // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } #[tokio::test] @@ -595,7 +551,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::<_, false>::new( + let top_n = TopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -605,60 +561,46 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 10 3 + 9 4 + 8 5" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - + 7 6 - - 7 6 - - 8 5 - + 8 5 - 8 5 + 11 8" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - + 8 5" + + 8 5" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 8 5 @@ -668,12 +610,10 @@ mod tests { - 11 8 + 14 12" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } } @@ -684,6 +624,7 @@ mod tests { use super::*; use crate::executor::test_utils::top_n_executor::create_in_memory_state_table_from_state_store; + use crate::executor::test_utils::StreamExecutorTestExt; fn create_source_new() -> Executor { let mut chunks = vec![ StreamChunk::from_pretty( @@ -812,7 +753,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::<_, false>::new( + let top_n = TopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -822,55 +763,42 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); - // should be empty assert_eq!( - *res.as_chunk().unwrap(), - StreamChunk::from_pretty(" I I I I") - ); - - let res = top_n_executor.next().await.unwrap().unwrap(); - assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I I I - + 5 1 4 1002 - " + + 5 1 4 1002" ) + .sort_rows(), ); - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + 
top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I I I - + 1 9 1 1003 - + 9 8 1 1004 - - 9 8 1 1004 - + 1 1 4 1001", - ), + + 1 9 1 1003 + + 1 1 4 1001", + ) + .sort_rows(), ); - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I I I - - 5 1 4 1002 - + 1 0 2 1006", + - 5 1 4 1002 + + 1 0 2 1006", ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } #[tokio::test] @@ -890,7 +818,7 @@ mod tests { .await; let source = create_source_new_before_recovery(); let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::<_, false>::new( + let top_n = TopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -900,33 +828,22 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); - // should be empty assert_eq!( - *res.as_chunk().unwrap(), - StreamChunk::from_pretty(" I I I I") - ); - - let res = top_n_executor.next().await.unwrap().unwrap(); - assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I I I - + 5 1 4 1002 - " + + 5 1 4 1002" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; let state_table = create_in_memory_state_table_from_state_store( &[ @@ -944,7 +861,7 @@ mod tests { // recovery let source = create_source_new_after_recovery(); let schema = source.schema().clone(); - let top_n_executor_after_recovery = TopNExecutor::<_, false>::new( + let top_n_after_recovery = TopNExecutor::<_, false>::new( source, ActorContext::for_test(0), schema, @@ -954,41 +871,33 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor_after_recovery.boxed().execute(); + let mut top_n = top_n_after_recovery.boxed().execute(); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I I I - + 1 9 1 1003 - + 9 8 1 1004 - - 9 8 1 1004 - + 1 1 4 1001", - ), + + 1 9 1 1003 + + 1 1 4 1001", + ) + .sort_rows(), ); - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I I I - - 5 1 4 1002 - + 1 0 2 1006", + - 5 1 4 1002 + + 1 0 2 1006", ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } } @@ -999,6 +908,7 @@ mod tests { use super::*; use crate::executor::test_utils::top_n_executor::create_in_memory_state_table_from_state_store; + use crate::executor::test_utils::StreamExecutorTestExt; fn create_source() -> Executor { let mut chunks = vec![ @@ -1070,7 +980,7 @@ mod tests { ) .await; let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::new_with_ties_for_test( + let top_n = TopNExecutor::new_with_ties_for_test( 
source, ActorContext::for_test(0), schema, @@ -1080,64 +990,56 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 1 0 + 2 1 + 3 2" ) + .sort_rows(), ); - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - + 3 6 - + 3 7 - - 3 7 - - 3 6 - 3 2 + 1 8 + 2 9" ) + .sort_rows(), ); - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 1 0" ) + .sort_rows(), ); // High cache has only 2 capacity, but we need to trigger 3 inserts here! - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 1 8 + 3 2 + 3 6 - + 3 7 - " + + 3 7" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } fn create_source_before_recovery() -> Executor { @@ -1149,8 +1051,7 @@ mod tests { + 3 2 + 10 3 + 9 4 - + 8 5 - ", + + 8 5", ), StreamChunk::from_pretty( " I I @@ -1214,7 +1115,7 @@ mod tests { .await; let source = create_source_before_recovery(); let schema = source.schema().clone(); - let top_n_executor = TopNExecutor::new_with_ties_for_test( + let top_n = TopNExecutor::new_with_ties_for_test( source, ActorContext::for_test(0), schema, @@ -1224,41 +1125,34 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor.boxed().execute(); + let mut top_n = top_n.boxed().execute(); // consume the init barrier - top_n_executor.next().await.unwrap().unwrap(); - let res = top_n_executor.next().await.unwrap().unwrap(); + top_n.expect_barrier().await; assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I + 1 0 + 2 1 + 3 2" ) + .sort_rows(), ); - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - + 3 6 - + 3 7 - - 3 7 - - 3 6 - 3 2 + 1 8 + 2 9" ) + .sort_rows(), ); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; let state_table = create_in_memory_state_table_from_state_store( &[DataType::Int64, DataType::Int64], @@ -1271,7 +1165,7 @@ mod tests { // recovery let source = create_source_after_recovery(); let schema = source.schema().clone(); - let top_n_executor_after_recovery = TopNExecutor::new_with_ties_for_test( + let top_n_after_recovery = TopNExecutor::new_with_ties_for_test( source, ActorContext::for_test(0), schema, @@ -1281,42 +1175,34 @@ mod tests { state_table, ) .unwrap(); - let mut top_n_executor = top_n_executor_after_recovery.boxed().execute(); + let mut top_n = top_n_after_recovery.boxed().execute(); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; - let res = top_n_executor.next().await.unwrap().unwrap(); 
assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 1 0" ) + .sort_rows(), ); // High cache has only 2 capacity, but we need to trigger 3 inserts here! - let res = top_n_executor.next().await.unwrap().unwrap(); assert_eq!( - *res.as_chunk().unwrap(), + top_n.expect_chunk().await.sort_rows(), StreamChunk::from_pretty( " I I - 1 8 + 3 2 + 3 6 - + 3 7 - " + + 3 7" ) + .sort_rows(), ); - println!("hello"); // barrier - assert_matches!( - top_n_executor.next().await.unwrap().unwrap(), - Message::Barrier(_) - ); + top_n.expect_barrier().await; } } } diff --git a/src/stream/src/executor/top_n/top_n_state.rs b/src/stream/src/executor/top_n/top_n_state.rs index 919197f48ed3..db32f3500fcc 100644 --- a/src/stream/src/executor/top_n/top_n_state.rs +++ b/src/stream/src/executor/top_n/top_n_state.rs @@ -327,7 +327,7 @@ mod tests { use super::*; use crate::executor::test_utils::top_n_executor::create_in_memory_state_table; - use crate::executor::top_n::top_n_cache::TopNCacheTrait; + use crate::executor::top_n::top_n_cache::{TopNCacheTrait, TopNStaging}; use crate::executor::top_n::{create_cache_key_serde, NO_GROUP_KEY}; use crate::row_nonnull; @@ -496,15 +496,14 @@ mod tests { let row1_bytes = serialize_pk_to_cache_key(row1.clone(), &cache_key_serde); let mut cache = TopNCache::::new(0, 1, data_types); - cache.insert(row1_bytes.clone(), row1.clone(), &mut vec![], &mut vec![]); + cache.insert(row1_bytes.clone(), row1.clone(), &mut TopNStaging::new()); cache .delete( NO_GROUP_KEY, &mut managed_state, row1_bytes, row1, - &mut vec![], - &mut vec![], + &mut TopNStaging::new(), ) .await .unwrap(); diff --git a/src/stream/src/executor/top_n/utils.rs b/src/stream/src/executor/top_n/utils.rs index 761556319e6b..174001828c84 100644 --- a/src/stream/src/executor/top_n/utils.rs +++ b/src/stream/src/executor/top_n/utils.rs @@ -14,11 +14,7 @@ use std::future::Future; -use itertools::Itertools; -use risingwave_common::array::Op; use risingwave_common::bitmap::Bitmap; -use risingwave_common::row::{CompactedRow, RowDeserializer}; -use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::row_serde::OrderedRowSerde; use risingwave_common::util::sort_util::ColumnOrder; @@ -28,10 +24,12 @@ use crate::executor::prelude::*; pub trait TopNExecutorBase: Send + 'static { /// Apply the chunk to the dirty state and get the diffs. + /// TODO(rc): There can be a 2 times amplification in terms of the chunk size, so we may need to + /// allow `apply_chunk` return a stream of chunks. Motivation is not quite strong though. fn apply_chunk( &mut self, chunk: StreamChunk, - ) -> impl Future> + Send; + ) -> impl Future>> + Send; /// Flush the buffered chunk to the storage backend. fn flush_data( @@ -102,7 +100,9 @@ where } } Message::Chunk(chunk) => { - yield Message::Chunk(self.inner.apply_chunk(chunk).await?); + if let Some(output_chunk) = self.inner.apply_chunk(chunk).await? 
{ + yield Message::Chunk(output_chunk); + } self.inner.try_flush_data().await?; } Message::Barrier(barrier) => { @@ -120,33 +120,6 @@ where } } -pub fn generate_output( - new_rows: Vec, - new_ops: Vec, - schema: &Schema, -) -> StreamExecutorResult { - if !new_rows.is_empty() { - let mut data_chunk_builder = DataChunkBuilder::new(schema.data_types(), new_rows.len() + 1); - let row_deserializer = RowDeserializer::new(schema.data_types()); - for compacted_row in new_rows { - let res = data_chunk_builder - .append_one_row(row_deserializer.deserialize(compacted_row.row.as_ref())?); - debug_assert!(res.is_none()); - } - // since `new_rows` is not empty, we unwrap directly - let new_data_chunk = data_chunk_builder.consume_all().unwrap(); - let new_stream_chunk = StreamChunk::new(new_ops, new_data_chunk.columns().to_vec()); - Ok(new_stream_chunk) - } else { - let columns = schema - .create_array_builders(0) - .into_iter() - .map(|x| x.finish().into()) - .collect_vec(); - Ok(StreamChunk::new(vec![], columns)) - } -} - /// For a given pk (Row), it can be split into `order_key` and `additional_pk` according to /// `order_by_len`, and the two split parts are serialized separately. pub fn serialize_pk_to_cache_key(pk: impl Row, cache_key_serde: &CacheKeySerde) -> CacheKey { diff --git a/src/stream/src/from_proto/source/trad_source.rs b/src/stream/src/from_proto/source/trad_source.rs index 98746a672e43..4d4786eea3bf 100644 --- a/src/stream/src/from_proto/source/trad_source.rs +++ b/src/stream/src/from_proto/source/trad_source.rs @@ -232,7 +232,7 @@ impl ExecutorBuilder for SourceExecutorBuilder { barrier_receiver, system_params, source.rate_limit, - is_shared, + is_shared && !source.with_properties.is_cdc_connector(), ) .boxed() } diff --git a/src/stream/src/task/barrier_manager.rs b/src/stream/src/task/barrier_manager.rs index 155320281421..fec0d74ab6d5 100644 --- a/src/stream/src/task/barrier_manager.rs +++ b/src/stream/src/task/barrier_manager.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashSet}; use std::fmt::Display; use std::future::pending; use std::iter::once; @@ -20,10 +20,13 @@ use std::sync::Arc; use std::time::Duration; use anyhow::anyhow; -use futures::stream::BoxStream; -use futures::StreamExt; +use await_tree::InstrumentAwait; +use futures::future::BoxFuture; +use futures::stream::{BoxStream, FuturesOrdered}; +use futures::{FutureExt, StreamExt, TryFutureExt}; use itertools::Itertools; use risingwave_common::error::tonic::extra::Score; +use risingwave_pb::stream_plan::barrier::BarrierKind; use risingwave_pb::stream_service::barrier_complete_response::{ PbCreateMviewProgress, PbLocalSstableInfo, }; @@ -51,7 +54,6 @@ use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::runtime::BackgroundShutdownRuntime; use risingwave_hummock_sdk::table_stats::to_prost_table_stats_map; use risingwave_hummock_sdk::{LocalSstableInfo, SyncResult}; -use risingwave_pb::stream_plan::barrier::BarrierKind; use risingwave_pb::stream_service::streaming_control_stream_request::{ InitRequest, InitialPartialGraph, Request, }; @@ -265,6 +267,9 @@ pub(super) struct LocalBarrierWorker { /// Current barrier collection state. pub(super) state: ManagedBarrierState, + /// Futures will be finished in the order of epoch in ascending order. 
+ await_epoch_completed_futures: FuturesOrdered, + control_stream_handle: ControlStreamHandle, pub(super) actor_manager: Arc, @@ -296,6 +301,7 @@ impl LocalBarrierWorker { shared_context.clone(), initial_partial_graphs, ), + await_epoch_completed_futures: Default::default(), control_stream_handle: ControlStreamHandle::empty(), actor_manager, current_shared_context: shared_context, @@ -316,10 +322,17 @@ impl LocalBarrierWorker { loop { select! { biased; - (partial_graph_id, completed_epoch) = self.state.next_completed_epoch() => { - let result = self.on_epoch_completed(partial_graph_id, completed_epoch); - if let Err(err) = result { - self.notify_other_failure(err, "failed to complete epoch").await; + (partial_graph_id, barrier) = self.state.next_collected_epoch() => { + self.complete_barrier(partial_graph_id, barrier.epoch.prev); + } + (partial_graph_id, barrier, result) = rw_futures_util::pending_on_none(self.await_epoch_completed_futures.next()) => { + match result { + Ok(result) => { + self.on_epoch_completed(partial_graph_id, barrier.epoch.prev, result); + } + Err(err) => { + self.notify_other_failure(err, "failed to complete epoch").await; + } } }, event = self.barrier_event_rx.recv() => { @@ -454,23 +467,139 @@ impl LocalBarrierWorker { } } -// event handler +mod await_epoch_completed_future { + use std::future::Future; + + use futures::future::BoxFuture; + use futures::FutureExt; + use risingwave_hummock_sdk::SyncResult; + use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; + + use crate::error::StreamResult; + use crate::executor::Barrier; + use crate::task::{await_tree_key, BarrierCompleteResult, PartialGraphId}; + + pub(super) type AwaitEpochCompletedFuture = impl Future)> + + 'static; + + pub(super) fn instrument_complete_barrier_future( + partial_graph_id: PartialGraphId, + complete_barrier_future: Option>>, + barrier: Barrier, + barrier_await_tree_reg: Option<&await_tree::Registry>, + create_mview_progress: Vec, + ) -> AwaitEpochCompletedFuture { + let prev_epoch = barrier.epoch.prev; + let future = async move { + if let Some(future) = complete_barrier_future { + let result = future.await; + result.map(Some) + } else { + Ok(None) + } + } + .map(move |result| { + ( + partial_graph_id, + barrier, + result.map(|sync_result| BarrierCompleteResult { + sync_result, + create_mview_progress, + }), + ) + }); + if let Some(reg) = barrier_await_tree_reg { + reg.register( + await_tree_key::BarrierAwait { prev_epoch }, + format!("SyncEpoch({})", prev_epoch), + ) + .instrument(future) + .left_future() + } else { + future.right_future() + } + } +} + +use await_epoch_completed_future::*; +use risingwave_common::catalog::TableId; +use risingwave_storage::StateStoreImpl; + +fn sync_epoch( + state_store: &StateStoreImpl, + streaming_metrics: &StreamingMetrics, + prev_epoch: u64, + table_ids: HashSet, +) -> BoxFuture<'static, StreamResult> { + let timer = streaming_metrics.barrier_sync_latency.start_timer(); + let hummock = state_store.as_hummock().cloned(); + let future = async move { + if let Some(hummock) = hummock { + hummock.sync(vec![(prev_epoch, table_ids)]).await + } else { + Ok(SyncResult::default()) + } + }; + future + .instrument_await(format!("sync_epoch (epoch {})", prev_epoch)) + .inspect_ok(move |_| { + timer.observe_duration(); + }) + .map_err(move |e| { + tracing::error!( + prev_epoch, + error = %e.as_report(), + "Failed to sync state store", + ); + e.into() + }) + .boxed() +} + impl LocalBarrierWorker { + fn complete_barrier(&mut self, 
partial_graph_id: PartialGraphId, prev_epoch: u64) { + { + let (barrier, table_ids, create_mview_progress) = self + .state + .pop_barrier_to_complete(partial_graph_id, prev_epoch); + + let complete_barrier_future = match &barrier.kind { + BarrierKind::Unspecified => unreachable!(), + BarrierKind::Initial => { + tracing::info!( + epoch = prev_epoch, + "ignore sealing data for the first barrier" + ); + tracing::info!(?prev_epoch, "ignored syncing data for the first barrier"); + None + } + BarrierKind::Barrier => None, + BarrierKind::Checkpoint => Some(sync_epoch( + &self.actor_manager.env.state_store(), + &self.actor_manager.streaming_metrics, + prev_epoch, + table_ids.expect("should be Some on BarrierKind::Checkpoint"), + )), + }; + + self.await_epoch_completed_futures.push_back({ + instrument_complete_barrier_future( + partial_graph_id, + complete_barrier_future, + barrier, + self.actor_manager.await_tree_reg.as_ref(), + create_mview_progress, + ) + }); + } + } + fn on_epoch_completed( &mut self, partial_graph_id: PartialGraphId, epoch: u64, - ) -> StreamResult<()> { - let state = self - .state - .graph_states - .get_mut(&partial_graph_id) - .expect("should exist"); - let result = state - .pop_completed_epoch(epoch) - .expect("should exist") - .expect("should have completed")?; - + result: BarrierCompleteResult, + ) { let BarrierCompleteResult { create_mview_progress, sync_result, @@ -524,7 +653,6 @@ impl LocalBarrierWorker { }; self.control_stream_handle.send_response(result); - Ok(()) } /// Broadcast a barrier to all senders. Save a receiver which will get notified when this @@ -538,11 +666,6 @@ impl LocalBarrierWorker { barrier: &Barrier, request: InjectBarrierRequest, ) -> StreamResult<()> { - if barrier.kind == BarrierKind::Initial { - self.actor_manager - .watermark_epoch - .store(barrier.epoch.curr, std::sync::atomic::Ordering::SeqCst); - } debug!( target: "events::stream::barrier::manager::send", "send barrier {:?}, actor_ids_to_collect = {:?}", diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index cd6bb924f478..bd5c92570f13 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -12,58 +12,39 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::assert_matches::assert_matches; use std::cell::LazyCell; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::{Debug, Display, Formatter}; use std::future::{pending, poll_fn, Future}; use std::mem::replace; use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::task::Poll; -use anyhow::anyhow; -use await_tree::InstrumentAwait; -use futures::future::BoxFuture; -use futures::stream::FuturesOrdered; -use futures::{FutureExt, StreamExt, TryFutureExt}; use prometheus::HistogramTimer; use risingwave_common::catalog::TableId; -use risingwave_common::must_match; use risingwave_common::util::epoch::EpochPair; -use risingwave_hummock_sdk::SyncResult; use risingwave_pb::stream_plan::barrier::BarrierKind; -use risingwave_storage::{dispatch_state_store, StateStore, StateStoreImpl}; -use thiserror_ext::AsReport; +use risingwave_storage::StateStoreImpl; use tokio::sync::mpsc; use tokio::task::JoinHandle; use super::progress::BackfillState; -use super::BarrierCompleteResult; use crate::error::{StreamError, StreamResult}; use crate::executor::monitor::StreamingMetrics; -use crate::executor::{Barrier, Mutation}; +use crate::executor::Barrier; use crate::task::{ActorId, PartialGraphId, SharedContext, StreamActorManager}; struct IssuedState { - pub mutation: Option>, /// Actor ids remaining to be collected. pub remaining_actors: BTreeSet, pub barrier_inflight_latency: HistogramTimer, - - /// Only be `Some(_)` when `kind` is `Checkpoint` - pub table_ids: Option>, - - pub kind: BarrierKind, } impl Debug for IssuedState { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("IssuedState") - .field("mutation", &self.mutation) .field("remaining_actors", &self.remaining_actors) - .field("table_ids", &self.table_ids) - .field("kind", &self.kind) .finish() } } @@ -75,100 +56,23 @@ enum ManagedBarrierStateInner { Issued(IssuedState), /// The barrier has been collected by all remaining actors - AllCollected, - - /// The barrier has been completed, which means the barrier has been collected by all actors and - /// synced in state store - Completed(StreamResult), + AllCollected(Vec), } #[derive(Debug)] -pub(super) struct BarrierState { +struct BarrierState { barrier: Barrier, + /// Only be `Some(_)` when `barrier.kind` is `Checkpoint` + table_ids: Option>, inner: ManagedBarrierStateInner, } -mod await_epoch_completed_future { - use std::future::Future; - - use futures::future::BoxFuture; - use futures::FutureExt; - use risingwave_hummock_sdk::SyncResult; - use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; - - use crate::error::StreamResult; - use crate::executor::Barrier; - use crate::task::{await_tree_key, BarrierCompleteResult}; - - pub(super) type AwaitEpochCompletedFuture = - impl Future)> + 'static; - - pub(super) fn instrument_complete_barrier_future( - complete_barrier_future: Option>>, - barrier: Barrier, - barrier_await_tree_reg: Option<&await_tree::Registry>, - create_mview_progress: Vec, - ) -> AwaitEpochCompletedFuture { - let prev_epoch = barrier.epoch.prev; - let future = async move { - if let Some(future) = complete_barrier_future { - let result = future.await; - result.map(Some) - } else { - Ok(None) - } - } - .map(move |result| { - ( - barrier, - result.map(|sync_result| BarrierCompleteResult { - sync_result, - create_mview_progress, - }), - ) - }); - if let Some(reg) = barrier_await_tree_reg { - reg.register( - await_tree_key::BarrierAwait { prev_epoch }, - format!("SyncEpoch({})", prev_epoch), - ) - 
.instrument(future) - .left_future() - } else { - future.right_future() - } - } -} - -use await_epoch_completed_future::*; +use risingwave_common::must_match; use risingwave_pb::stream_plan::SubscriptionUpstreamInfo; +use risingwave_pb::stream_service::barrier_complete_response::PbCreateMviewProgress; use risingwave_pb::stream_service::streaming_control_stream_request::InitialPartialGraph; use risingwave_pb::stream_service::InjectBarrierRequest; -fn sync_epoch( - state_store: &S, - streaming_metrics: &StreamingMetrics, - prev_epoch: u64, - table_ids: HashSet, -) -> BoxFuture<'static, StreamResult> { - let timer = streaming_metrics.barrier_sync_latency.start_timer(); - let future = state_store.sync(prev_epoch, table_ids); - future - .instrument_await(format!("sync_epoch (epoch {})", prev_epoch)) - .inspect_ok(move |_| { - timer.observe_duration(); - }) - .map_err(move |e| { - tracing::error!( - prev_epoch, - error = %e.as_report(), - "Failed to sync state store", - ); - e.into() - }) - .boxed() -} - pub(super) struct ManagedBarrierStateDebugInfo<'a> { graph_states: &'a HashMap, } @@ -190,7 +94,11 @@ impl Display for &'_ PartialGraphManagedBarrierState { write!(f, "> Epoch {}: ", epoch)?; match &barrier_state.inner { ManagedBarrierStateInner::Issued(state) => { - write!(f, "Issued [{:?}]. Remaining actors: [", state.kind)?; + write!( + f, + "Issued [{:?}]. Remaining actors: [", + barrier_state.barrier.kind + )?; let mut is_prev_epoch_issued = false; if prev_epoch != 0 { let bs = &self.epoch_barrier_state_map[&prev_epoch]; @@ -221,12 +129,9 @@ impl Display for &'_ PartialGraphManagedBarrierState { } write!(f, "]")?; } - ManagedBarrierStateInner::AllCollected => { + ManagedBarrierStateInner::AllCollected(_) => { write!(f, "AllCollected")?; } - ManagedBarrierStateInner::Completed(_) => { - write!(f, "Completed")?; - } } prev_epoch = *epoch; writeln!(f)?; @@ -378,18 +283,12 @@ pub(super) struct PartialGraphManagedBarrierState { /// Record the progress updates of creating mviews for each epoch of concurrent checkpoints. /// /// This is updated by [`super::CreateMviewProgressReporter::update`] and will be reported to meta - /// in [`BarrierCompleteResult`]. + /// in [`crate::task::barrier_manager::BarrierCompleteResult`]. pub(super) create_mview_progress: HashMap>, - pub(super) state_store: StateStoreImpl, - - pub(super) streaming_metrics: Arc, + state_store: StateStoreImpl, - /// Futures will be finished in the order of epoch in ascending order. - await_epoch_completed_futures: FuturesOrdered, - - /// Manages the await-trees of all barriers. 
- barrier_await_tree_reg: Option, + streaming_metrics: Arc, } impl PartialGraphManagedBarrierState { @@ -397,24 +296,17 @@ impl PartialGraphManagedBarrierState { Self::new_inner( actor_manager.env.state_store(), actor_manager.streaming_metrics.clone(), - actor_manager.await_tree_reg.clone(), ) } - fn new_inner( - state_store: StateStoreImpl, - streaming_metrics: Arc, - barrier_await_tree_reg: Option, - ) -> Self { + fn new_inner(state_store: StateStoreImpl, streaming_metrics: Arc) -> Self { Self { epoch_barrier_state_map: Default::default(), prev_barrier_table_ids: None, mv_depended_subscriptions: Default::default(), create_mview_progress: Default::default(), - await_epoch_completed_futures: Default::default(), state_store, streaming_metrics, - barrier_await_tree_reg, } } @@ -423,7 +315,6 @@ impl PartialGraphManagedBarrierState { Self::new_inner( StateStoreImpl::for_test(), Arc::new(StreamingMetrics::unused()), - None, ) } @@ -676,23 +567,26 @@ impl ManagedBarrierState { Ok(()) } - pub(super) fn next_completed_epoch( + pub(super) fn next_collected_epoch( &mut self, - ) -> impl Future + '_ { - poll_fn(|cx| { + ) -> impl Future + '_ { + poll_fn(|_| { + let mut output = None; for (partial_graph_id, graph_state) in &mut self.graph_states { - if let Poll::Ready(barrier) = graph_state.poll_next_completed_barrier(cx) { + if let Some(barrier) = graph_state.may_have_collected_all() { if let Some(actors_to_stop) = barrier.all_stop_actors() { self.current_shared_context.drop_actors(actors_to_stop); } - let partial_graph_id = *partial_graph_id; - return Poll::Ready((partial_graph_id, barrier.epoch.prev)); + output = Some((*partial_graph_id, barrier)); + break; } } - Poll::Pending + output.map(Poll::Ready).unwrap_or(Poll::Pending) }) } +} +impl ManagedBarrierState { pub(super) fn collect(&mut self, actor_id: ActorId, epoch: EpochPair) { let (prev_partial_graph_id, is_finished) = self .actor_states @@ -711,25 +605,34 @@ impl ManagedBarrierState { .expect("should exist"); prev_graph_state.collect(actor_id, epoch); } + + pub(super) fn pop_barrier_to_complete( + &mut self, + partial_graph_id: PartialGraphId, + prev_epoch: u64, + ) -> ( + Barrier, + Option>, + Vec, + ) { + self.graph_states + .get_mut(&partial_graph_id) + .expect("should exist") + .pop_barrier_to_complete(prev_epoch) + } } impl PartialGraphManagedBarrierState { /// This method is called when barrier state is modified in either `Issued` or `Stashed` /// to transform the state to `AllCollected` and start state store `sync` when the barrier /// has been collected from all actors for an `Issued` barrier. - fn may_have_collected_all(&mut self, prev_epoch: u64) { - // Report if there's progress on the earliest in-flight barrier. - if self.epoch_barrier_state_map.keys().next() == Some(&prev_epoch) { - self.streaming_metrics.barrier_manager_progress.inc(); - } - - for (prev_epoch, barrier_state) in &mut self.epoch_barrier_state_map { - let prev_epoch = *prev_epoch; + fn may_have_collected_all(&mut self) -> Option { + for barrier_state in self.epoch_barrier_state_map.values_mut() { match &barrier_state.inner { ManagedBarrierStateInner::Issued(IssuedState { remaining_actors, .. 
}) if remaining_actors.is_empty() => {} - ManagedBarrierStateInner::AllCollected | ManagedBarrierStateInner::Completed(_) => { + ManagedBarrierStateInner::AllCollected(_) => { continue; } ManagedBarrierStateInner::Issued(_) => { @@ -737,65 +640,60 @@ impl PartialGraphManagedBarrierState { } } + self.streaming_metrics.barrier_manager_progress.inc(); + + let create_mview_progress = self + .create_mview_progress + .remove(&barrier_state.barrier.epoch.curr) + .unwrap_or_default() + .into_iter() + .map(|(actor, state)| state.to_pb(actor)) + .collect(); + let prev_state = replace( &mut barrier_state.inner, - ManagedBarrierStateInner::AllCollected, + ManagedBarrierStateInner::AllCollected(create_mview_progress), ); - let (kind, table_ids) = must_match!(prev_state, ManagedBarrierStateInner::Issued(IssuedState { + must_match!(prev_state, ManagedBarrierStateInner::Issued(IssuedState { barrier_inflight_latency: timer, - kind, - table_ids, .. }) => { timer.observe_duration(); - (kind, table_ids) }); - let create_mview_progress = self - .create_mview_progress - .remove(&barrier_state.barrier.epoch.curr) - .unwrap_or_default() - .into_iter() - .map(|(actor, state)| state.to_pb(actor)) - .collect(); + return Some(barrier_state.barrier.clone()); + } + None + } - let complete_barrier_future = match kind { - BarrierKind::Unspecified => unreachable!(), - BarrierKind::Initial => { - tracing::info!( - epoch = prev_epoch, - "ignore sealing data for the first barrier" - ); - tracing::info!(?prev_epoch, "ignored syncing data for the first barrier"); - None - } - BarrierKind::Barrier => None, - BarrierKind::Checkpoint => { - dispatch_state_store!(&self.state_store, state_store, { - Some(sync_epoch( - state_store, - &self.streaming_metrics, - prev_epoch, - table_ids.expect("should be Some on BarrierKind::Checkpoint"), - )) - }) - } - }; + fn pop_barrier_to_complete( + &mut self, + prev_epoch: u64, + ) -> ( + Barrier, + Option>, + Vec, + ) { + let (popped_prev_epoch, barrier_state) = self + .epoch_barrier_state_map + .pop_first() + .expect("should exist"); - let barrier = barrier_state.barrier.clone(); + assert_eq!(prev_epoch, popped_prev_epoch); - self.await_epoch_completed_futures.push_back({ - instrument_complete_barrier_future( - complete_barrier_future, - barrier, - self.barrier_await_tree_reg.as_ref(), - create_mview_progress, - ) - }); - } + let create_mview_progress = must_match!(barrier_state.inner, ManagedBarrierStateInner::AllCollected(create_mview_progress) => { + create_mview_progress + }); + ( + barrier_state.barrier, + barrier_state.table_ids, + create_mview_progress, + ) } +} +impl PartialGraphManagedBarrierState { /// Collect a `barrier` from the actor with `actor_id`. pub(super) fn collect(&mut self, actor_id: ActorId, epoch: EpochPair) { tracing::debug!( @@ -830,7 +728,6 @@ impl PartialGraphManagedBarrierState { actor_id, epoch.curr ); assert_eq!(barrier.epoch.curr, epoch.curr); - self.may_have_collected_all(epoch.prev); } Some(BarrierState { inner, .. }) => { panic!( @@ -914,79 +811,20 @@ impl PartialGraphManagedBarrierState { barrier: barrier.clone(), inner: ManagedBarrierStateInner::Issued(IssuedState { remaining_actors: BTreeSet::from_iter(actor_ids_to_collect), - mutation: barrier.mutation.clone(), barrier_inflight_latency: timer, - kind: barrier.kind, - table_ids, }), + table_ids, }, ); - self.may_have_collected_all(barrier.epoch.prev); - } - - /// Return a future that yields the next completed epoch. The future is cancellation safe. 
- pub(crate) fn poll_next_completed_barrier(&mut self, cx: &mut Context<'_>) -> Poll { - ready!(self.await_epoch_completed_futures.next().poll_unpin(cx)) - .map(|(barrier, result)| { - let state = self - .epoch_barrier_state_map - .get_mut(&barrier.epoch.prev) - .expect("should exist"); - // sanity check on barrier state - assert_matches!(&state.inner, ManagedBarrierStateInner::AllCollected); - state.inner = ManagedBarrierStateInner::Completed(result); - barrier - }) - .map(Poll::Ready) - .unwrap_or(Poll::Pending) - } - - /// Pop the completion result of an completed epoch. - /// Return: - /// - `Err(_)` `prev_epoch` is not an epoch to be collected. - /// - `Ok(None)` when `prev_epoch` exists but has not completed. - /// - `Ok(Some(_))` when `prev_epoch` has completed but not been reclaimed yet. - /// The `BarrierCompleteResult` will be popped out. - pub(crate) fn pop_completed_epoch( - &mut self, - prev_epoch: u64, - ) -> StreamResult>> { - let state = self - .epoch_barrier_state_map - .get(&prev_epoch) - .ok_or_else(|| { - // It's still possible that `collect_complete_receiver` does not contain the target epoch - // when receiving collect_barrier request. Because `collect_complete_receiver` could - // be cleared when CN is under recovering. We should return error rather than panic. - anyhow!( - "barrier collect complete receiver for prev epoch {} not exists", - prev_epoch - ) - })?; - match &state.inner { - ManagedBarrierStateInner::Completed(_) => { - match self - .epoch_barrier_state_map - .remove(&prev_epoch) - .expect("should exists") - .inner - { - ManagedBarrierStateInner::Completed(result) => Ok(Some(result)), - _ => unreachable!(), - } - } - _ => Ok(None), - } } #[cfg(test)] async fn pop_next_completed_epoch(&mut self) -> u64 { - let barrier = poll_fn(|cx| self.poll_next_completed_barrier(cx)).await; - let _ = self - .pop_completed_epoch(barrier.epoch.prev) - .unwrap() - .unwrap(); - barrier.epoch.prev + if let Some(barrier) = self.may_have_collected_all() { + self.pop_barrier_to_complete(barrier.epoch.prev); + return barrier.epoch.prev; + } + pending().await } } diff --git a/src/stream/src/task/barrier_manager/progress.rs b/src/stream/src/task/barrier_manager/progress.rs index dba8f5050627..c860b8f430fa 100644 --- a/src/stream/src/task/barrier_manager/progress.rs +++ b/src/stream/src/task/barrier_manager/progress.rs @@ -250,6 +250,7 @@ impl CreateMviewProgressReporter { if let Some(BackfillState::DoneConsumingUpstreamTableOrSource(_)) = self.state { return; } + tracing::debug!("progress finish"); self.update_inner( epoch, BackfillState::DoneConsumingUpstreamTableOrSource(current_consumed_rows), diff --git a/src/tests/e2e_extended_mode/README.md b/src/tests/e2e_extended_mode/README.md index 07e67d68e3a0..53e44e6ac8e4 100644 --- a/src/tests/e2e_extended_mode/README.md +++ b/src/tests/e2e_extended_mode/README.md @@ -1,16 +1,16 @@ This is a program used for e2e test in extended mode. -## What is difference between it and extended_mode/*.slt in e2e_test +## What is difference between this and `e2e_test/extended_mode` + +For e2e test in extended query mode, there are a few things we can't test in sqllogictest -For e2e test in extended query mode, there are two thing we can't test in sqllogitest 1. bind parameter 2. max row number 3. cancel query -See [detail](https://www.postgresql.org/docs/15/protocol-flow.html#PROTOCOL-FLOW-PIPELINING:~:text=Once%20a%20portal,count%20is%20ignored) -So before sqllogictest supporting these, we test these function in this program. 
+See more details [here](https://www.postgresql.org/docs/15/protocol-flow.html#PROTOCOL-FLOW-PIPELINING:~:text=Once%20a%20portal,count%20is%20ignored). -In the future, we may merge it to e2e_text/extended_query +Before sqllogictest supports these, we test these functions in this program. In the future, we may merge it to `e2e_test/extended_mode`. # How to run diff --git a/src/tests/simulation/Cargo.toml b/src/tests/simulation/Cargo.toml index f5b3fc6145e1..30ca8da23b16 100644 --- a/src/tests/simulation/Cargo.toml +++ b/src/tests/simulation/Cargo.toml @@ -50,7 +50,7 @@ risingwave_sqlsmith = { workspace = true } serde = "1.0.188" serde_derive = "1.0.188" serde_json = "1.0.107" -sqllogictest = "0.22.0" +sqllogictest = "0.23.0" tempfile = "3" tikv-jemallocator = { workspace = true } tokio = { version = "0.2", package = "madsim-tokio" } diff --git a/src/tests/simulation/src/main.rs b/src/tests/simulation/src/main.rs index 4d3122c486d9..3b8e6d7b24af 100644 --- a/src/tests/simulation/src/main.rs +++ b/src/tests/simulation/src/main.rs @@ -133,10 +133,13 @@ pub struct Args { /// Use arrangement backfill #[clap(long, default_value = "false")] use_arrangement_backfill: bool, + + /// Set vnode count (`STREAMING_MAX_PARALLELISM`) to random value before running DDL. + #[clap(long, env = "RW_SIM_RANDOM_VNODE_COUNT")] + random_vnode_count: bool, } #[tokio::main] -#[cfg_or_panic(madsim)] async fn main() { use std::sync::Arc; @@ -165,7 +168,6 @@ async fn main() { } else { vec!["SET STREAMING_USE_ARRANGEMENT_BACKFILL = false;".to_string()].into() }, - ..Default::default() }; let kill_opts = KillOpts { kill_meta: false, @@ -186,7 +188,7 @@ async fn main() { cluster.create_kafka_producer(&datadir).await; } - let seed = madsim::runtime::Handle::current().seed(); + let seed = sqlsmith_seed(); if let Some(count) = args.sqlsmith { cluster .run_on_client(async move { @@ -248,7 +250,12 @@ async fn main() { if let Some(jobs) = args.jobs { run_parallel_slt_task(glob, jobs).await.unwrap(); } else { - run_slt_task(cluster0, glob, &kill_opts, args.background_ddl_rate).await; + let opts = Opts { + kill_opts, + background_ddl_rate: args.background_ddl_rate, + random_vnode_count: args.random_vnode_count, + }; + run_slt_task(cluster0, glob, opts).await; } }) .await; @@ -270,3 +277,8 @@ async fn main() { cluster.graceful_shutdown().await; } + +#[cfg_or_panic(madsim)] +fn sqlsmith_seed() -> u64 { + madsim::runtime::Handle::current().seed() +} diff --git a/src/tests/simulation/src/slt.rs b/src/tests/simulation/src/slt.rs index 7bf9d62d1964..ede789792d1a 100644 --- a/src/tests/simulation/src/slt.rs +++ b/src/tests/simulation/src/slt.rs @@ -19,6 +19,7 @@ use std::time::Duration; use anyhow::{bail, Result}; use itertools::Itertools; +use rand::seq::IteratorRandom; use rand::{thread_rng, Rng, SeedableRng}; use rand_chacha::ChaChaRng; use sqllogictest::{Condition, ParallelTestError, QueryExpect, Record, StatementExpect}; @@ -85,6 +86,15 @@ impl SqlCmd { // are not transactional, we can't kill during `alter table add/drop columns` for now, will // remove it until transactional commit of table fragment and catalog is supported. } + + fn is_create(&self) -> bool { + matches!( + self, + SqlCmd::Create { .. } + | SqlCmd::CreateSink { .. } + | SqlCmd::CreateMaterializedView { .. 
} + ) + } } fn extract_sql_command(sql: &str) -> SqlCmd { @@ -189,13 +199,23 @@ async fn wait_background_mv_finished(mview_name: &str) -> Result<()> { } } +pub struct Opts { + pub kill_opts: KillOpts, + /// Probability of `background_ddl` being set to true per ddl record. + pub background_ddl_rate: f64, + /// Set vnode count (`STREAMING_MAX_PARALLELISM`) to random value before running DDL. + pub random_vnode_count: bool, +} + /// Run the sqllogictest files in `glob`. pub async fn run_slt_task( cluster: Arc, glob: &str, - opts: &KillOpts, - // Probability of background_ddl being set to true per ddl record. - background_ddl_rate: f64, + Opts { + kill_opts, + background_ddl_rate, + random_vnode_count, + }: Opts, ) { tracing::info!("background_ddl_rate: {}", background_ddl_rate); let seed = std::env::var("MADSIM_TEST_SEED") @@ -203,7 +223,10 @@ pub async fn run_slt_task( .parse::() .unwrap(); let mut rng = ChaChaRng::seed_from_u64(seed); - let kill = opts.kill_compute || opts.kill_meta || opts.kill_frontend || opts.kill_compactor; + let kill = kill_opts.kill_compute + || kill_opts.kill_meta + || kill_opts.kill_frontend + || kill_opts.kill_compactor; let files = glob::glob(glob).expect("failed to read glob pattern"); for file in files { // use a session per file @@ -229,7 +252,22 @@ pub async fn run_slt_task( // We can revert it back to false only if we encounter a record that sets background_ddl to false. let mut manual_background_ddl_enabled = false; - for record in sqllogictest::parse_file(path).expect("failed to parse file") { + let records = sqllogictest::parse_file(path).expect("failed to parse file"); + let random_vnode_count = random_vnode_count + // Skip using random vnode count if the test case cares about parallelism, including + // setting parallelism manually or checking the parallelism with system tables. + && records.iter().all(|record| { + if let Record::Statement { sql, .. } | Record::Query { sql, .. } = record + && sql.to_lowercase().contains("parallelism") + { + println!("[RANDOM VNODE COUNT] skip: {}", path.display()); + false + } else { + true + } + }); + + for record in records { // uncomment to print metrics for task counts // let metrics = madsim::runtime::Handle::current().metrics(); // println!("{:#?}", metrics); @@ -238,8 +276,42 @@ pub async fn run_slt_task( break; } + let cmd = match &record { + sqllogictest::Record::Statement { sql, .. } + | sqllogictest::Record::Query { sql, .. } => extract_sql_command(sql), + _ => SqlCmd::Others, + }; + // For normal records. if !kill { + // Set random vnode count if needed. + if random_vnode_count + && cmd.is_create() + && let Record::Statement { + loc, + conditions, + connection, + .. + } = &record + { + let vnode_count = (2..=64) // small + .chain(224..=288) // normal + .chain(992..=1056) // 1024 affects row id gen behavior + .choose(&mut thread_rng()) + .unwrap(); + let sql = format!("SET STREAMING_MAX_PARALLELISM = {vnode_count};"); + println!("[RANDOM VNODE COUNT] set: {vnode_count}"); + let set_random_vnode_count = Record::Statement { + loc: loc.clone(), + conditions: conditions.clone(), + connection: connection.clone(), + sql, + expected: StatementExpect::Ok, + }; + tester.run_async(set_random_vnode_count).await.unwrap(); + println!("[RANDOM VNODE COUNT] run: {record}"); + } + match tester .run_async(record.clone()) .timed(|_res, elapsed| { @@ -253,11 +325,6 @@ pub async fn run_slt_task( } // For kill enabled. - let cmd = match &record { - sqllogictest::Record::Statement { sql, .. 
} - | sqllogictest::Record::Query { sql, .. } => extract_sql_command(sql), - _ => SqlCmd::Others, - }; tracing::debug!(?cmd, "Running"); if background_ddl_rate > 0.0 @@ -329,11 +396,11 @@ pub async fn run_slt_task( continue; } - let should_kill = thread_rng().gen_bool(opts.kill_rate as f64); + let should_kill = thread_rng().gen_bool(kill_opts.kill_rate as f64); // spawn a background task to kill nodes let handle = if should_kill { let cluster = cluster.clone(); - let opts = *opts; + let opts = kill_opts; Some(tokio::spawn(async move { let t = thread_rng().gen_range(Duration::default()..Duration::from_secs(1)); tokio::time::sleep(t).await; diff --git a/src/tests/simulation/tests/integration_tests/sink/basic.rs b/src/tests/simulation/tests/integration_tests/sink/basic.rs index 8ba8982ce4d7..4899b7d152d8 100644 --- a/src/tests/simulation/tests/integration_tests/sink/basic.rs +++ b/src/tests/simulation/tests/integration_tests/sink/basic.rs @@ -44,7 +44,7 @@ async fn basic_test_inner(is_decouple: bool) -> Result<()> { } session.run(CREATE_SOURCE).await?; session.run(CREATE_SINK).await?; - assert_eq!(6, test_sink.parallelism_counter.load(Relaxed)); + test_sink.wait_initial_parallelism(6).await?; let internal_tables = session.run("show internal tables").await?; diff --git a/src/tests/simulation/tests/integration_tests/sink/err_isolation.rs b/src/tests/simulation/tests/integration_tests/sink/err_isolation.rs index 124653946b87..0307fc671e02 100644 --- a/src/tests/simulation/tests/integration_tests/sink/err_isolation.rs +++ b/src/tests/simulation/tests/integration_tests/sink/err_isolation.rs @@ -40,7 +40,7 @@ async fn test_sink_decouple_err_isolation() -> Result<()> { session.run("set sink_decouple = true").await?; session.run(CREATE_SOURCE).await?; session.run(CREATE_SINK).await?; - assert_eq!(6, test_sink.parallelism_counter.load(Relaxed)); + test_sink.wait_initial_parallelism(6).await?; test_sink.set_err_rate(0.002); @@ -81,7 +81,7 @@ async fn test_sink_error_event_logs() -> Result<()> { session.run("set sink_decouple = true").await?; session.run(CREATE_SOURCE).await?; session.run(CREATE_SINK).await?; - assert_eq!(6, test_sink.parallelism_counter.load(Relaxed)); + test_sink.wait_initial_parallelism(6).await?; test_sink.store.wait_for_err(1).await?; diff --git a/src/tests/simulation/tests/integration_tests/sink/recovery.rs b/src/tests/simulation/tests/integration_tests/sink/recovery.rs index 6b4f71d7d508..124f0b0d9fe5 100644 --- a/src/tests/simulation/tests/integration_tests/sink/recovery.rs +++ b/src/tests/simulation/tests/integration_tests/sink/recovery.rs @@ -71,7 +71,7 @@ async fn recovery_test_inner(is_decouple: bool) -> Result<()> { } session.run(CREATE_SOURCE).await?; session.run(CREATE_SINK).await?; - assert_eq!(6, test_sink.parallelism_counter.load(Relaxed)); + test_sink.wait_initial_parallelism(6).await?; let count = test_source.id_list.len(); diff --git a/src/tests/simulation/tests/integration_tests/sink/scale.rs b/src/tests/simulation/tests/integration_tests/sink/scale.rs index 99c3b7e9ebc5..9ecff238fb03 100644 --- a/src/tests/simulation/tests/integration_tests/sink/scale.rs +++ b/src/tests/simulation/tests/integration_tests/sink/scale.rs @@ -73,7 +73,7 @@ async fn scale_test_inner(is_decouple: bool) -> Result<()> { } session.run(CREATE_SOURCE).await?; session.run(CREATE_SINK).await?; - assert_eq!(6, test_sink.parallelism_counter.load(Relaxed)); + test_sink.wait_initial_parallelism(6).await?; let mut sink_fragments = cluster .locate_fragments([identity_contains("Sink")]) diff --git 
a/src/tests/simulation/tests/integration_tests/sink/utils.rs b/src/tests/simulation/tests/integration_tests/sink/utils.rs index bef5bdfa35d0..9c93b388dca5 100644 --- a/src/tests/simulation/tests/integration_tests/sink/utils.rs +++ b/src/tests/simulation/tests/integration_tests/sink/utils.rs @@ -39,6 +39,7 @@ use risingwave_connector::source::test_source::{ registry_test_source, BoxSource, TestSourceRegistryGuard, TestSourceSplit, }; use risingwave_simulation::cluster::{Cluster, ConfigPath, Configuration}; +use tokio::task::yield_now; use tokio::time::sleep; use crate::{assert_eq_with_err_returned as assert_eq, assert_with_err_returned as assert}; @@ -244,6 +245,14 @@ impl SimulationTestSink { let err_rate = u32::MAX as f64 * err_rate; self.err_rate.store(err_rate as _, Relaxed); } + + pub async fn wait_initial_parallelism(&self, parallelism: usize) -> Result<()> { + while self.parallelism_counter.load(Relaxed) < parallelism { + yield_now().await; + } + assert_eq!(self.parallelism_counter.load(Relaxed), parallelism); + Ok(()) + } } pub fn build_stream_chunk(