diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 235fb3bbacfbb..acd75f253b699 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -3,5 +3,5 @@ contact_links:
url: https://github.com/risingwavelabs/risingwave/discussions
about: Have questions? Welcome to open a discussion.
- name: Community Chat
- url: https://join.slack.com/t/risingwave-community/shared_invite/zt-120rft0mr-d8uGk3d~NZiZAQWPnElOfw
+ url: https://risingwave.com/slack
about: Join the RisingWave Slack community and chat with us.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index b2d58279b5290..51242d0425e28 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -16,6 +16,7 @@ updates:
arrow:
patterns:
- "arrow*"
+ - "parquet"
aws:
patterns:
- "aws*"
diff --git a/.github/workflows/auto-create-doc-issue-by-issue.yml b/.github/workflows/auto-create-doc-issue-by-issue.yml
new file mode 100644
index 0000000000000..0c8d78062977a
--- /dev/null
+++ b/.github/workflows/auto-create-doc-issue-by-issue.yml
@@ -0,0 +1,31 @@
+name: Issue Documentation Checker
+
+on:
+ issues:
+ types:
+ - closed
+ - labeled
+
+jobs:
+ create-issue:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Log the event payload
+ run: echo "${{ toJSON(github.event) }}"
+ - name: Check if issue is done and labeled 'user-facing-changes'
+ uses: dacbd/create-issue-action@main
+ if: ${{ github.event.action == 'closed' && contains(github.event.issue.labels.*.name, 'user-facing-changes') }}
+ with:
+ token: ${{ secrets.ACCESS_TOKEN }}
+ owner: risingwavelabs
+ repo: risingwave-docs
+ title: |
+ Document: ${{ github.event.issue.title }}
+ body: |
+ ## Context
+ Source Issue URL: ${{ github.event.issue.html_url }}
+ Created At: ${{ github.event.issue.created_at }}
+ Created By: ${{ github.event.issue.user.login }}
+ Closed At: ${{ github.event.issue.closed_at }}
diff --git a/.github/workflows/auto-create-docs-pr.yml b/.github/workflows/auto-create-doc-issue-by-pr.yml
similarity index 100%
rename from .github/workflows/auto-create-docs-pr.yml
rename to .github/workflows/auto-create-doc-issue-by-pr.yml
diff --git a/.gitignore b/.gitignore
index 19fb6643dd8a6..375738f67093e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,4 +74,7 @@ simulation-it-test.tar.zst
# hummock-trace
.trace
+# spark binary
+e2e_test/iceberg/spark-*-bin*
+
**/poetry.lock
\ No newline at end of file
diff --git a/.licenserc.yaml b/.licenserc.yaml
index c1745a4d1ad74..7b49108b6b2f3 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -17,6 +17,6 @@ header:
- "**/*.d.ts"
- "src/sqlparser/**/*.rs"
- "java/connector-node/risingwave-source-cdc/src/main/java/com/risingwave/connector/cdc/debezium/internal/*.java"
- - "src/meta/src/model_v2/migration/**/*.rs"
+ - "src/meta/model_v2/migration/**/*.rs"
comment: on-failure
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9b519c16010ba..c0b3991fc1f61 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -4,7 +4,7 @@ Thanks for your interest in contributing to RisingWave! We welcome and appreciat
This document describes how to submit your code changes. To learn about the development process, see the [developer guide](docs/developer-guide.md). To understand the design and implementation of RisingWave, refer to the design docs listed in [docs/README.md](docs/README.md).
-If you have questions, you can search for existing discussions or start a new discussion in the [Discussions forum of RisingWave](https://github.com/risingwavelabs/risingwave/discussions), or ask in the RisingWave Community channel on Slack. Please use the [invitation link](https://join.slack.com/t/risingwave-community/shared_invite/zt-120rft0mr-d8uGk3d~NZiZAQWPnElOfw) to join the channel.
+If you have questions, you can search for existing discussions or start a new discussion in the [Discussions forum of RisingWave](https://github.com/risingwavelabs/risingwave/discussions), or ask in the RisingWave Community channel on Slack. Please use the [invitation link](https://risingwave.com/slack) to join the channel.
To report bugs, create a [GitHub issue](https://github.com/risingwavelabs/risingwave/issues/new/choose).
diff --git a/Cargo.lock b/Cargo.lock
index 4550cc2d7faeb..50519aeccab88 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -243,9 +243,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "arrow-arith"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc1d4e368e87ad9ee64f28b9577a3834ce10fe2703a26b28417d485bbbdff956"
+checksum = "c5c3d17fc5b006e7beeaebfb1d2edfc92398b981f82d9744130437909b72a468"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -258,9 +258,9 @@ dependencies = [
[[package]]
name = "arrow-array"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d"
+checksum = "55705ada5cdde4cb0f202ffa6aa756637e33fea30e13d8d0d0fd6a24ffcee1e3"
dependencies = [
"ahash 0.8.3",
"arrow-buffer",
@@ -274,9 +274,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fda119225204141138cb0541c692fbfef0e875ba01bfdeaed09e9d354f9d6195"
+checksum = "a722f90a09b94f295ab7102542e97199d3500128843446ef63e410ad546c5333"
dependencies = [
"bytes",
"half 2.3.1",
@@ -285,9 +285,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d825d51b9968868d50bc5af92388754056796dbc62a4e25307d588a1fc84dee"
+checksum = "af01fc1a06f6f2baf31a04776156d47f9f31ca5939fe6d00cd7a059f95a46ff1"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -302,9 +302,9 @@ dependencies = [
[[package]]
name = "arrow-data"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a"
+checksum = "d0a547195e607e625e7fafa1a7269b8df1a4a612c919efd9b26bd86e74538f3a"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -314,9 +314,9 @@ dependencies = [
[[package]]
name = "arrow-flight"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd938ea4a0e8d0db2b9f47ebba792f73f6188f4289707caeaf93a3be705e5ed5"
+checksum = "c58645809ced5acd6243e89a63ae8535a2ab50d780affcd7efe8c7473a0da661"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -334,9 +334,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1248005c8ac549f869b7a840859d942bf62471479c1a2d82659d453eebcd166a"
+checksum = "e36bf091502ab7e37775ff448413ef1ffff28ff93789acb669fffdd51b394d51"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -348,9 +348,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03b87aa408ea6a6300e49eb2eba0c032c88ed9dc19e0a9948489c55efdca71f4"
+checksum = "4502123d2397319f3a13688432bc678c61cb1582f2daa01253186da650bf5841"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -363,9 +363,9 @@ dependencies = [
[[package]]
name = "arrow-row"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "114a348ab581e7c9b6908fcab23cb39ff9f060eb19e72b13f8fb8eaa37f65d22"
+checksum = "249fc5a07906ab3f3536a6e9f118ec2883fbcde398a97a5ba70053f0276abda4"
dependencies = [
"ahash 0.8.3",
"arrow-array",
@@ -378,15 +378,15 @@ dependencies = [
[[package]]
name = "arrow-schema"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b"
+checksum = "9d7a8c3f97f5ef6abd862155a6f39aaba36b029322462d72bbcfa69782a50614"
[[package]]
name = "arrow-select"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d5c71e003202e67e9db139e5278c79f5520bb79922261dfe140e4637ee8b6108"
+checksum = "f868f4a5001429e20f7c1994b5cd1aa68b82e3db8cf96c559cdb56dc8be21410"
dependencies = [
"ahash 0.8.3",
"arrow-array",
@@ -2448,10 +2448,11 @@ dependencies = [
[[package]]
name = "deranged"
-version = "0.3.8"
+version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946"
+checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3"
dependencies = [
+ "powerfmt",
"serde",
]
@@ -2554,8 +2555,7 @@ checksum = "86e3bdc80eee6e16b2b6b0f87fbc98c04bee3455e35174c0de1a125d0688c632"
[[package]]
name = "dlv-list"
version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8aead04dc46b5f263c25721cf25c9e595951d15055f8063f92392fa0d7f64cf4"
+source = "git+https://github.com/sgodwincs/dlv-list-rs.git?rev=5bbc5d0#5bbc5d0cc84f257e173d851f8dc1674fb6e46f95"
dependencies = [
"const-random",
]
@@ -3018,7 +3018,7 @@ dependencies = [
[[package]]
name = "foyer"
version = "0.1.0"
-source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41"
+source = "git+https://github.com/MrCroxx/foyer?rev=2261151#2261151107ad362851f5fff9ce4fa56e61911b10"
dependencies = [
"foyer-common",
"foyer-intrusive",
@@ -3029,10 +3029,11 @@ dependencies = [
[[package]]
name = "foyer-common"
version = "0.1.0"
-source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41"
+source = "git+https://github.com/MrCroxx/foyer?rev=2261151#2261151107ad362851f5fff9ce4fa56e61911b10"
dependencies = [
"bytes",
"foyer-workspace-hack",
+ "itertools 0.11.0",
"madsim-tokio",
"parking_lot 0.12.1",
"paste",
@@ -3043,13 +3044,13 @@ dependencies = [
[[package]]
name = "foyer-intrusive"
version = "0.1.0"
-source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41"
+source = "git+https://github.com/MrCroxx/foyer?rev=2261151#2261151107ad362851f5fff9ce4fa56e61911b10"
dependencies = [
"bytes",
"cmsketch",
"foyer-common",
"foyer-workspace-hack",
- "itertools 0.10.5",
+ "itertools 0.11.0",
"memoffset",
"parking_lot 0.12.1",
"paste",
@@ -3060,7 +3061,7 @@ dependencies = [
[[package]]
name = "foyer-storage"
version = "0.1.0"
-source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41"
+source = "git+https://github.com/MrCroxx/foyer?rev=2261151#2261151107ad362851f5fff9ce4fa56e61911b10"
dependencies = [
"anyhow",
"async-channel",
@@ -3089,7 +3090,7 @@ dependencies = [
[[package]]
name = "foyer-workspace-hack"
version = "0.1.0"
-source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41"
+source = "git+https://github.com/MrCroxx/foyer?rev=2261151#2261151107ad362851f5fff9ce4fa56e61911b10"
dependencies = [
"crossbeam-utils",
"either",
@@ -3098,7 +3099,7 @@ dependencies = [
"futures-sink",
"futures-util",
"hyper",
- "itertools 0.10.5",
+ "itertools 0.11.0",
"libc",
"memchr",
"parking_lot 0.12.1",
@@ -3227,9 +3228,9 @@ dependencies = [
[[package]]
name = "futures-async-stream"
-version = "0.2.7"
+version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f529ccdeacfa2446a9577041686cf1abb839b1b3e15fee4c1b1232ab3b7d799f"
+checksum = "379790776b0d953337df4ab7ecc51936c66ea112484cad7912907b1d34253ebf"
dependencies = [
"futures-async-stream-macro",
"futures-core",
@@ -3238,13 +3239,13 @@ dependencies = [
[[package]]
name = "futures-async-stream-macro"
-version = "0.2.7"
+version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca2b48ee06dc8d2808ba5ebad075d06c3406085bb19deaac33be64c39113bf80"
+checksum = "5df2c13d48c8cb8a3ec093ede6f0f4482f327d7bb781120c5fb483ef0f17e758"
dependencies = [
"proc-macro2",
"quote",
- "syn 1.0.109",
+ "syn 2.0.37",
]
[[package]]
@@ -4141,6 +4142,18 @@ dependencies = [
"wasm-bindgen",
]
+[[package]]
+name = "jsonbb"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44376417b2ff0cd879b5c84976fa9e0855c316321b4e0502e33e52963bf84f74"
+dependencies = [
+ "bytes",
+ "serde",
+ "serde_json",
+ "smallvec",
+]
+
[[package]]
name = "jsonschema-transpiler"
version = "1.10.0"
@@ -4464,6 +4477,15 @@ dependencies = [
"libc",
]
+[[package]]
+name = "lz4_flex"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8"
+dependencies = [
+ "twox-hash",
+]
+
[[package]]
name = "lzma-sys"
version = "0.1.20"
@@ -4782,15 +4804,6 @@ dependencies = [
"syn 1.0.109",
]
-[[package]]
-name = "model_migration"
-version = "0.1.0"
-dependencies = [
- "async-std",
- "sea-orm-migration",
- "uuid",
-]
-
[[package]]
name = "moka"
version = "0.12.0"
@@ -5519,8 +5532,7 @@ dependencies = [
[[package]]
name = "ordered-multimap"
version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ed8acf08e98e744e5384c8bc63ceb0364e68a6854187221c18df61c4797690e"
+source = "git+https://github.com/risingwavelabs/ordered-multimap-rs.git?rev=19c743f#19c743f3e3d106c99ba37628f06a2ca6faa2284f"
dependencies = [
"dlv-list",
"hashbrown 0.13.2",
@@ -5643,9 +5655,9 @@ dependencies = [
[[package]]
name = "parquet"
-version = "47.0.0"
+version = "48.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d"
+checksum = "239229e6a668ab50c61de3dce61cf0fa1069345f7aa0f4c934491f92205a4945"
dependencies = [
"ahash 0.8.3",
"arrow-array",
@@ -5662,7 +5674,7 @@ dependencies = [
"flate2",
"futures",
"hashbrown 0.14.0",
- "lz4",
+ "lz4_flex",
"num",
"num-bigint",
"paste",
@@ -5671,7 +5683,7 @@ dependencies = [
"thrift",
"tokio",
"twox-hash",
- "zstd 0.12.4",
+ "zstd 0.13.0",
]
[[package]]
@@ -6050,6 +6062,12 @@ dependencies = [
"serde_json",
]
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
[[package]]
name = "pprof"
version = "0.13.0"
@@ -7083,6 +7101,7 @@ dependencies = [
"hytra",
"itertools 0.11.0",
"itoa",
+ "jsonbb",
"libc",
"lru 0.7.6",
"mach2",
@@ -7461,6 +7480,7 @@ dependencies = [
"futures-util",
"hex",
"itertools 0.11.0",
+ "jsonbb",
"madsim-tokio",
"md5",
"num-traits",
@@ -7641,6 +7661,7 @@ dependencies = [
name = "risingwave_jni_core"
version = "0.1.0"
dependencies = [
+ "anyhow",
"bytes",
"cfg-or-panic",
"futures",
@@ -7691,7 +7712,6 @@ dependencies = [
"maplit",
"memcomparable",
"mime_guess",
- "model_migration",
"num-integer",
"num-traits",
"parking_lot 0.12.1",
@@ -7705,6 +7725,8 @@ dependencies = [
"risingwave_common_heap_profiling",
"risingwave_connector",
"risingwave_hummock_sdk",
+ "risingwave_meta_model_migration",
+ "risingwave_meta_model_v2",
"risingwave_object_store",
"risingwave_pb",
"risingwave_rpc_client",
@@ -7714,7 +7736,6 @@ dependencies = [
"sea-orm",
"serde",
"serde_json",
- "sqlx",
"sync-point",
"thiserror",
"tokio-retry",
@@ -7727,6 +7748,25 @@ dependencies = [
"workspace-hack",
]
+[[package]]
+name = "risingwave_meta_model_migration"
+version = "1.3.0-alpha"
+dependencies = [
+ "async-std",
+ "sea-orm-migration",
+ "uuid",
+]
+
+[[package]]
+name = "risingwave_meta_model_v2"
+version = "1.3.0-alpha"
+dependencies = [
+ "risingwave_pb",
+ "sea-orm",
+ "serde",
+ "serde_json",
+]
+
[[package]]
name = "risingwave_meta_node"
version = "1.3.0-alpha"
@@ -7739,13 +7779,13 @@ dependencies = [
"madsim-etcd-client",
"madsim-tokio",
"madsim-tonic",
- "model_migration",
"prometheus-http-query",
"regex",
"risingwave_common",
"risingwave_common_heap_profiling",
"risingwave_common_service",
"risingwave_meta",
+ "risingwave_meta_model_migration",
"risingwave_meta_service",
"risingwave_pb",
"risingwave_rpc_client",
@@ -7769,6 +7809,7 @@ dependencies = [
"risingwave_common",
"risingwave_connector",
"risingwave_meta",
+ "risingwave_meta_model_v2",
"risingwave_pb",
"sea-orm",
"sync-point",
@@ -9832,14 +9873,15 @@ dependencies = [
[[package]]
name = "time"
-version = "0.3.28"
+version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48"
+checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5"
dependencies = [
"deranged",
"itoa",
"libc",
"num_threads",
+ "powerfmt",
"serde",
"time-core",
"time-macros",
@@ -9847,15 +9889,15 @@ dependencies = [
[[package]]
name = "time-core"
-version = "0.1.1"
+version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
-version = "0.2.14"
+version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572"
+checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20"
dependencies = [
"time-core",
]
@@ -11004,7 +11046,6 @@ dependencies = [
"futures-util",
"hashbrown 0.12.3",
"hashbrown 0.14.0",
- "heck 0.4.1",
"hyper",
"indexmap 1.9.3",
"itertools 0.10.5",
diff --git a/Cargo.toml b/Cargo.toml
index ef09221b818a2..ac533e733f7a8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,9 +19,10 @@ members = [
"src/java_binding",
"src/jni_core",
"src/meta",
+ "src/meta/model_v2",
+ "src/meta/model_v2/migration",
"src/meta/node",
"src/meta/service",
- "src/meta/src/model_v2/migration",
"src/object_store",
"src/prost",
"src/prost/helpers",
@@ -97,7 +98,7 @@ aws-smithy-types = "0.55"
aws-endpoint = "0.55"
aws-types = "0.55"
etcd-client = { package = "madsim-etcd-client", version = "0.4" }
-futures-async-stream = "0.2"
+futures-async-stream = "0.2.9"
hytra = "0.1"
rdkafka = { package = "madsim-rdkafka", version = "0.3.0", features = [
"cmake-build",
@@ -112,13 +113,13 @@ tonic = { package = "madsim-tonic", version = "0.4.0" }
tonic-build = { package = "madsim-tonic-build", version = "0.4.2" }
prost = { version = "0.12" }
icelake = { git = "https://github.com/icelake-io/icelake", rev = "16dab0e36ab337e58ee8002d828def2d212fa116" }
-arrow-array = "47"
-arrow-cast = "47"
-arrow-schema = "47"
-arrow-buffer = "47"
-arrow-flight = "47"
-arrow-select = "47"
-arrow-ord = "47"
+arrow-array = "48"
+arrow-cast = "48"
+arrow-schema = "48"
+arrow-buffer = "48"
+arrow-flight = "48"
+arrow-select = "48"
+arrow-ord = "48"
tikv-jemalloc-ctl = { git = "https://github.com/risingwavelabs/jemallocator.git", rev = "64a2d9" }
tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", features = [
"profiling",
@@ -143,6 +144,8 @@ risingwave_hummock_test = { path = "./src/storage/hummock_test" }
risingwave_hummock_trace = { path = "./src/storage/hummock_trace" }
risingwave_meta = { path = "./src/meta" }
risingwave_meta_service = { path = "./src/meta/service" }
+risingwave_meta_model_migration = { path = "src/meta/model_v2/migration" }
+risingwave_meta_model_v2 = { path = "./src/meta/model_v2" }
risingwave_meta_node = { path = "./src/meta/node" }
risingwave_object_store = { path = "./src/object_store" }
risingwave_pb = { path = "./src/prost" }
@@ -165,6 +168,8 @@ unused_must_use = "forbid"
future_incompatible = "warn"
nonstandard_style = "warn"
rust_2018_idioms = "warn"
+# Backward compatibility is not important for an application.
+async_fn_in_trait = "allow"
[workspace.lints.clippy]
uninlined_format_args = "allow"
@@ -229,8 +234,8 @@ opt-level = 2
incremental = false
debug = 1
-# Patch third-party crates for deterministic simulation.
[patch.crates-io]
+# Patch third-party crates for deterministic simulation.
quanta = { git = "https://github.com/madsim-rs/quanta.git", rev = "948bdc3" }
getrandom = { git = "https://github.com/madsim-rs/getrandom.git", rev = "8daf97e" }
tokio-stream = { git = "https://github.com/madsim-rs/tokio.git", rev = "fe39bb8e" }
@@ -238,3 +243,8 @@ tokio-retry = { git = "https://github.com/madsim-rs/rust-tokio-retry.git", rev =
tokio-postgres = { git = "https://github.com/madsim-rs/rust-postgres.git", rev = "ac00d88" }
# patch: unlimit 4MB message size for grpc client
etcd-client = { git = "https://github.com/risingwavelabs/etcd-client.git", rev = "4e84d40" }
+
+# Patch for coverage_attribute.
+# https://github.com/sgodwincs/dlv-list-rs/pull/19#issuecomment-1774786289
+dlv-list = { git = "https://github.com/sgodwincs/dlv-list-rs.git", rev = "5bbc5d0" }
+ordered-multimap = { git = "https://github.com/risingwavelabs/ordered-multimap-rs.git", rev = "19c743f" }
diff --git a/README.md b/README.md
index c1878a2717159..29a7d7e51888a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+
@@ -5,23 +6,110 @@
-[![Slack](https://badgen.net/badge/Slack/Join%20RisingWave/0abd59?icon=slack)](https://risingwave.com/slack)
-[![Build status](https://badge.buildkite.com/9394d2bca0f87e2e97aa78b25f765c92d4207c0b65e7f6648f.svg)](https://buildkite.com/risingwavelabs/main)
-[![codecov](https://codecov.io/gh/risingwavelabs/risingwave/branch/main/graph/badge.svg?token=EB44K9K38B)](https://codecov.io/gh/risingwavelabs/risingwave)
-
-RisingWave is a distributed SQL streaming database. It is designed to reduce the complexity and cost of building stream processing applications. RisingWave consumes streaming data, performs incremental computations when new data comes in, and updates results dynamically. As a database system, RisingWave maintains results inside its own storage so that users can access data efficiently.
-RisingWave offers wire compatibility with PostgreSQL and demonstrates exceptional performance surpassing the previous generation of stream processing systems, including Apache Flink, by several orders of magnitude.
-It particularly excels in handling complex stateful operations like multi-stream joins.
+
-RisingWave ingests data from sources like Apache Kafka, Apache Pulsar, Amazon Kinesis, Redpanda, and materialized CDC sources. Data in RisingWave can be delivered to external targets such as message brokers, data warehouses, and data lakes for storage or additional processing.
+### 🌊Stream Processing Redefined.
-RisingWave 1.0 is a battle-tested version that has undergone rigorous stress tests and performance evaluations. It has proven its reliability and efficiency through successful deployments in numerous production environments across dozens of companies.
+
-Learn more at [Introduction to RisingWave](https://docs.risingwave.com/docs/current/intro/).
+
+ Documentation 📑
+ Hands-on Tutorials 🎯
+ RisingWave Cloud 🚀
+
+ Get Instant Help
+
+
+
+
+RisingWave is a distributed SQL streaming database that enables simple, efficient, and reliable processing of streaming data.
![RisingWave](https://github.com/risingwavelabs/risingwave-docs/blob/0f7e1302b22493ba3c1c48e78810750ce9a5ff42/docs/images/archi_simple.png)
+## How to install
+**Ubuntu**
+```
+wget https://github.com/risingwavelabs/risingwave/releases/download/v1.3.0/risingwave-v1.3.0-x86_64-unknown-linux.tar.gz
+tar xvf risingwave-v1.3.0-x86_64-unknown-linux.tar.gz
+./risingwave playground
+```
+**Mac**
+```
+brew tap risingwavelabs/risingwave
+brew install risingwave
+risingwave playground
+```
+Now connect to RisingWave using `psql`:
+```
+psql -h localhost -p 4566 -d dev -U root
+```
+
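+Once connected, you can run a toy query to confirm the playground works. This is a minimal sketch; the table name `t` and the values are illustrative only, not part of any shipped example:
+
+```
+-- create a table, add a few rows, and read them back
+CREATE TABLE t (v INT);
+INSERT INTO t VALUES (1), (2), (3);
+FLUSH;                -- make the inserts visible to subsequent reads
+SELECT SUM(v) FROM t; -- returns 6
+```
+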
+Learn more at [Quick Start](https://docs.risingwave.com/docs/current/get-started/).
+
+## Why RisingWave for stream processing?
+RisingWave adeptly tackles some of the most challenging problems in stream processing. Compared to existing stream processing systems, RisingWave stands out with the following key features:
+* **Easy to learn**
+ * RisingWave speaks PostgreSQL-style SQL, enabling users to dive into stream processing in much the same way as operating a PostgreSQL database.
+* **Highly efficient in multi-stream joins**
+ * RisingWave is heavily optimized for multi-stream join scenarios. Users can easily join 10-20 streams (or more) efficiently in a production environment.
+* **High resource utilization**
+ * Queries in RisingWave leverage shared computational resources, eliminating the need for users to manually allocate resources for each query.
+* **No compromise on large state management**
+ * The decoupled compute-storage architecture of RisingWave ensures remote persistence of internal states, and users never need to worry about the size of internal states when handling complex queries.
+* **Transparent dynamic scaling**
+ * RisingWave supports near-instantaneous dynamic scaling without any service interruptions.
+* **Instant failure recovery**
+ * RisingWave's state management mechanism allows it to recover from failure in seconds, not minutes or hours.
+* **Easy to verify correctness**
+ * RisingWave persists results in materialized views and allows users to break down complex stream computation programs into stacked materialized views, simplifying program development and result verification (see the sketch after this list).
+* **Simplified data stack**
+ * RisingWave's ability to store data and serve queries eliminates the need to maintain a stream processor and a database separately. Users can effortlessly connect RisingWave to their preferred BI tools or query it through client libraries.
+* **Simple to maintain and operate**
+ * RisingWave abstracts away unnecessary low-level details, allowing users to concentrate solely on SQL code-level issues.
+* **Rich ecosystem**
+ * With integrations to a diverse range of cloud systems and the PostgreSQL ecosystem, RisingWave boasts a rich and expansive ecosystem.
+
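+To illustrate the stacked-materialized-view workflow mentioned above, here is a hedged sketch; `orders`, `order_totals`, and `big_spenders` are hypothetical names and are not defined anywhere in this repository:
+
+```
+-- a first materialized view aggregating a (hypothetical) orders table
+CREATE MATERIALIZED VIEW order_totals AS
+SELECT customer_id, SUM(amount) AS total
+FROM orders
+GROUP BY customer_id;
+
+-- a second view stacked on top of the first, also maintained incrementally
+CREATE MATERIALIZED VIEW big_spenders AS
+SELECT customer_id, total
+FROM order_totals
+WHERE total > 1000;
+```
+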
+## RisingWave's limitations
+RisingWave isn’t a panacea for all data engineering hurdles. It has its own set of limitations:
+* **No programmable interfaces**
+ * RisingWave does not provide low-level APIs in languages like Java and Scala, and does not allow users to manage internal states manually (unless you want to hack!). For coding in Java, Scala, and other languages, please consider using RisingWave's User-Defined Functions (UDF).
+* **No support for transaction processing**
+ * RisingWave isn’t cut out for transactional workloads, so it’s not a viable substitute for operational databases dedicated to transaction processing. However, it supports read-only transactions, ensuring data freshness and consistency. It also understands the transactional semantics of upstream databases ingested via Change Data Capture (CDC).
+* **Not tailored for ad-hoc analytical queries**
+ * RisingWave's row store design is tailored for optimal stream processing performance rather than interactive analytical workloads. Hence, it's not a suitable replacement for OLAP databases. Yet, a reliable integration with many OLAP databases exists, and a collaborative use of RisingWave and OLAP databases is a common practice among many users.
+
## RisingWave Cloud
@@ -29,19 +117,10 @@ RisingWave Cloud is a fully-managed and scalable stream processing platform powe
## Notes on telemetry
-RisingWave collects anonymous usage statistics to better understand how the community is using RisingWave. The sole intention of this exercise is to help improve the product. These statistics are related to system resource usage, OS versions and system uptime. RisingWave doesn't have access to any user data or metadata running on RisingWave clusters including source and sink connection parameters, sources, sinks, materialized views, and tables. Users have the option to opt out of this collection using a system parameter. Please refer to the RisingWave user documentation for more details.
-
-## Get started
-
-- To learn about how to install and run RisingWave, see [Get started](https://docs.risingwave.com/docs/current/get-started/).
-- To learn about how to ingest data and the supported data sources, see [Sources](https://docs.risingwave.com/docs/current/data-ingestion/).
-- To learn about how to transform data using the PostgreSQL-compatible SQL of RisingWave, see [SQL reference](https://docs.risingwave.com/docs/current/sql-references/).
-- To learn about how to deliver data and the supported data sinks, see [Sinks](https://docs.risingwave.com/docs/current/data-delivery/).
-- To learn about new features and changes in the current and previous versions, see [Release notes](https://docs.risingwave.com/release-notes/).
-
-## Documentation
+RisingWave collects anonymous usage statistics to better understand how the community is using RisingWave. The sole intention of this exercise is to help improve the product. Users may opt out easily at any time. Please refer to the [user documentation](https://docs.risingwave.com/docs/current/telemetry/) for more details.
-To learn about how to use RisingWave, refer to [RisingWave User Documentation](https://docs.risingwave.com/). To learn about the development process, see the [developer guide](docs/developer-guide.md). To understand the design and implementation of RisingWave, refer to the design docs listed in [readme.md](docs/README.md).
+## In-production use cases
+Like other stream processing systems, the primary use cases of RisingWave include monitoring, alerting, real-time dashboard reporting, streaming ETL (Extract, Transform, Load), machine learning feature engineering, and more. It has already been adopted in fields such as financial trading, manufacturing, new media, logistics, and gaming. Check out [customer stories](https://www.risingwave.com/use-cases/).
## Community
diff --git a/ci/build-ci-image.sh b/ci/build-ci-image.sh
index 43ff81ade2b85..59c88e5e9a9ae 100755
--- a/ci/build-ci-image.sh
+++ b/ci/build-ci-image.sh
@@ -13,7 +13,7 @@ cat ../rust-toolchain
# !!! CHANGE THIS WHEN YOU WANT TO BUMP CI IMAGE !!! #
# AND ALSO docker-compose.yml #
######################################################
-export BUILD_ENV_VERSION=v20230919
+export BUILD_ENV_VERSION=v20231022
export BUILD_TAG="public.ecr.aws/x5u3w5h6/rw-build-env:${BUILD_ENV_VERSION}"
diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml
index 6fe7cfbfdeca2..66dd2d175e675 100644
--- a/ci/docker-compose.yml
+++ b/ci/docker-compose.yml
@@ -71,7 +71,7 @@ services:
retries: 5
source-test-env:
- image: public.ecr.aws/x5u3w5h6/rw-build-env:v20230919
+ image: public.ecr.aws/x5u3w5h6/rw-build-env:v20231022
depends_on:
- mysql
- db
@@ -81,10 +81,11 @@ services:
- ..:/risingwave
sink-test-env:
- image: public.ecr.aws/x5u3w5h6/rw-build-env:v20230919
+ image: public.ecr.aws/x5u3w5h6/rw-build-env:v20231022
depends_on:
- mysql
- db
+ - message_queue
- elasticsearch
- clickhouse-server
- pulsar
@@ -92,12 +93,12 @@ services:
- ..:/risingwave
rw-build-env:
- image: public.ecr.aws/x5u3w5h6/rw-build-env:v20230919
+ image: public.ecr.aws/x5u3w5h6/rw-build-env:v20231022
volumes:
- ..:/risingwave
ci-flamegraph-env:
- image: public.ecr.aws/x5u3w5h6/rw-build-env:v20230919
+ image: public.ecr.aws/x5u3w5h6/rw-build-env:v20231022
# NOTE(kwannoel): This is used in order to permit
# syscalls for `nperf` (perf_event_open),
# so it can do CPU profiling.
@@ -108,7 +109,7 @@ services:
- ..:/risingwave
regress-test-env:
- image: public.ecr.aws/x5u3w5h6/rw-build-env:v20230919
+ image: public.ecr.aws/x5u3w5h6/rw-build-env:v20231022
depends_on:
db:
condition: service_healthy
diff --git a/ci/rust-toolchain b/ci/rust-toolchain
index ebc0b6c285a4e..fe2a026f6e40f 100644
--- a/ci/rust-toolchain
+++ b/ci/rust-toolchain
@@ -1,2 +1,2 @@
[toolchain]
-channel = "nightly-2023-09-09"
+channel = "nightly-2023-10-21"
diff --git a/ci/scripts/deterministic-recovery-test.sh b/ci/scripts/deterministic-recovery-test.sh
index 6514fe1f7c0c3..c5f89a2bbc7e0 100755
--- a/ci/scripts/deterministic-recovery-test.sh
+++ b/ci/scripts/deterministic-recovery-test.sh
@@ -11,6 +11,7 @@ chmod +x ./risingwave_simulation
export RUST_LOG="info,\
risingwave_meta::barrier::recovery=debug,\
+risingwave_meta::manager::catalog=debug,\
risingwave_meta::rpc::ddl_controller=debug,\
risingwave_meta::barrier::mod=debug,\
risingwave_simulation=debug"
diff --git a/ci/scripts/e2e-iceberg-cdc.sh b/ci/scripts/e2e-iceberg-cdc.sh
new file mode 100755
index 0000000000000..081f5bbd2afcb
--- /dev/null
+++ b/ci/scripts/e2e-iceberg-cdc.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+
+# Exits as soon as any line fails.
+set -euo pipefail
+
+source ci/scripts/common.sh
+
+# prepare environment
+export CONNECTOR_RPC_ENDPOINT="localhost:50051"
+export CONNECTOR_LIBS_PATH="./connector-node/libs"
+
+while getopts 'p:' opt; do
+ case ${opt} in
+ p )
+ profile=$OPTARG
+ ;;
+ \? )
+ echo "Invalid Option: -$OPTARG" 1>&2
+ exit 1
+ ;;
+ : )
+ echo "Invalid option: $OPTARG requires an argument" 1>&2
+ ;;
+ esac
+done
+shift $((OPTIND -1))
+
+download_and_prepare_rw "$profile" source
+
+echo "--- Download connector node package"
+buildkite-agent artifact download risingwave-connector.tar.gz ./
+mkdir ./connector-node
+tar xf ./risingwave-connector.tar.gz -C ./connector-node
+
+echo "--- e2e, ci-1cn-1fe, iceberg cdc"
+
+node_port=50051
+node_timeout=10
+
+wait_for_connector_node_start() {
+ start_time=$(date +%s)
+ while :
+ do
+ if nc -z localhost $node_port; then
+ echo "Port $node_port is open! Connector Node is up!"
+ break
+ fi
+
+ current_time=$(date +%s)
+ elapsed_time=$((current_time - start_time))
+ if [ $elapsed_time -ge $node_timeout ]; then
+ echo "Timeout waiting for port $node_port to open!"
+ exit 1
+ fi
+ sleep 0.1
+ done
+ sleep 2
+}
+
+echo "--- starting risingwave cluster with connector node"
+
+RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
+cargo make ci-start ci-1cn-1fe-with-recovery
+./connector-node/start-service.sh -p $node_port > .risingwave/log/connector-node.log 2>&1 &
+echo "waiting for connector node to start"
+wait_for_connector_node_start
+
+# prepare minio iceberg sink
+echo "--- preparing iceberg"
+.risingwave/bin/mcli -C .risingwave/config/mcli mb hummock-minio/icebergdata
+
+cd e2e_test/iceberg
+bash ./start_spark_connect_server.sh
+
+# Don't remove the `--quiet` option since poetry has a bug when printing output, see
+# https://github.com/python-poetry/poetry/issues/3412
+"$HOME"/.local/bin/poetry update --quiet
+
+# 1. import data to mysql
+mysql --host=mysql --port=3306 -u root -p123456 < ./test_case/cdc/mysql_cdc.sql
+
+# 2. create table and sink
+"$HOME"/.local/bin/poetry run python main.py -t ./test_case/cdc/no_partition_cdc_init.toml
+
+# 3. insert new data to mysql
+mysql --host=mysql --port=3306 -u root -p123456 < ./test_case/cdc/mysql_cdc_insert.sql
+
+sleep 20
+
+# 4. check change
+"$HOME"/.local/bin/poetry run python main.py -t ./test_case/cdc/no_partition_cdc.toml
\ No newline at end of file
diff --git a/ci/scripts/e2e-kafka-sink-test.sh b/ci/scripts/e2e-kafka-sink-test.sh
index 06ef185f46e8b..71a91f2d8fba9 100755
--- a/ci/scripts/e2e-kafka-sink-test.sh
+++ b/ci/scripts/e2e-kafka-sink-test.sh
@@ -3,10 +3,10 @@
# Exits as soon as any line fails.
set -euo pipefail
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only --create > /dev/null 2>&1
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --create > /dev/null 2>&1
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --create > /dev/null 2>&1
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --create > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-append-only --create > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert --create > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert-schema --create > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-debezium --create > /dev/null 2>&1
sqllogictest -p 4566 -d dev 'e2e_test/sink/kafka/create_sink.slt'
sleep 2
@@ -14,7 +14,7 @@ sleep 2
# test append-only kafka sink
echo "testing append-only kafka sink"
diff ./e2e_test/sink/kafka/append_only1.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only --from-beginning --max-messages 10 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-append-only --from-beginning --max-messages 10 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for append-only sink is not as expected."
exit 1
@@ -23,7 +23,7 @@ fi
# test upsert kafka sink
echo "testing upsert kafka sink"
diff ./e2e_test/sink/kafka/upsert1.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --from-beginning --property print.key=true --max-messages 10 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert --from-beginning --property print.key=true --max-messages 10 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for upsert sink is not as expected."
exit 1
@@ -32,7 +32,7 @@ fi
# test upsert kafka sink with schema
echo "testing upsert kafka sink with schema"
diff ./e2e_test/sink/kafka/upsert_schema1.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 10 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 10 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for upsert sink with schema is not as expected."
exit 1
@@ -40,7 +40,7 @@ fi
# test debezium kafka sink
echo "testing debezium kafka sink"
-(./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 10 | sort) > ./e2e_test/sink/kafka/debezium1.tmp.result 2> /dev/null
+(./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 10 | sort) > ./e2e_test/sink/kafka/debezium1.tmp.result 2> /dev/null
python3 e2e_test/sink/kafka/debezium.py e2e_test/sink/kafka/debezium1.result e2e_test/sink/kafka/debezium1.tmp.result
if [ $? -ne 0 ]; then
echo "The output for debezium sink is not as expected."
@@ -57,7 +57,7 @@ psql -h localhost -p 4566 -d dev -U root -c "update t_kafka set v_varchar = '',
# test append-only kafka sink after update
echo "testing append-only kafka sink after updating data"
diff ./e2e_test/sink/kafka/append_only2.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only --from-beginning --max-messages 11 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-append-only --from-beginning --max-messages 11 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for append-only sink after update is not as expected."
exit 1
@@ -66,7 +66,7 @@ fi
# test upsert kafka sink after update
echo "testing upsert kafka sink after updating data"
diff ./e2e_test/sink/kafka/upsert2.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --from-beginning --property print.key=true --max-messages 11 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert --from-beginning --property print.key=true --max-messages 11 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for upsert sink after update is not as expected."
exit 1
@@ -75,7 +75,7 @@ fi
# test upsert kafka sink with schema after update
echo "testing upsert kafka sink with schema after updating data"
diff ./e2e_test/sink/kafka/upsert_schema2.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 11 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 11 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for upsert sink with schema is not as expected."
exit 1
@@ -83,7 +83,7 @@ fi
# test debezium kafka sink after update
echo "testing debezium kafka sink after updating data"
-(./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 11 | sort) > ./e2e_test/sink/kafka/debezium2.tmp.result 2> /dev/null
+(./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 11 | sort) > ./e2e_test/sink/kafka/debezium2.tmp.result 2> /dev/null
python3 e2e_test/sink/kafka/debezium.py e2e_test/sink/kafka/debezium2.result e2e_test/sink/kafka/debezium2.tmp.result
if [ $? -ne 0 ]; then
echo "The output for debezium sink after update is not as expected."
@@ -100,7 +100,7 @@ psql -h localhost -p 4566 -d dev -U root -c "delete from t_kafka where id = 1;"
# test upsert kafka sink after delete
echo "testing upsert kafka sink after deleting data"
diff ./e2e_test/sink/kafka/upsert3.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --from-beginning --property print.key=true --max-messages 12 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert --from-beginning --property print.key=true --max-messages 12 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for upsert sink after update is not as expected."
exit 1
@@ -109,7 +109,7 @@ fi
# test upsert kafka sink with schema after delete
echo "testing upsert kafka sink with schema after deleting data"
diff ./e2e_test/sink/kafka/upsert_schema3.result \
-<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 12 | sort) 2> /dev/null)
+<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 12 | sort) 2> /dev/null)
if [ $? -ne 0 ]; then
echo "The output for upsert sink with schema is not as expected."
exit 1
@@ -117,7 +117,7 @@ fi
# test debezium kafka sink after delete
echo "testing debezium kafka sink after deleting data"
-(./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 13 | sort) > ./e2e_test/sink/kafka/debezium3.tmp.result 2> /dev/null
+(./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 13 | sort) > ./e2e_test/sink/kafka/debezium3.tmp.result 2> /dev/null
python3 e2e_test/sink/kafka/debezium.py e2e_test/sink/kafka/debezium3.result e2e_test/sink/kafka/debezium3.tmp.result
if [ $? -ne 0 ]; then
echo "The output for debezium sink after delete is not as expected."
@@ -128,13 +128,13 @@ else
fi
sqllogictest -p 4566 -d dev 'e2e_test/sink/kafka/drop_sink.slt'
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only --delete > /dev/null 2>&1
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --delete > /dev/null 2>&1
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --delete > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-append-only --delete > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-upsert --delete > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-debezium --delete > /dev/null 2>&1
# test different encoding
echo "testing protobuf"
cp src/connector/src/test_data/proto_recursive/recursive.pb ./proto-recursive
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only-protobuf --create > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-append-only-protobuf --create > /dev/null 2>&1
sqllogictest -p 4566 -d dev 'e2e_test/sink/kafka/protobuf.slt'
-./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only-protobuf --delete > /dev/null 2>&1
+./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server message_queue:29092 --topic test-rw-sink-append-only-protobuf --delete > /dev/null 2>&1
diff --git a/ci/scripts/e2e-sink-test.sh b/ci/scripts/e2e-sink-test.sh
index 2dc02f0eada7a..ce2cc46381eba 100755
--- a/ci/scripts/e2e-sink-test.sh
+++ b/ci/scripts/e2e-sink-test.sh
@@ -57,7 +57,7 @@ node_port=50051
node_timeout=10
echo "--- starting risingwave cluster with connector node"
-cargo make ci-start ci-kafka
+cargo make ci-start ci-1cn-1fe
./connector-node/start-service.sh -p $node_port > .risingwave/log/connector-node.log 2>&1 &
echo "waiting for connector node to start"
diff --git a/ci/scripts/run-micro-benchmarks.sh b/ci/scripts/run-micro-benchmarks.sh
index 568c90de425ca..371cc416e7ac5 100755
--- a/ci/scripts/run-micro-benchmarks.sh
+++ b/ci/scripts/run-micro-benchmarks.sh
@@ -46,6 +46,8 @@ main() {
echo "--- Getting aws instance type"
local instance_type=$(get_instance_type)
echo "instance_type: $instance_type"
+ echo "$instance_type" > microbench_instance_type.txt
+ buildkite-agent artifact upload ./microbench_instance_type.txt
if [[ $instance_type != "m6i.4xlarge" ]]; then
echo "Only m6i.4xlarge is supported, skipping microbenchmark"
exit 0
diff --git a/ci/scripts/upload-micro-bench-results.sh b/ci/scripts/upload-micro-bench-results.sh
index 2644ca936c5da..e72b69950bb7b 100755
--- a/ci/scripts/upload-micro-bench-results.sh
+++ b/ci/scripts/upload-micro-bench-results.sh
@@ -36,6 +36,19 @@ get_commit() {
| sed 's/\"//g'
}
+get_machine() {
+ buildkite-agent artifact download microbench_instance_type.txt ./
+ cat ./microbench_instance_type.txt
+}
+
+echo "--- Checking microbench_instance_type"
+INSTANCE_TYPE=$(get_machine)
+echo "instance type: $INSTANCE_TYPE"
+if [[ $INSTANCE_TYPE != "m6i.4xlarge" ]]; then
+ echo "Only m6i.4xlarge is supported, microbenchmark was skipped"
+ exit 0
+fi
+
setup
BUILDKITE_BUILD_URL="https://buildkite.com/risingwavelabs/main-cron/builds/$BUILDKITE_BUILD_NUMBER"
diff --git a/ci/workflows/integration-tests.yml b/ci/workflows/integration-tests.yml
index 4bd0ec1a000b1..455f29b210ec1 100644
--- a/ci/workflows/integration-tests.yml
+++ b/ci/workflows/integration-tests.yml
@@ -29,6 +29,7 @@ steps:
- "postgres-cdc"
- "mysql-sink"
- "postgres-sink"
+ - "iceberg-cdc"
# - "iceberg-sink"
- "debezium-mysql"
format:
@@ -79,6 +80,10 @@ steps:
# testcase: "iceberg-sink"
# format: "protobuf"
# skip: true
+ - with:
+ testcase: "iceberg-cdc"
+ format: "protobuf"
+ skip: true
- with:
testcase: "debezium-mysql"
format: "protobuf"
diff --git a/ci/workflows/pull-request.yml b/ci/workflows/pull-request.yml
index 985bd0be4b822..3aaa09f0d7716 100644
--- a/ci/workflows/pull-request.yml
+++ b/ci/workflows/pull-request.yml
@@ -209,6 +209,21 @@ steps:
timeout_in_minutes: 10
retry: *auto-retry
+ - label: "end-to-end iceberg cdc test"
+ if: build.pull_request.labels includes "ci/run-e2e-iceberg-sink-tests"
+ command: "ci/scripts/e2e-iceberg-cdc.sh -p ci-dev"
+ depends_on:
+ - "build"
+ - "build-other"
+ plugins:
+ - docker-compose#v4.9.0:
+ run: sink-test-env
+ config: ci/docker-compose.yml
+ mount-buildkite-agent: true
+ - ./ci/plugins/upload-failure-logs
+ timeout_in_minutes: 10
+ retry: *auto-retry
+
- label: "end-to-end pulsar sink test"
if: build.pull_request.labels includes "ci/run-e2e-pulsar-sink-tests"
command: "ci/scripts/e2e-pulsar-sink-test.sh -p ci-dev"
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 89aa99a1c8b5d..4dbd5fe5bb28d 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -2,7 +2,7 @@
version: "3"
services:
compactor-0:
- image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.2.0}"
+ image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.3.0}"
command:
- compactor-node
- "--listen-addr"
@@ -37,7 +37,7 @@ services:
timeout: 5s
retries: 5
compute-node-0:
- image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.2.0}"
+ image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.3.0}"
command:
- compute-node
- "--listen-addr"
@@ -122,7 +122,7 @@ services:
timeout: 5s
retries: 5
frontend-node-0:
- image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.2.0}"
+ image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.3.0}"
command:
- frontend-node
- "--listen-addr"
@@ -179,7 +179,7 @@ services:
timeout: 5s
retries: 5
meta-node-0:
- image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.2.0}"
+ image: "ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.3.0}"
command:
- meta-node
- "--listen-addr"
@@ -260,6 +260,7 @@ services:
MINIO_PROMETHEUS_URL: "http://prometheus-0:9500"
MINIO_ROOT_PASSWORD: hummockadmin
MINIO_ROOT_USER: hummockadmin
+ MINIO_DOMAIN: "minio-0"
container_name: minio-0
healthcheck:
test:
@@ -295,7 +296,7 @@ services:
timeout: 5s
retries: 5
connector-node:
- image: ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.2.0}
+ image: ghcr.io/risingwavelabs/risingwave:${RW_IMAGE_VERSION:-v1.3.0}
entrypoint: "/risingwave/bin/connector-node/start-service.sh"
ports:
- 50051
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
index 4ecc756131dff..7d072e7da2e44 100644
--- a/docs/developer-guide.md
+++ b/docs/developer-guide.md
@@ -2,7 +2,7 @@
This guide is intended to be used by contributors to learn about how to develop RisingWave. The instructions about how to submit code changes are included in [contributing guidelines](../CONTRIBUTING.md).
-If you have questions, you can search for existing discussions or start a new discussion in the [Discussions forum of RisingWave](https://github.com/risingwavelabs/risingwave/discussions), or ask in the RisingWave Community channel on Slack. Please use the [invitation link](https://join.slack.com/t/risingwave-community/shared_invite/zt-120rft0mr-d8uGk3d~NZiZAQWPnElOfw) to join the channel.
+If you have questions, you can search for existing discussions or start a new discussion in the [Discussions forum of RisingWave](https://github.com/risingwavelabs/risingwave/discussions), or ask in the RisingWave Community channel on Slack. Please use the [invitation link](https://risingwave.com/slack) to join the channel.
To report bugs, create a [GitHub issue](https://github.com/risingwavelabs/risingwave/issues/new/choose).
diff --git a/e2e_test/iceberg/main.py b/e2e_test/iceberg/main.py
index fa07aa367a9b3..3f3120227e6e7 100644
--- a/e2e_test/iceberg/main.py
+++ b/e2e_test/iceberg/main.py
@@ -42,14 +42,16 @@ def init_iceberg_table(args,init_sqls):
spark.sql(sql)
-def init_risingwave_mv(args,slt):
+def execute_slt(args,slt):
+ if slt is None or slt == "":
+ return
rw_config = args['risingwave']
cmd = f"sqllogictest -p {rw_config['port']} -d {rw_config['db']} {slt}"
print(f"Command line is [{cmd}]")
subprocess.run(cmd,
shell=True,
check=True)
- time.sleep(10)
+ time.sleep(30)
def verify_result(args,verify_sql,verify_schema,verify_data):
@@ -110,6 +112,6 @@ def drop_table(args,drop_sqls):
print({section: dict(config[section]) for section in config.sections()})
init_iceberg_table(config,init_sqls)
- init_risingwave_mv(config,slt)
+ execute_slt(config,slt)
verify_result(config,verify_sql,verify_schema,verify_data)
drop_table(config,drop_sqls)
diff --git a/e2e_test/iceberg/test_case/cdc/load.slt b/e2e_test/iceberg/test_case/cdc/load.slt
new file mode 100644
index 0000000000000..caefd1326bbda
--- /dev/null
+++ b/e2e_test/iceberg/test_case/cdc/load.slt
@@ -0,0 +1,46 @@
+# CDC source basic test
+
+# enable cdc backfill in ci
+statement ok
+set cdc_backfill='true';
+
+statement ok
+create table products ( id INT,
+ name STRING,
+ description STRING,
+ PRIMARY KEY (id)
+) with (
+ connector = 'mysql-cdc',
+ hostname = 'mysql',
+ port = '3306',
+ username = 'root',
+ password = '123456',
+ database.name = 'my@db',
+ table.name = 'products',
+ server.id = '5085'
+);
+
+
+statement ok
+CREATE SINK s1 AS select * from products WITH (
+ connector = 'iceberg',
+ type = 'upsert',
+ force_append_only = 'false',
+ database.name = 'demo',
+ table.name = 'demo_db.demo_table',
+ catalog.type = 'storage',
+ warehouse.path = 's3://icebergdata/demo',
+ s3.endpoint = 'http://127.0.0.1:9301',
+ s3.region = 'us-east-1',
+ s3.access.key = 'hummockadmin',
+ s3.secret.key = 'hummockadmin',
+ primary_key = 'id'
+);
+
+query I
+select count(*) from products;
+----
+8
+
+statement ok
+flush;
diff --git a/e2e_test/iceberg/test_case/cdc/mysql_cdc.sql b/e2e_test/iceberg/test_case/cdc/mysql_cdc.sql
new file mode 100644
index 0000000000000..b7b6f13af83cf
--- /dev/null
+++ b/e2e_test/iceberg/test_case/cdc/mysql_cdc.sql
@@ -0,0 +1,21 @@
+DROP DATABASE IF EXISTS `my@db`;
+CREATE DATABASE `my@db`;
+
+USE `my@db`;
+
+CREATE TABLE products (
+ id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ name VARCHAR(255) NOT NULL,
+ description VARCHAR(512)
+);
+
+ALTER TABLE products AUTO_INCREMENT = 101;
+
+INSERT INTO products VALUES (default,"101","101"),
+(default,"102","102"),
+(default,"103","103"),
+(default,"104","104"),
+(default,"105","105"),
+(default,"106","106"),
+(default,"107","107"),
+(default,"108","108")
diff --git a/e2e_test/iceberg/test_case/cdc/mysql_cdc_insert.sql b/e2e_test/iceberg/test_case/cdc/mysql_cdc_insert.sql
new file mode 100644
index 0000000000000..641d6220ea8dc
--- /dev/null
+++ b/e2e_test/iceberg/test_case/cdc/mysql_cdc_insert.sql
@@ -0,0 +1,7 @@
+USE `my@db`;
+
+INSERT INTO products VALUES (default,"109","109"),
+(default,"110","110"),
+(default,"111","111"),
+(default,"112","112"),
+(default,"113","113");
diff --git a/e2e_test/iceberg/test_case/cdc/no_partition_cdc.toml b/e2e_test/iceberg/test_case/cdc/no_partition_cdc.toml
new file mode 100644
index 0000000000000..5ab9647b12eb0
--- /dev/null
+++ b/e2e_test/iceberg/test_case/cdc/no_partition_cdc.toml
@@ -0,0 +1,25 @@
+init_sqls = []
+
+slt = ''
+
+verify_schema = ['int','string','string']
+
+verify_sql = 'SELECT * FROM demo_db.demo_table ORDER BY id ASC'
+
+verify_data = """
+101,101,101
+102,102,102
+103,103,103
+104,104,104
+105,105,105
+106,106,106
+107,107,107
+108,108,108
+109,109,109
+110,110,110
+111,111,111
+112,112,112
+113,113,113
+"""
+
+drop_sqls = []
diff --git a/e2e_test/iceberg/test_case/cdc/no_partition_cdc_init.toml b/e2e_test/iceberg/test_case/cdc/no_partition_cdc_init.toml
new file mode 100644
index 0000000000000..17e5f7497aae5
--- /dev/null
+++ b/e2e_test/iceberg/test_case/cdc/no_partition_cdc_init.toml
@@ -0,0 +1,31 @@
+init_sqls = [
+ 'CREATE SCHEMA IF NOT EXISTS demo_db',
+ 'DROP TABLE IF EXISTS demo_db.demo_table',
+ '''
+ CREATE TABLE demo_db.demo_table (
+ id int,
+ name string,
+ description string
+ ) USING iceberg
+ TBLPROPERTIES ('format-version'='2');
+ '''
+]
+
+slt = 'test_case/cdc/load.slt'
+
+verify_schema = ['int','string','string']
+
+verify_sql = 'SELECT * FROM demo_db.demo_table ORDER BY id ASC'
+
+verify_data = """
+101,101,101
+102,102,102
+103,103,103
+104,104,104
+105,105,105
+106,106,106
+107,107,107
+108,108,108
+"""
+
+drop_sqls = []
diff --git a/e2e_test/sink/kafka/create_sink.slt b/e2e_test/sink/kafka/create_sink.slt
index 25e3a59fdff3a..a1f296774f526 100644
--- a/e2e_test/sink/kafka/create_sink.slt
+++ b/e2e_test/sink/kafka/create_sink.slt
@@ -31,7 +31,7 @@ create connection mock with (
statement error
create sink si_kafka_append_only_conn from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-append-only',
type = 'append-only',
force_append_only = 'true',
@@ -42,7 +42,7 @@ create sink si_kafka_append_only_conn from t_kafka with (
statement ok
create sink si_kafka_append_only_conn from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-append-only',
type = 'append-only',
force_append_only = 'true',
@@ -66,7 +66,7 @@ drop connection mock;
statement error sink cannot be append-only
create sink si_kafka_append_only from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-append-only',
type = 'append-only',
);
@@ -74,7 +74,7 @@ create sink si_kafka_append_only from t_kafka with (
statement ok
create sink si_kafka_append_only from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-append-only',
type = 'append-only',
force_append_only = 'true'
@@ -83,7 +83,7 @@ create sink si_kafka_append_only from t_kafka with (
statement error primary key not defined
create sink si_kafka_upsert from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-upsert',
type = 'upsert',
);
@@ -91,7 +91,7 @@ create sink si_kafka_upsert from t_kafka with (
statement ok
create sink si_kafka_upsert from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-upsert',
type = 'upsert',
primary_key = 'id',
@@ -100,7 +100,7 @@ create sink si_kafka_upsert from t_kafka with (
statement ok
create sink si_kafka_upsert_schema from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-upsert-schema',
primary_key = 'id',
) format upsert encode json (
@@ -110,7 +110,7 @@ create sink si_kafka_upsert_schema from t_kafka with (
statement ok
create sink si_kafka_debezium from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-debezium',
type = 'debezium',
primary_key = 'id',
@@ -119,7 +119,7 @@ create sink si_kafka_debezium from t_kafka with (
statement error primary key not defined
create sink debezium_without_pk from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-debezium',
type = 'debezium',
);
@@ -127,7 +127,7 @@ create sink debezium_without_pk from t_kafka with (
statement ok
create sink multiple_pk from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-debezium',
type = 'debezium',
primary_key = 'id,v_varchar'
@@ -139,7 +139,7 @@ drop sink multiple_pk;
statement error Sink primary key column not found: invalid.
create sink invalid_pk_column from t_kafka with (
connector = 'kafka',
- properties.bootstrap.server = '127.0.0.1:29092',
+ properties.bootstrap.server = 'message_queue:29092',
topic = 'test-rw-sink-debezium',
type = 'debezium',
primary_key = 'id,invalid'
diff --git a/e2e_test/sink/kafka/protobuf.slt b/e2e_test/sink/kafka/protobuf.slt
index f69c4a9d07110..87ab884eddbde 100644
--- a/e2e_test/sink/kafka/protobuf.slt
+++ b/e2e_test/sink/kafka/protobuf.slt
@@ -2,7 +2,7 @@ statement ok
create table from_kafka with (
connector = 'kafka',
topic = 'test-rw-sink-append-only-protobuf',
- properties.bootstrap.server = '127.0.0.1:29092')
+ properties.bootstrap.server = 'message_queue:29092')
format plain encode protobuf (
schema.location = 'file:///risingwave/proto-recursive',
message = 'recursive.AllTypes');
@@ -37,7 +37,7 @@ statement ok
create sink sink0 from into_kafka with (
connector = 'kafka',
topic = 'test-rw-sink-append-only-protobuf',
- properties.bootstrap.server = '127.0.0.1:29092')
+ properties.bootstrap.server = 'message_queue:29092')
format plain encode protobuf (
force_append_only = true,
schema.location = 'file:///risingwave/proto-recursive',
@@ -70,7 +70,7 @@ statement error failed to read file
create sink sink_err from into_kafka with (
connector = 'kafka',
topic = 'test-rw-sink-append-only-protobuf',
- properties.bootstrap.server = '127.0.0.1:29092')
+ properties.bootstrap.server = 'message_queue:29092')
format plain encode protobuf (
force_append_only = true,
schema.location = 'file:///risingwave/proto-recursiv',
@@ -80,7 +80,7 @@ statement error encode extra_column error: field not in proto
create sink sink_err as select 1 as extra_column with (
connector = 'kafka',
topic = 'test-rw-sink-append-only-protobuf',
- properties.bootstrap.server = '127.0.0.1:29092')
+ properties.bootstrap.server = 'message_queue:29092')
format plain encode protobuf (
force_append_only = true,
schema.location = 'file:///risingwave/proto-recursive',
@@ -90,7 +90,7 @@ statement error s3 URL not supported yet
create sink sink_err from into_kafka with (
connector = 'kafka',
topic = 'test-rw-sink-append-only-protobuf',
- properties.bootstrap.server = '127.0.0.1:29092')
+ properties.bootstrap.server = 'message_queue:29092')
format plain encode protobuf (
force_append_only = true,
schema.location = 's3:///risingwave/proto-recursive',
diff --git a/integration_tests/clickhouse-sink/README.md b/integration_tests/clickhouse-sink/README.md
index 607621faefeae..a383f3fba5ee4 100644
--- a/integration_tests/clickhouse-sink/README.md
+++ b/integration_tests/clickhouse-sink/README.md
@@ -23,6 +23,8 @@ docker compose exec clickhouse-server bash /opt/clickhouse/clickhouse-sql/run-sq
- create_mv.sql
- create_sink.sql
+We only support `upsert` sinks with ClickHouse's `CollapsingMergeTree` and `VersionedCollapsingMergeTree` engines.
+
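+For reference, a minimal sketch of what a `CollapsingMergeTree` table looks like on the ClickHouse side (the table and column names here are illustrative, not the ones created by this demo):
+
+```sql
+CREATE TABLE demo_collapsing
+(
+    user_id Int32,
+    target_id String,
+    event_timestamp DateTime64(6),
+    sign Int8  -- folding marker required by CollapsingMergeTree
+)
+ENGINE = CollapsingMergeTree(sign)
+ORDER BY user_id;
+```
+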
4. Execute a simple query:
```sh
diff --git a/integration_tests/iceberg-cdc/README.md b/integration_tests/iceberg-cdc/README.md
new file mode 100644
index 0000000000000..56f40172c3dfa
--- /dev/null
+++ b/integration_tests/iceberg-cdc/README.md
@@ -0,0 +1,5 @@
+# Iceberg CDC Integration Tests
+
+Data flow: `MySQL -> RisingWave -> Iceberg`
+
+## How to run
+
+./run_test.sh
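+
+To verify the result manually (a sketch; it assumes the stack from `docker-compose.yaml` is running and reuses the connection settings in `python/config.ini`):
+
+```sh
+# Row count ingested into RisingWave from MySQL CDC
+psql -h 127.0.0.1 -p 4566 -d dev -U root -c 'SELECT COUNT(*) FROM user_behaviors;'
+
+# Compare against the Iceberg table via the bundled Spark check script
+cd python && poetry run python check.py
+```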
\ No newline at end of file
diff --git a/integration_tests/iceberg-cdc/docker-compose.yaml b/integration_tests/iceberg-cdc/docker-compose.yaml
new file mode 100644
index 0000000000000..8e9ad1062ef38
--- /dev/null
+++ b/integration_tests/iceberg-cdc/docker-compose.yaml
@@ -0,0 +1,142 @@
+version: '3.8'
+
+services:
+ compactor-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: compactor-0
+ compute-node-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: compute-node-0
+ etcd-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: etcd-0
+ frontend-node-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: frontend-node-0
+ meta-node-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: meta-node-0
+ grafana-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: grafana-0
+ prometheus-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: prometheus-0
+ minio-0:
+ extends:
+ file: ../../docker/docker-compose.yml
+ service: minio-0
+ mc:
+ depends_on:
+ - minio-0
+ image: minio/mc
+ environment:
+ - AWS_ACCESS_KEY_ID=hummockadmin
+ - AWS_SECRET_ACCESS_KEY=hummockadmin
+ - AWS_REGION=us-east-1
+ entrypoint: >
+ /bin/sh -c "
+ until (/usr/bin/mc config host add minio http://minio-0:9301 hummockadmin hummockadmin) do echo '...waiting...' && sleep 1; done;
+ /usr/bin/mc rm -r --force minio/icebergdata;
+ /usr/bin/mc mb minio/icebergdata;
+ /usr/bin/mc anonymous set public minio/icebergdata;
+ tail -f /dev/null
+ "
+
+ mysql:
+ image: mysql:8.0
+ expose:
+ - 3306
+ ports:
+ - "3306:3306"
+ environment:
+ - MYSQL_ROOT_PASSWORD=123456
+ - MYSQL_USER=mysqluser
+ - MYSQL_PASSWORD=mysqlpw
+ - MYSQL_DATABASE=mydb
+ healthcheck:
+ test: [ "CMD-SHELL", "mysqladmin ping -h 127.0.0.1 -u root -p123456" ]
+ interval: 5s
+ timeout: 5s
+ retries: 5
+ container_name: mysql
+ prepare_mysql:
+ image: mysql:8.0
+ depends_on:
+ - mysql
+ command:
+ - /bin/sh
+ - -c
+ - "mysql -p123456 -h mysql mydb < mysql_prepare.sql"
+ volumes:
+ - "./mysql_prepare.sql:/mysql_prepare.sql"
+ container_name: prepare_mysql
+ restart: on-failure
+
+ rest:
+ image: tabulario/iceberg-rest:0.6.0
+ environment:
+ - AWS_ACCESS_KEY_ID=hummockadmin
+ - AWS_SECRET_ACCESS_KEY=hummockadmin
+ - AWS_REGION=us-east-1
+ - CATALOG_CATOLOG__IMPL=org.apache.iceberg.jdbc.JdbcCatalog
+ - CATALOG_URI=jdbc:sqlite:file:/tmp/iceberg_rest_mode=memory
+ - CATALOG_WAREHOUSE=s3://icebergdata/demo
+ - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+ - CATALOG_S3_ENDPOINT=http://minio-0:9301
+ depends_on:
+ - minio-0
+ # let the REST catalog access MinIO through: icebergdata.minio-0
+ links:
+ - minio-0:icebergdata.minio-0
+ expose:
+ - 8181
+ ports:
+ - "8181:8181"
+
+ spark:
+ depends_on:
+ - minio-0
+ - rest
+ image: ghcr.io/icelake-io/icelake-spark:latest
+ environment:
+ - AWS_ACCESS_KEY_ID=hummockadmin
+ - AWS_SECRET_ACCESS_KEY=hummockadmin
+ - AWS_REGION=us-east-1
+ - SPARK_HOME=/opt/spark
+ - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/spark/bin:/opt/spark/sbin
+ user: root
+ links:
+ - minio-0:icebergdata.minio-0
+ expose:
+ - 15002
+ ports:
+ - "15002:15002"
+ healthcheck:
+ test: netstat -ltn | grep -c 15002
+ interval: 1s
+ retries: 1200
+ volumes:
+ - ./spark:/spark
+ command: [ "bash", "/spark/spark-connect-server.sh" ]
+
+volumes:
+ compute-node-0:
+ external: false
+ etcd-0:
+ external: false
+ grafana-0:
+ external: false
+ minio-0:
+ external: false
+ prometheus-0:
+ external: false
+ spark:
+ external: false
diff --git a/integration_tests/iceberg-cdc/mysql_prepare.sql b/integration_tests/iceberg-cdc/mysql_prepare.sql
new file mode 100644
index 0000000000000..3e5a236a41205
--- /dev/null
+++ b/integration_tests/iceberg-cdc/mysql_prepare.sql
@@ -0,0 +1,15 @@
+-- mysql -p123456 -uroot -h 127.0.0.1 mydb < mysql_prepare.sql
+--
+-- Mysql
+USE mydb;
+
+CREATE TABLE user_behaviors (
+ user_id VARCHAR(60),
+ target_id VARCHAR(60),
+ target_type VARCHAR(60),
+ event_timestamp VARCHAR(100),
+ behavior_type VARCHAR(60),
+ parent_target_type VARCHAR(60),
+ parent_target_id VARCHAR(60),
+ PRIMARY KEY(user_id, target_id, event_timestamp)
+);
diff --git a/integration_tests/iceberg-cdc/python/check.py b/integration_tests/iceberg-cdc/python/check.py
new file mode 100644
index 0000000000000..699fa4df29c30
--- /dev/null
+++ b/integration_tests/iceberg-cdc/python/check.py
@@ -0,0 +1,25 @@
+from pyspark.sql import SparkSession
+import configparser
+import psycopg2
+
+def check_spark_table(args):
+ expect_row_count = 0
+ rw_config = args['risingwave']
+ with psycopg2.connect(database=rw_config['db'], user=rw_config['user'], host=rw_config['host'],
+ port=rw_config['port']) as conn:
+ with conn.cursor() as cursor:
+ cursor.execute("SELECT COUNT(*) FROM user_behaviors")
+ expect_row_count = cursor.fetchone()[0]
+ print(f"expect_row_count is {expect_row_count}")
+ spark_config = args['spark']
+ spark = SparkSession.builder.remote(spark_config['url']).getOrCreate()
+ actual_row_count = spark.sql("SELECT COUNT(*) FROM s1.t1").collect()[0][0]
+ print(f"actual_row_count is {actual_row_count}")
+ assert actual_row_count==expect_row_count
+
+
+if __name__ == "__main__":
+ config = configparser.ConfigParser()
+ config.read("config.ini")
+ print({section: dict(config[section]) for section in config.sections()})
+ check_spark_table(config)
diff --git a/integration_tests/iceberg-cdc/python/config.ini b/integration_tests/iceberg-cdc/python/config.ini
new file mode 100644
index 0000000000000..bd95eddc5b80e
--- /dev/null
+++ b/integration_tests/iceberg-cdc/python/config.ini
@@ -0,0 +1,8 @@
+[spark]
+url=sc://localhost:15002
+
+[risingwave]
+db=dev
+user=root
+host=127.0.0.1
+port=4566
diff --git a/integration_tests/iceberg-cdc/python/init.py b/integration_tests/iceberg-cdc/python/init.py
new file mode 100644
index 0000000000000..289fa2f161889
--- /dev/null
+++ b/integration_tests/iceberg-cdc/python/init.py
@@ -0,0 +1,103 @@
+from pyspark.sql import SparkSession
+import configparser
+import psycopg2
+
+
+def init_spark_table(args):
+ spark_config = args['spark']
+ spark = SparkSession.builder.remote(spark_config['url']).getOrCreate()
+
+ init_table_sqls = [
+ "CREATE SCHEMA IF NOT EXISTS s1",
+ "DROP TABLE IF EXISTS s1.t1",
+ """
+ CREATE TABLE s1.t1
+ (
+ user_id string,
+ target_id string,
+ target_type string,
+ event_timestamp string,
+ behavior_type string,
+ parent_target_type string,
+ parent_target_id string
+ ) USING iceberg
+ TBLPROPERTIES ('format-version'='2');
+ """,
+ ]
+
+ for sql in init_table_sqls:
+ print(f"Executing sql: {sql}")
+ spark.sql(sql)
+
+
+def init_risingwave_mv(args):
+ rw_config = args['risingwave']
+ sqls = [
+ "set streaming_parallelism = 4",
+ """
+ CREATE TABLE user_behaviors (
+ user_id VARCHAR,
+ target_id VARCHAR,
+ target_type VARCHAR,
+ event_timestamp VARCHAR,
+ behavior_type VARCHAR,
+ parent_target_type VARCHAR,
+ parent_target_id VARCHAR,
+ PRIMARY KEY(user_id, target_id, event_timestamp)
+ ) with (
+ connector = 'mysql-cdc',
+ hostname = 'mysql',
+ port = '3306',
+ username = 'root',
+ password = '123456',
+ database.name = 'mydb',
+ table.name = 'user_behaviors',
+ server.id = '1'
+ );
+ """,
+ # f"""
+ # CREATE SINK s1
+ # AS SELECT * FROM user_behaviors
+ # WITH (
+ # connector='iceberg',
+ # type='upsert',
+ # primary_key = 'user_id, target_id, event_timestamp',
+ # catalog.type = 'storage',
+ # s3.endpoint = 'http://minio-0:9301',
+ # s3.access.key = 'hummockadmin',
+ # s3.secret.key = 'hummockadmin',
+ # database.name='demo',
+ # table.name='s1.t1',warehouse.path = 's3://hummock001/icebergdata/demo',s3.region = 'us-east-1'
+ # );
+ # """
+ f"""
+ CREATE SINK s1
+ AS SELECT * FROM user_behaviors
+ WITH (
+ connector='iceberg',
+ type='upsert',
+ primary_key = 'user_id, target_id, event_timestamp',
+ catalog.type = 'rest',
+ catalog.uri = 'http://rest:8181',
+ s3.endpoint = 'http://minio-0:9301',
+ s3.access.key = 'hummockadmin',
+ s3.secret.key = 'hummockadmin',
+ database.name='demo',
+ table.name='s1.t1',warehouse.path = 's3://icebergdata/demo/s1/t1',s3.region = 'us-east-1'
+ );
+ """
+ ]
+ with psycopg2.connect(database=rw_config['db'], user=rw_config['user'], host=rw_config['host'],
+ port=rw_config['port']) as conn:
+ with conn.cursor() as cursor:
+ for sql in sqls:
+ print(f"Executing sql {sql}")
+ cursor.execute(sql)
+
+
+if __name__ == "__main__":
+ config = configparser.ConfigParser()
+ config.read("config.ini")
+ print({section: dict(config[section]) for section in config.sections()})
+ init_spark_table(config)
+ init_risingwave_mv(config)
diff --git a/integration_tests/iceberg-cdc/python/pyproject.toml b/integration_tests/iceberg-cdc/python/pyproject.toml
new file mode 100644
index 0000000000000..4c7bce1165796
--- /dev/null
+++ b/integration_tests/iceberg-cdc/python/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "icelake-integration-tests"
+version = "0.0.9"
+description = ""
+authors = ["Renjie Liu "]
+readme = "README.md"
+packages = [{include = "icelake_integration_tests"}]
+
+[tool.poetry.dependencies]
+python = "^3.11"
+pyspark = { version = "3.4.1", extras = ["sql", "connect"] }
+psycopg2-binary = "^2.9"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/integration_tests/iceberg-cdc/run_test.sh b/integration_tests/iceberg-cdc/run_test.sh
new file mode 100755
index 0000000000000..2d8b691bc7284
--- /dev/null
+++ b/integration_tests/iceberg-cdc/run_test.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Start test environment.
+docker-compose up -d --wait
+
+# Enable `set -ex` only after the environment is up, so the script does not exit early on an unhealthy container.
+set -ex
+
+# Generate data
+docker build -t iceberg-cdc-datagen ../datagen
+timeout 20 docker run --network=iceberg-cdc_default iceberg-cdc-datagen /datagen --mode clickstream --qps 1 mysql --user mysqluser --password mysqlpw --host mysql --port 3306 --db mydb &
+
+cd python
+poetry update --quiet
+# Init source, mv, and sink.
+poetry run python init.py
+# Wait for the sink to finish.
+sleep 40;
+poetry run python check.py
diff --git a/integration_tests/iceberg-cdc/spark/.gitignore b/integration_tests/iceberg-cdc/spark/.gitignore
new file mode 100644
index 0000000000000..51dcf07222856
--- /dev/null
+++ b/integration_tests/iceberg-cdc/spark/.gitignore
@@ -0,0 +1,3 @@
+derby.log
+metastore_db
+.ivy
\ No newline at end of file
diff --git a/integration_tests/iceberg-cdc/spark/spark-connect-server.sh b/integration_tests/iceberg-cdc/spark/spark-connect-server.sh
new file mode 100755
index 0000000000000..7c1cd64f1a2f2
--- /dev/null
+++ b/integration_tests/iceberg-cdc/spark/spark-connect-server.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+set -ex
+
+JARS=$(find /opt/spark/deps -type f -name "*.jar" | tr '\n' ':')
+
+/opt/spark/sbin/start-connect-server.sh \
+ --master local[3] \
+ --driver-class-path $JARS \
+ --conf spark.driver.bindAddress=0.0.0.0 \
+ --conf spark.sql.catalog.demo=org.apache.iceberg.spark.SparkCatalog \
+ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
+ --conf spark.sql.catalog.demo.catalog-impl=org.apache.iceberg.rest.RESTCatalog \
+ --conf spark.sql.catalog.demo.uri=http://rest:8181 \
+ --conf spark.sql.catalog.demo.s3.endpoint=http://minio-0:9301 \
+ --conf spark.sql.catalog.demo.s3.path.style.access=true \
+ --conf spark.sql.catalog.demo.s3.access.key=hummockadmin \
+ --conf spark.sql.catalog.demo.s3.secret.key=hummockadmin \
+ --conf spark.sql.defaultCatalog=demo
+
+tail -f /opt/spark/logs/spark*.out
diff --git a/integration_tests/redis-sink/create_sink.sql b/integration_tests/redis-sink/create_sink.sql
index 03bfc2d0b0df1..2ba9ba67feb39 100644
--- a/integration_tests/redis-sink/create_sink.sql
+++ b/integration_tests/redis-sink/create_sink.sql
@@ -3,19 +3,13 @@ FROM
bhv_mv WITH (
primary_key = 'user_id',
connector = 'redis',
- type = 'append-only',
- force_append_only='true',
redis.url= 'redis://127.0.0.1:6379/',
-);
+)FORMAT PLAIN ENCODE JSON(force_append_only='true');
CREATE SINK bhv_redis_sink_2
FROM
bhv_mv WITH (
primary_key = 'user_id',
connector = 'redis',
- type = 'append-only',
- force_append_only='true',
redis.url= 'redis://127.0.0.1:6379/',
- redis.keyformat='user_id:{user_id}',
- redis.valueformat='username:{username},event_timestamp{event_timestamp}'
-);
\ No newline at end of file
+)FORMAT PLAIN ENCODE TEMPLATE(force_append_only='true', key_format = 'UserID:{user_id}', value_format = 'TargetID:{target_id},EventTimestamp{event_timestamp}');
\ No newline at end of file
diff --git a/integration_tests/scripts/run_demos.py b/integration_tests/scripts/run_demos.py
index 28623f7ddc4a7..da2519e18db44 100644
--- a/integration_tests/scripts/run_demos.py
+++ b/integration_tests/scripts/run_demos.py
@@ -42,6 +42,13 @@ def run_demo(demo: str, format: str, wait_time = 40):
run_sql_file(sql_file, demo_dir)
sleep(10)
+def iceberg_cdc_demo():
+ demo = "iceberg-cdc"
+ file_dir = dirname(abspath(__file__))
+ project_dir = dirname(file_dir)
+ demo_dir = os.path.join(project_dir, demo)
+ print("Running demo: iceberg-cdc")
+ subprocess.run(["bash","./run_test.sh"], cwd=demo_dir, check=True)
def run_iceberg_demo():
demo = "iceberg-sink"
@@ -149,5 +156,7 @@ def run_clickhouse_demo():
run_iceberg_demo()
elif args.case == "clickhouse-sink":
run_clickhouse_demo()
+elif args.case == "iceberg-cdc":
+ iceberg_cdc_demo()
else:
run_demo(args.case, args.format)
diff --git a/proto/ddl_service.proto b/proto/ddl_service.proto
index 27c9f2ee82f83..1efc933a7d033 100644
--- a/proto/ddl_service.proto
+++ b/proto/ddl_service.proto
@@ -314,6 +314,10 @@ message GetTablesResponse {
map<uint32, Table> tables = 1;
}
+message WaitRequest {}
+
+message WaitResponse {}
+
service DdlService {
rpc CreateDatabase(CreateDatabaseRequest) returns (CreateDatabaseResponse);
rpc DropDatabase(DropDatabaseRequest) returns (DropDatabaseResponse);
@@ -343,4 +347,5 @@ service DdlService {
rpc ListConnections(ListConnectionsRequest) returns (ListConnectionsResponse);
rpc DropConnection(DropConnectionRequest) returns (DropConnectionResponse);
rpc GetTables(GetTablesRequest) returns (GetTablesResponse);
+ rpc Wait(WaitRequest) returns (WaitResponse);
}
diff --git a/proto/expr.proto b/proto/expr.proto
index 769532d8dbe19..2f252d67c8400 100644
--- a/proto/expr.proto
+++ b/proto/expr.proto
@@ -348,6 +348,7 @@ message AggCall {
MODE = 24;
LAST_VALUE = 25;
GROUPING = 26;
+ INTERNAL_LAST_SEEN_VALUE = 27;
}
Type type = 1;
repeated InputRef args = 2;
diff --git a/proto/plan_common.proto b/proto/plan_common.proto
index a88242a572693..d4c7a2e04f138 100644
--- a/proto/plan_common.proto
+++ b/proto/plan_common.proto
@@ -106,6 +106,7 @@ enum EncodeType {
ENCODE_TYPE_PROTOBUF = 4;
ENCODE_TYPE_JSON = 5;
ENCODE_TYPE_BYTES = 6;
+ ENCODE_TYPE_TEMPLATE = 7;
}
enum RowFormatType {
diff --git a/risedev.yml b/risedev.yml
index a5ba8a7b43f97..135a33f602a6a 100644
--- a/risedev.yml
+++ b/risedev.yml
@@ -685,40 +685,6 @@ profile:
- use: pubsub
persist-data: true
- ci-kafka:
- config-path: src/config/ci.toml
- steps:
- - use: minio
- - use: etcd
- unsafe-no-fsync: true
- - use: meta-node
- - use: compute-node
- enable-tiered-cache: true
- - use: frontend
- - use: compactor
- - use: zookeeper
- persist-data: true
- - use: kafka
- persist-data: true
-
- ci-kafka-plus-pubsub:
- config-path: src/config/ci.toml
- steps:
- - use: minio
- - use: etcd
- unsafe-no-fsync: true
- - use: meta-node
- - use: compute-node
- enable-tiered-cache: true
- - use: frontend
- - use: compactor
- - use: zookeeper
- persist-data: true
- - use: kafka
- persist-data: true
- - use: pubsub
- persist-data: true
-
ci-redis:
config-path: src/config/ci.toml
steps:
diff --git a/src/batch/src/executor/aggregation/filter.rs b/src/batch/src/executor/aggregation/filter.rs
index 2db2320ed3534..9cfbeabffe417 100644
--- a/src/batch/src/executor/aggregation/filter.rs
+++ b/src/batch/src/executor/aggregation/filter.rs
@@ -75,7 +75,7 @@ impl AggregateFunction for Filter {
mod tests {
use risingwave_common::test_prelude::StreamChunkTestExt;
use risingwave_expr::aggregate::{build_append_only, AggCall};
- use risingwave_expr::expr::{build_from_pretty, Expression, LiteralExpression};
+ use risingwave_expr::expr::{build_from_pretty, ExpressionBoxExt, LiteralExpression};
use super::*;
diff --git a/src/batch/src/executor/project_set.rs b/src/batch/src/executor/project_set.rs
index 670933a6bb50c..fa3dfac917e8a 100644
--- a/src/batch/src/executor/project_set.rs
+++ b/src/batch/src/executor/project_set.rs
@@ -171,7 +171,7 @@ mod tests {
use risingwave_common::catalog::{Field, Schema};
use risingwave_common::test_prelude::*;
use risingwave_common::types::DataType;
- use risingwave_expr::expr::{Expression, InputRefExpression, LiteralExpression};
+ use risingwave_expr::expr::{ExpressionBoxExt, InputRefExpression, LiteralExpression};
use risingwave_expr::table_function::repeat;
use super::*;
diff --git a/src/batch/src/executor/source.rs b/src/batch/src/executor/source.rs
index 8bf9fc5b7e610..ae3fc7056a6a6 100644
--- a/src/batch/src/executor/source.rs
+++ b/src/batch/src/executor/source.rs
@@ -159,7 +159,10 @@ impl SourceExecutor {
for chunk in stream {
match chunk {
Ok(chunk) => {
- yield covert_stream_chunk_to_batch_chunk(chunk.chunk)?;
+ let data_chunk = covert_stream_chunk_to_batch_chunk(chunk.chunk)?;
+ if data_chunk.capacity() > 0 {
+ yield data_chunk;
+ }
}
Err(e) => {
return Err(e);
diff --git a/src/batch/src/lib.rs b/src/batch/src/lib.rs
index 9104c96c951f5..809c096eb49df 100644
--- a/src/batch/src/lib.rs
+++ b/src/batch/src/lib.rs
@@ -17,8 +17,8 @@
#![feature(trait_alias)]
#![feature(exact_size_is_empty)]
#![feature(type_alias_impl_trait)]
-#![cfg_attr(coverage, feature(no_coverage))]
-#![feature(generators)]
+#![cfg_attr(coverage, feature(coverage_attribute))]
+#![feature(coroutines)]
#![feature(proc_macro_hygiene, stmt_expr_attributes)]
#![feature(iterator_try_collect)]
#![feature(lint_reasons)]
@@ -27,13 +27,11 @@
#![feature(let_chains)]
#![feature(bound_map)]
#![feature(int_roundings)]
-#![feature(async_fn_in_trait)]
#![feature(allocator_api)]
#![feature(impl_trait_in_assoc_type)]
#![feature(result_option_inspect)]
#![feature(assert_matches)]
#![feature(lazy_cell)]
-#![feature(return_position_impl_trait_in_trait)]
mod error;
pub mod exchange_source;
diff --git a/src/batch/src/rpc/service/task_service.rs b/src/batch/src/rpc/service/task_service.rs
index b49a023acb22b..fb60e352ec293 100644
--- a/src/batch/src/rpc/service/task_service.rs
+++ b/src/batch/src/rpc/service/task_service.rs
@@ -53,7 +53,7 @@ impl TaskService for BatchServiceImpl {
type CreateTaskStream = ReceiverStream<TaskInfoResponseResult>;
type ExecuteStream = ReceiverStream<GetDataResponseResult>;
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn create_task(
&self,
request: Request<CreateTaskRequest>,
@@ -97,7 +97,7 @@ impl TaskService for BatchServiceImpl {
}
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn cancel_task(
&self,
req: Request<CancelTaskRequest>,
@@ -109,7 +109,7 @@ impl TaskService for BatchServiceImpl {
Ok(Response::new(CancelTaskResponse { status: None }))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn execute(
&self,
req: Request<ExecuteRequest>,
diff --git a/src/batch/src/task/task_execution.rs b/src/batch/src/task/task_execution.rs
index 6bd83c5d62c67..445c71ee51d66 100644
--- a/src/batch/src/task/task_execution.rs
+++ b/src/batch/src/task/task_execution.rs
@@ -656,7 +656,7 @@ impl BatchTaskExecution {
let error = error.map(Arc::new);
*self.failure.lock() = error.clone().map(to_rw_error);
- let err_str = error.as_ref().map(|e| format!("{:?}", e));
+ let err_str = error.as_ref().map(|e| e.to_string());
if let Err(e) = sender.close(error).await {
match e {
SenderError => {
diff --git a/src/cmd/src/bin/compactor.rs b/src/cmd/src/bin/compactor.rs
index 21b7db2405e2d..554168d8a6683 100644
--- a/src/cmd/src/bin/compactor.rs
+++ b/src/cmd/src/bin/compactor.rs
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
risingwave_cmd::main!(compactor);
diff --git a/src/cmd/src/bin/compute_node.rs b/src/cmd/src/bin/compute_node.rs
index 0bb1e5211ac57..a24d132b70b94 100644
--- a/src/cmd/src/bin/compute_node.rs
+++ b/src/cmd/src/bin/compute_node.rs
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
risingwave_cmd::main!(compute);
diff --git a/src/cmd/src/bin/ctl.rs b/src/cmd/src/bin/ctl.rs
index 38345c7a3fc2e..7b4c3132e747d 100644
--- a/src/cmd/src/bin/ctl.rs
+++ b/src/cmd/src/bin/ctl.rs
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
risingwave_cmd::main!(ctl);
diff --git a/src/cmd/src/bin/frontend_node.rs b/src/cmd/src/bin/frontend_node.rs
index 32d563be109fc..546bacbf1a901 100644
--- a/src/cmd/src/bin/frontend_node.rs
+++ b/src/cmd/src/bin/frontend_node.rs
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
risingwave_cmd::main!(frontend);
diff --git a/src/cmd/src/bin/meta_node.rs b/src/cmd/src/bin/meta_node.rs
index 032cc6bc28285..4bebfc5f915a2 100644
--- a/src/cmd/src/bin/meta_node.rs
+++ b/src/cmd/src/bin/meta_node.rs
@@ -12,6 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
risingwave_cmd::main!(meta);
diff --git a/src/cmd/src/lib.rs b/src/cmd/src/lib.rs
index 12de26657bd33..93df94a63816a 100644
--- a/src/cmd/src/lib.rs
+++ b/src/cmd/src/lib.rs
@@ -30,7 +30,7 @@ macro_rules! main {
#[cfg(not(enable_task_local_alloc))]
risingwave_common::enable_jemalloc!();
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
fn main() {
let opts = clap::Parser::parse();
$crate::$component(opts);
diff --git a/src/cmd_all/src/bin/risingwave.rs b/src/cmd_all/src/bin/risingwave.rs
index 3e9088e16b9e2..b7693c6fa06a2 100644
--- a/src/cmd_all/src/bin/risingwave.rs
+++ b/src/cmd_all/src/bin/risingwave.rs
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
use std::str::FromStr;
@@ -158,7 +158,7 @@ impl Component {
}
}
-#[cfg_attr(coverage, no_coverage)]
+#[cfg_attr(coverage, coverage(off))]
fn main() -> Result<()> {
let risingwave = || {
command!(BINARY_NAME)
diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml
index ddd1fe5a33cdb..168ba836d4c1b 100644
--- a/src/common/Cargo.toml
+++ b/src/common/Cargo.toml
@@ -49,6 +49,7 @@ hyper = "0.14"
hytra = { workspace = true }
itertools = "0.11"
itoa = "1.0"
+jsonbb = "0.1"
lru = { git = "https://github.com/risingwavelabs/lru-rs.git", rev = "cb2d7c7" }
memcomparable = { version = "0.2", features = ["decimal"] }
num-integer = "0.1"
diff --git a/src/common/proc_macro/src/config.rs b/src/common/proc_macro/src/config.rs
index 285834eb123cf..6e369fbad33eb 100644
--- a/src/common/proc_macro/src/config.rs
+++ b/src/common/proc_macro/src/config.rs
@@ -41,7 +41,7 @@ fn type_is_option(ty: &syn::Type) -> bool {
false
}
-#[cfg_attr(coverage, no_coverage)]
+#[cfg_attr(coverage, coverage(off))]
pub fn produce_override_config(input: DeriveInput) -> TokenStream {
let syn::Data::Struct(syn::DataStruct { fields, .. }) = input.data else {
abort!(input, "Only struct is supported");
diff --git a/src/common/proc_macro/src/lib.rs b/src/common/proc_macro/src/lib.rs
index 060ee1950624e..a11e407c6c053 100644
--- a/src/common/proc_macro/src/lib.rs
+++ b/src/common/proc_macro/src/lib.rs
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
use estimate_size::{
add_trait_bounds, extract_ignored_generics_list, has_nested_flag_attribute_list,
@@ -52,7 +52,7 @@ mod estimate_size;
/// }
/// }
/// ```
-#[cfg_attr(coverage, no_coverage)]
+#[cfg_attr(coverage, coverage(off))]
#[proc_macro_derive(OverrideConfig, attributes(override_opts))]
#[proc_macro_error]
pub fn override_config(input: TokenStream) -> TokenStream {
diff --git a/src/common/src/array/jsonb_array.rs b/src/common/src/array/jsonb_array.rs
index 0e9ba7c48511d..3c4ca23fff04e 100644
--- a/src/common/src/array/jsonb_array.rs
+++ b/src/common/src/array/jsonb_array.rs
@@ -12,36 +12,35 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use std::mem::size_of;
-
use risingwave_pb::data::{PbArray, PbArrayType};
-use serde_json::Value;
-use super::{Array, ArrayBuilder};
+use super::{Array, ArrayBuilder, ArrayImpl, ArrayResult};
use crate::buffer::{Bitmap, BitmapBuilder};
use crate::estimate_size::EstimateSize;
-use crate::types::{DataType, JsonbRef, JsonbVal, F32, F64};
-use crate::util::iter_util::ZipEqFast;
+use crate::types::{DataType, JsonbRef, JsonbVal, Scalar};
#[derive(Debug)]
pub struct JsonbArrayBuilder {
bitmap: BitmapBuilder,
- data: Vec<Value>,
+ builder: jsonbb::Builder,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct JsonbArray {
bitmap: Bitmap,
- data: Vec<Value>,
+ /// Elements are stored as a single JSONB array value.
+ data: jsonbb::Value,
}
impl ArrayBuilder for JsonbArrayBuilder {
type ArrayType = JsonbArray;
fn new(capacity: usize) -> Self {
+ let mut builder = jsonbb::Builder::with_capacity(capacity);
+ builder.begin_array();
Self {
bitmap: BitmapBuilder::with_capacity(capacity),
- data: Vec::with_capacity(capacity),
+ builder,
}
}
@@ -54,13 +53,15 @@ impl ArrayBuilder for JsonbArrayBuilder {
match value {
Some(x) => {
self.bitmap.append_n(n, true);
- self.data
- .extend(std::iter::repeat(x).take(n).map(|x| x.0.clone()));
+ for _ in 0..n {
+ self.builder.add_value(x.0);
+ }
}
None => {
self.bitmap.append_n(n, false);
- self.data
- .extend(std::iter::repeat(*JsonbVal::dummy().0).take(n));
+ for _ in 0..n {
+ self.builder.add_null();
+ }
}
}
}
@@ -69,29 +70,44 @@ impl ArrayBuilder for JsonbArrayBuilder {
for bit in other.bitmap.iter() {
self.bitmap.append(bit);
}
- self.data.extend_from_slice(&other.data);
+ for value in other.data.as_array().unwrap().iter() {
+ self.builder.add_value(value);
+ }
}
fn pop(&mut self) -> Option<()> {
- self.data.pop().map(|_| self.bitmap.pop().unwrap())
+ self.bitmap.pop()?;
+ self.builder.pop();
+ Some(())
}
fn len(&self) -> usize {
self.bitmap.len()
}
- fn finish(self) -> Self::ArrayType {
+ fn finish(mut self) -> Self::ArrayType {
+ self.builder.end_array();
Self::ArrayType {
bitmap: self.bitmap.finish(),
- data: self.data,
+ data: self.builder.finish(),
}
}
}
-impl JsonbArrayBuilder {
- pub fn append_move(&mut self, value: JsonbVal) {
- self.bitmap.append(true);
- self.data.push(*value.0);
+impl JsonbArray {
+ /// Loads a `JsonbArray` from a protobuf array.
+ ///
+ /// See also `JsonbArray::to_protobuf`.
+ pub fn from_protobuf(array: &PbArray) -> ArrayResult<ArrayImpl> {
+ ensure!(
+ array.values.len() == 1,
+ "Must have exactly 1 buffer in a jsonb array"
+ );
+ let arr = JsonbArray {
+ bitmap: array.get_null_bitmap()?.into(),
+ data: jsonbb::Value::from_bytes(&array.values[0].body),
+ };
+ Ok(arr.into())
}
}
@@ -101,52 +117,23 @@ impl Array for JsonbArray {
type RefItem<'a> = JsonbRef<'a>;
unsafe fn raw_value_at_unchecked(&self, idx: usize) -> Self::RefItem<'_> {
- JsonbRef(self.data.get_unchecked(idx))
+ JsonbRef(self.data.as_array().unwrap().get(idx).unwrap())
}
fn len(&self) -> usize {
- self.data.len()
+ self.bitmap.len()
}
fn to_protobuf(&self) -> PbArray {
- // The memory layout contains `serde_json::Value` trees, but in protobuf we transmit this as
- // variable length bytes in value encoding. That is, one buffer of length n+1 containing
- // start and end offsets into the 2nd buffer containing all value bytes concatenated.
-
use risingwave_pb::common::buffer::CompressionType;
use risingwave_pb::common::Buffer;
- let mut offset_buffer =
- Vec::<u8>::with_capacity((1 + self.data.len()) * std::mem::size_of::<u64>());
- let mut data_buffer = Vec::<u8>::with_capacity(self.data.len());
-
- let mut offset = 0;
- for (v, not_null) in self.data.iter().zip_eq_fast(self.null_bitmap().iter()) {
- if !not_null {
- continue;
- }
- let d = JsonbRef(v).value_serialize();
- offset_buffer.extend_from_slice(&(offset as u64).to_be_bytes());
- data_buffer.extend_from_slice(&d);
- offset += d.len();
- }
- offset_buffer.extend_from_slice(&(offset as u64).to_be_bytes());
-
- let values = vec![
- Buffer {
- compression: CompressionType::None as i32,
- body: offset_buffer,
- },
- Buffer {
- compression: CompressionType::None as i32,
- body: data_buffer,
- },
- ];
-
- let null_bitmap = self.null_bitmap().to_protobuf();
PbArray {
- null_bitmap: Some(null_bitmap),
- values,
+ null_bitmap: Some(self.null_bitmap().to_protobuf()),
+ values: vec![Buffer {
+ compression: CompressionType::None as i32,
+ body: self.data.as_bytes().to_vec(),
+ }],
array_type: PbArrayType::Jsonb as i32,
struct_array_data: None,
list_array_data: None,
@@ -176,7 +163,7 @@ impl FromIterator<Option<JsonbVal>> for JsonbArray {
let mut builder = <Self as Array>::Builder::new(iter.size_hint().0);
for i in iter {
match i {
- Some(x) => builder.append_move(x),
+ Some(x) => builder.append(Some(x.as_scalar_ref())),
None => builder.append(None),
}
}
@@ -190,31 +177,8 @@ impl FromIterator for JsonbArray {
}
}
-// TODO: We need to fix this later.
impl EstimateSize for JsonbArray {
fn estimated_heap_size(&self) -> usize {
- self.bitmap.estimated_heap_size() + self.data.capacity() * size_of::<Value>()
- }
-}
-
-impl From<F32> for Value {
- fn from(v: F32) -> Value {
- serde_json::Number::from_f64(v.0 as f64)
- .expect("todo: convert Inf/NaN to jsonb")
- .into()
- }
-}
-
-impl From<F64> for Value {
- fn from(v: F64) -> Value {
- serde_json::Number::from_f64(v.0)
- .expect("todo: convert Inf/NaN to jsonb")
- .into()
- }
-}
-
-impl From<JsonbRef<'_>> for Value {
- fn from(v: JsonbRef<'_>) -> Value {
- v.0.clone()
+ self.bitmap.estimated_heap_size() + self.data.capacity()
}
}
diff --git a/src/common/src/array/proto_reader.rs b/src/common/src/array/proto_reader.rs
index 55d505343dadd..4ca6bf7b70d05 100644
--- a/src/common/src/array/proto_reader.rs
+++ b/src/common/src/array/proto_reader.rs
@@ -52,9 +52,7 @@ impl ArrayImpl {
PbArrayType::Timestamp => read_timestamp_array(array, cardinality)?,
PbArrayType::Timestamptz => read_timestamptz_array(array, cardinality)?,
PbArrayType::Interval => read_interval_array(array, cardinality)?,
- PbArrayType::Jsonb => {
- read_string_array::<JsonbArrayBuilder, JsonbValueReader>(array, cardinality)?
- }
+ PbArrayType::Jsonb => JsonbArray::from_protobuf(array)?,
PbArrayType::Struct => StructArray::from_protobuf(array)?,
PbArrayType::List => ListArray::from_protobuf(array)?,
PbArrayType::Unspecified => unreachable!(),
diff --git a/src/common/src/array/value_reader.rs b/src/common/src/array/value_reader.rs
index 96ed7c31b88aa..45db47f23242b 100644
--- a/src/common/src/array/value_reader.rs
+++ b/src/common/src/array/value_reader.rs
@@ -19,8 +19,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use super::ArrayResult;
use crate::array::{
- ArrayBuilder, BytesArrayBuilder, JsonbArrayBuilder, PrimitiveArrayItemType, Serial,
- Utf8ArrayBuilder,
+ ArrayBuilder, BytesArrayBuilder, PrimitiveArrayItemType, Serial, Utf8ArrayBuilder,
};
use crate::types::{Decimal, F32, F64};
@@ -89,15 +88,3 @@ impl VarSizedValueReader<BytesArrayBuilder> for BytesValueReader {
Ok(())
}
}
-
-pub struct JsonbValueReader;
-
-impl VarSizedValueReader<JsonbArrayBuilder> for JsonbValueReader {
- fn read(buf: &[u8], builder: &mut JsonbArrayBuilder) -> ArrayResult<()> {
- let Some(v) = super::JsonbVal::value_deserialize(buf) else {
- bail!("failed to read jsonb from bytes");
- };
- builder.append_move(v);
- Ok(())
- }
-}
diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs
index 2a3575d8dae78..fbcd3854fa572 100644
--- a/src/common/src/lib.rs
+++ b/src/common/src/lib.rs
@@ -24,12 +24,11 @@
#![feature(trusted_len)]
#![feature(allocator_api)]
#![feature(lint_reasons)]
-#![feature(generators)]
+#![feature(coroutines)]
#![feature(map_try_insert)]
#![feature(lazy_cell)]
#![feature(error_generic_member_access)]
#![feature(let_chains)]
-#![feature(return_position_impl_trait_in_trait)]
#![feature(portable_simd)]
#![feature(array_chunks)]
#![feature(inline_const_pat)]
@@ -43,7 +42,6 @@
#![feature(result_option_inspect)]
#![feature(map_entry_replace)]
#![feature(negative_impls)]
-#![feature(async_fn_in_trait)]
#![feature(bound_map)]
#![feature(array_methods)]
diff --git a/src/common/src/test_utils/rand_array.rs b/src/common/src/test_utils/rand_array.rs
index 70d0cb73d4dfa..f2dd8ad42854b 100644
--- a/src/common/src/test_utils/rand_array.rs
+++ b/src/common/src/test_utils/rand_array.rs
@@ -135,7 +135,7 @@ impl RandValue for Int256 {
impl RandValue for JsonbVal {
fn rand_value(_rand: &mut R) -> Self {
- JsonbVal::dummy()
+ JsonbVal::null()
}
}
diff --git a/src/common/src/types/jsonb.rs b/src/common/src/types/jsonb.rs
index 7f4c002037060..590b693e47891 100644
--- a/src/common/src/types/jsonb.rs
+++ b/src/common/src/types/jsonb.rs
@@ -15,23 +15,21 @@
use std::fmt;
use std::hash::Hash;
-use postgres_types::{FromSql as _, ToSql as _, Type};
-use serde_json::Value;
+use bytes::Buf;
+use jsonbb::{Value, ValueRef};
use crate::estimate_size::EstimateSize;
-use crate::types::{Scalar, ScalarRef};
+use crate::types::{Scalar, ScalarRef, F32, F64};
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct JsonbVal(pub(crate) Box<Value>); // The `Box` is just to keep `size_of::<JsonbVal>` smaller.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct JsonbVal(pub(crate) Value);
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub struct JsonbRef<'a>(pub(crate) &'a Value);
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct JsonbRef<'a>(pub(crate) ValueRef<'a>);
impl EstimateSize for JsonbVal {
fn estimated_heap_size(&self) -> usize {
- // https://github.com/risingwavelabs/risingwave/issues/8957
- // FIXME: correctly handle jsonb size
- 0
+ self.0.capacity()
}
}
@@ -63,7 +61,7 @@ impl<'a> ScalarRef<'a> for JsonbRef<'a> {
type ScalarType = JsonbVal;
fn to_owned_scalar(&self) -> Self::ScalarType {
- JsonbVal(self.0.clone().into())
+ JsonbVal(self.0.into())
}
fn hash_scalar<H: std::hash::Hasher>(&self, state: &mut H) {
@@ -71,22 +69,6 @@ impl<'a> ScalarRef<'a> for JsonbRef<'a> {
}
}
-impl Hash for JsonbRef<'_> {
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
- // We do not intend to support hashing `jsonb` type.
- // Before #7981 is done, we do not panic but just hash its string representation.
- // Note that `serde_json` without feature `preserve_order` uses `BTreeMap` for json object.
- // So its string form always have keys sorted.
- self.0.to_string().hash(state)
- }
-}
-
-impl Hash for JsonbVal {
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
- self.0.to_string().hash(state)
- }
-}
-
impl PartialOrd for JsonbVal {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
@@ -160,9 +142,7 @@ impl crate::types::to_binary::ToBinary for JsonbRef<'_> {
&self,
_ty: &crate::types::DataType,
) -> crate::error::Result<Option<Bytes>> {
- let mut output = bytes::BytesMut::new();
- self.0.to_sql(&Type::JSONB, &mut output).unwrap();
- Ok(Some(output.freeze()))
+ Ok(Some(self.value_serialize().into()))
}
}
@@ -170,43 +150,130 @@ impl std::str::FromStr for JsonbVal {
type Err = <Value as std::str::FromStr>::Err;
fn from_str(s: &str) -> Result<Self, Self::Err> {
- let v: Value = s.parse()?;
- Ok(Self(v.into()))
+ Ok(Self(s.parse()?))
}
}
impl JsonbVal {
- /// Constructs a value without specific meaning. Usually used as a lightweight placeholder.
- pub fn dummy() -> Self {
- Self(Value::Null.into())
+ /// Returns a jsonb `null`.
+ pub fn null() -> Self {
+ Self(Value::null())
+ }
+
+ /// Returns an empty array `[]`.
+ pub fn empty_array() -> Self {
+ Self(Value::array([]))
+ }
+
+ /// Returns an empty object `{}`.
+ pub fn empty_object() -> Self {
+ Self(Value::object([]))
}
+ /// Deserialize from a memcomparable encoding.
pub fn memcmp_deserialize(
deserializer: &mut memcomparable::Deserializer<impl bytes::Buf>,
) -> memcomparable::Result<Self> {
- let v: Value = <String as serde::Deserialize>::deserialize(deserializer)?
+ let v = <String as serde::Deserialize>::deserialize(deserializer)?
.parse()
.map_err(|_| memcomparable::Error::Message("invalid json".into()))?;
- Ok(Self(v.into()))
+ Ok(Self(v))
+ }
+
+ /// Deserialize from a pgwire "BINARY" encoding.
+ pub fn value_deserialize(mut buf: &[u8]) -> Option<Self> {
+ if buf.is_empty() || buf.get_u8() != 1 {
+ return None;
+ }
+ Value::from_text(buf).ok().map(Self)
+ }
+
+ /// Convert the value to a [`serde_json::Value`].
+ pub fn take(self) -> serde_json::Value {
+ self.0.into()
+ }
+}
+
+impl From<serde_json::Value> for JsonbVal {
+ fn from(v: serde_json::Value) -> Self {
+ Self(v.into())
+ }
+}
+
+impl From<bool> for JsonbVal {
+ fn from(v: bool) -> Self {
+ Self(v.into())
+ }
+}
+
+impl From<i16> for JsonbVal {
+ fn from(v: i16) -> Self {
+ Self(v.into())
}
+}
- pub fn value_deserialize(buf: &[u8]) -> Option {
- let v = Value::from_sql(&Type::JSONB, buf).ok()?;
- Some(Self(v.into()))
+impl From<i32> for JsonbVal {
+ fn from(v: i32) -> Self {
+ Self(v.into())
}
+}
- pub fn take(mut self) -> Value {
- self.0.take()
+impl From for JsonbVal {
+ fn from(v: i64) -> Self {
+ Self(v.into())
}
+}
- pub fn as_serde_mut(&mut self) -> &mut Value {
- &mut self.0
+impl From<F32> for JsonbVal {
+ fn from(v: F32) -> Self {
+ if v.0 == f32::INFINITY {
+ Self("Infinity".into())
+ } else if v.0 == f32::NEG_INFINITY {
+ Self("-Infinity".into())
+ } else if v.0.is_nan() {
+ Self("NaN".into())
+ } else {
+ Self(v.0.into())
+ }
+ }
+}
+
+// NOTE: Infinite or NaN values are not JSON numbers. They are stored as strings in Postgres.
+impl From<F64> for JsonbVal {
+ fn from(v: F64) -> Self {
+ if v.0 == f64::INFINITY {
+ Self("Infinity".into())
+ } else if v.0 == f64::NEG_INFINITY {
+ Self("-Infinity".into())
+ } else if v.0.is_nan() {
+ Self("NaN".into())
+ } else {
+ Self(v.0.into())
+ }
+ }
+}
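+// For example, `f64::NAN` maps to the JSON string `"NaN"`, which is also how PostgreSQL's
+// `to_jsonb('NaN'::float8)` represents it.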
+
+impl From<&str> for JsonbVal {
+ fn from(v: &str) -> Self {
+ Self(v.into())
+ }
+}
+
+impl From<JsonbRef<'_>> for JsonbVal {
+ fn from(v: JsonbRef<'_>) -> Self {
+ Self(v.0.to_owned())
}
}
impl From<Value> for JsonbVal {
fn from(v: Value) -> Self {
- Self(v.into())
+ Self(v)
+ }
+}
+
+impl<'a> From<JsonbRef<'a>> for ValueRef<'a> {
+ fn from(v: JsonbRef<'a>) -> Self {
+ v.0
}
}
@@ -221,49 +288,52 @@ impl<'a> JsonbRef<'a> {
serde::Serialize::serialize(&s, serializer)
}
+ /// Serialize to a pgwire "BINARY" encoding.
pub fn value_serialize(&self) -> Vec<u8> {
+ use std::io::Write;
// Reuse the pgwire "BINARY" encoding for jsonb type.
// It is not truly binary, but one byte of version `1u8` followed by string form.
// This version number helps us maintain compatibility when we switch to more efficient
// encoding later.
- let mut output = bytes::BytesMut::new();
- self.0.to_sql(&Type::JSONB, &mut output).unwrap();
- output.freeze().into()
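+ // For example, the jsonb value `{"a": 1}` is serialized as the single version byte 0x01
+ // followed by the UTF-8 text `{"a":1}`.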
+ let mut buf = Vec::with_capacity(self.0.capacity());
+ buf.push(1);
+ write!(&mut buf, "{}", self.0).unwrap();
+ buf
}
+ /// Returns true if this is a jsonb `null`.
pub fn is_jsonb_null(&self) -> bool {
- matches!(self.0, Value::Null)
+ self.0.as_null().is_some()
}
+ /// Returns the type name of this jsonb.
+ ///
+ /// Possible values are: `null`, `boolean`, `number`, `string`, `array`, `object`.
pub fn type_name(&self) -> &'static str {
match self.0 {
- Value::Null => "null",
- Value::Bool(_) => "boolean",
- Value::Number(_) => "number",
- Value::String(_) => "string",
- Value::Array(_) => "array",
- Value::Object(_) => "object",
+ ValueRef::Null => "null",
+ ValueRef::Bool(_) => "boolean",
+ ValueRef::Number(_) => "number",
+ ValueRef::String(_) => "string",
+ ValueRef::Array(_) => "array",
+ ValueRef::Object(_) => "object",
}
}
+ /// Returns the length of this json array.
pub fn array_len(&self) -> Result<usize, String> {
- match self.0 {
- Value::Array(v) => Ok(v.len()),
- _ => Err(format!(
- "cannot get array length of a jsonb {}",
- self.type_name()
- )),
- }
+ let array = self
+ .0
+ .as_array()
+ .ok_or_else(|| format!("cannot get array length of a jsonb {}", self.type_name()))?;
+ Ok(array.len())
}
+ /// If the JSON is a boolean, returns the associated bool.
pub fn as_bool(&self) -> Result<bool, String> {
- match self.0 {
- Value::Bool(v) => Ok(*v),
- _ => Err(format!(
- "cannot cast jsonb {} to type boolean",
- self.type_name()
- )),
- }
+ self.0
+ .as_bool()
+ .ok_or_else(|| format!("cannot cast jsonb {} to type boolean", self.type_name()))
}
/// Attempt to read jsonb as a JSON number.
@@ -271,13 +341,11 @@ impl<'a> JsonbRef<'a> {
/// According to RFC 8259, only number within IEEE 754 binary64 (double precision) has good
/// interoperability. We do not support arbitrary precision like PostgreSQL `numeric` right now.
pub fn as_number(&self) -> Result<f64, String> {
- match self.0 {
- Value::Number(v) => v.as_f64().ok_or_else(|| "jsonb number out of range".into()),
- _ => Err(format!(
- "cannot cast jsonb {} to type number",
- self.type_name()
- )),
- }
+ self.0
+ .as_number()
+ .ok_or_else(|| format!("cannot cast jsonb {} to type number", self.type_name()))?
+ .as_f64()
+ .ok_or_else(|| "jsonb number out of range".into())
}
/// This is part of the `->>` or `#>>` syntax to access a child as string.
@@ -291,9 +359,9 @@ impl<'a> JsonbRef<'a> {
/// * Jsonb string is displayed with quotes but treated as its inner value here.
pub fn force_str<W: std::fmt::Write>(&self, writer: &mut W) -> std::fmt::Result {
match self.0 {
- Value::String(v) => writer.write_str(v),
- Value::Null => Ok(()),
- Value::Bool(_) | Value::Number(_) | Value::Array(_) | Value::Object(_) => {
+ ValueRef::String(v) => writer.write_str(v),
+ ValueRef::Null => Ok(()),
+ ValueRef::Bool(_) | ValueRef::Number(_) | ValueRef::Array(_) | ValueRef::Object(_) => {
use crate::types::to_text::ToText as _;
self.write_with_type(&crate::types::DataType::Jsonb, writer)
}
@@ -316,38 +384,33 @@ impl<'a> JsonbRef<'a> {
/// Returns an iterator over the elements if this is an array.
pub fn array_elements(self) -> Result<impl Iterator<Item = JsonbRef<'a>>, String> {
- match &self.0 {
- Value::Array(array) => Ok(array.iter().map(Self)),
- _ => Err(format!(
- "cannot extract elements from a jsonb {}",
- self.type_name()
- )),
- }
+ let array = self
+ .0
+ .as_array()
+ .ok_or_else(|| format!("cannot extract elements from a jsonb {}", self.type_name()))?;
+ Ok(array.iter().map(Self))
}
/// Returns an iterator over the keys if this is an object.
pub fn object_keys(self) -> Result<impl Iterator<Item = &'a str>, String> {
- match &self.0 {
- Value::Object(object) => Ok(object.keys().map(|s| s.as_str())),
- _ => Err(format!(
+ let object = self.0.as_object().ok_or_else(|| {
+ format!(
"cannot call jsonb_object_keys on a jsonb {}",
self.type_name()
- )),
- }
+ )
+ })?;
+ Ok(object.keys())
}
/// Returns an iterator over the key-value pairs if this is an object.
pub fn object_key_values(
self,
) -> Result<impl Iterator<Item = (&'a str, JsonbRef<'a>)>, String> {
- match &self.0 {
- Value::Object(object) => Ok(object.iter().map(|(k, v)| (k.as_str(), Self(v)))),
- _ => Err(format!("cannot deconstruct a jsonb {}", self.type_name())),
- }
- }
-
- pub fn value(&self) -> &'a Value {
- self.0
+ let object = self
+ .0
+ .as_object()
+ .ok_or_else(|| format!("cannot deconstruct a jsonb {}", self.type_name()))?;
+ Ok(object.iter().map(|(k, v)| (k, Self(v))))
}
}
diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs
index 83d281c5238e6..386f63280a557 100644
--- a/src/common/src/types/mod.rs
+++ b/src/common/src/types/mod.rs
@@ -416,7 +416,7 @@ impl DataType {
DataType::Timestamptz => ScalarImpl::Timestamptz(Timestamptz::MIN),
DataType::Decimal => ScalarImpl::Decimal(Decimal::NegativeInf),
DataType::Interval => ScalarImpl::Interval(Interval::MIN),
- DataType::Jsonb => ScalarImpl::Jsonb(JsonbVal::dummy()), // NOT `min` #7981
+ DataType::Jsonb => ScalarImpl::Jsonb(JsonbVal::null()), // NOT `min` #7981
DataType::Struct(data_types) => ScalarImpl::Struct(StructValue::new(
data_types
.types()
@@ -1303,7 +1303,7 @@ mod tests {
ScalarImpl::Interval(Interval::from_month_day_usec(2, 3, 3333)),
DataType::Interval,
),
- DataTypeName::Jsonb => (ScalarImpl::Jsonb(JsonbVal::dummy()), DataType::Jsonb),
+ DataTypeName::Jsonb => (ScalarImpl::Jsonb(JsonbVal::null()), DataType::Jsonb),
DataTypeName::Struct => (
ScalarImpl::Struct(StructValue::new(vec![
ScalarImpl::Int64(233).into(),
diff --git a/src/common/src/types/ordered.rs b/src/common/src/types/ordered.rs
index 75b07e529d7b9..68cd6329287e2 100644
--- a/src/common/src/types/ordered.rs
+++ b/src/common/src/types/ordered.rs
@@ -138,7 +138,7 @@ impl From for DefaultOrdered {
}
}
-#[allow(clippy::incorrect_partial_ord_impl_on_ord_type)]
+#[allow(clippy::non_canonical_partial_ord_impl)]
impl PartialOrd for DefaultOrdered {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
self.0.default_partial_cmp(other.as_inner())
diff --git a/src/common/src/util/chunk_coalesce.rs b/src/common/src/util/chunk_coalesce.rs
index 9a41fc83e8f0e..3bd56b19e434d 100644
--- a/src/common/src/util/chunk_coalesce.rs
+++ b/src/common/src/util/chunk_coalesce.rs
@@ -285,7 +285,12 @@ impl SlicedDataChunk {
}
pub fn with_offset_checked(data_chunk: DataChunk, offset: usize) -> Self {
- assert!(offset < data_chunk.capacity());
+ assert!(
+ offset < data_chunk.capacity(),
+ "offset {}, data_chunk capacity {}",
+ offset,
+ data_chunk.capacity()
+ );
Self { data_chunk, offset }
}
diff --git a/src/common/src/util/epoch.rs b/src/common/src/util/epoch.rs
index 86ed158c2e206..4d57c97b054b3 100644
--- a/src/common/src/util/epoch.rs
+++ b/src/common/src/util/epoch.rs
@@ -73,6 +73,10 @@ impl Epoch {
Epoch(time << EPOCH_PHYSICAL_SHIFT_BITS)
}
+ pub fn from_unix_millis(mi: u64) -> Self {
+ Epoch((mi - UNIX_RISINGWAVE_DATE_SEC * 1000) << EPOCH_PHYSICAL_SHIFT_BITS)
+ }
+
pub fn physical_now() -> u64 {
UNIX_RISINGWAVE_DATE_EPOCH
.elapsed()
diff --git a/src/common/src/util/future_utils.rs b/src/common/src/util/future_utils.rs
index 75c38488457ac..20844d8cd15d4 100644
--- a/src/common/src/util/future_utils.rs
+++ b/src/common/src/util/future_utils.rs
@@ -13,9 +13,11 @@
// limitations under the License.
use std::future::pending;
+use std::pin::{pin, Pin};
-use futures::future::Either;
-use futures::{Future, FutureExt, Stream};
+use futures::future::{select, Either};
+use futures::stream::Peekable;
+use futures::{Future, FutureExt, Stream, StreamExt};
/// Convert a list of streams into a [`Stream`] of results from the streams.
pub fn select_all(
@@ -43,3 +45,34 @@ pub fn drop_either_future(
Either::Right((right, _)) => Either::Right(right),
}
}
+
+/// Await on a future while monitoring a peekable stream that may return an error.
+/// The peekable stream is polled at a higher priority than the future.
+///
+/// When the peekable stream yields an error or reaches the end of stream, this function
+/// returns immediately. Otherwise, it keeps polling the given future.
+///
+/// Return:
+/// - `Ok(output)`: the output of the given future.
+/// - `Err(None)`: the stream has reached the end.
+/// - `Err(Some(e))`: the stream returned an error.
+pub async fn await_future_with_monitor_error_stream<T, E, F: Future>(
+ peek_stream: &mut Peekable<impl Stream<Item = Result<T, E>> + Unpin>,
+ future: F,
+) -> Result<F::Output, Option<E>> {
+ // Poll the response stream to early see the error
+ match select(pin!(Pin::new(&mut *peek_stream).peek()), pin!(future)).await {
+ Either::Left((response_result, send_future)) => match response_result {
+ None => Err(None),
+ Some(Err(_)) => {
+ let err = match peek_stream.next().now_or_never() {
+ Some(Some(Err(err))) => err,
+ _ => unreachable!("peek has output, peek output not None, have check err"),
+ };
+ Err(Some(err))
+ }
+ Some(Ok(_)) => Ok(send_future.await),
+ },
+ Either::Right((output, _)) => Ok(output),
+ }
+}
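+
+// Illustrative usage sketch (not part of this change; `resp_stream` and `send_fut` are
+// hypothetical names for a peekable response stream and a request-sending future):
+//
+// match await_future_with_monitor_error_stream(&mut resp_stream, send_fut).await {
+//     Ok(output) => { /* the future completed normally */ }
+//     Err(None) => { /* the stream ended before the future completed */ }
+//     Err(Some(err)) => { /* the stream yielded an error; propagate it to the caller */ }
+// }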
diff --git a/src/common/src/util/mod.rs b/src/common/src/util/mod.rs
index f4140b558faa7..e1f85263e1415 100644
--- a/src/common/src/util/mod.rs
+++ b/src/common/src/util/mod.rs
@@ -45,7 +45,9 @@ pub mod tracing;
pub mod value_encoding;
pub mod worker_util;
-pub use future_utils::{drop_either_future, pending_on_none, select_all};
+pub use future_utils::{
+ await_future_with_monitor_error_stream, drop_either_future, pending_on_none, select_all,
+};
#[macro_use]
pub mod match_util;
diff --git a/src/compute/src/lib.rs b/src/compute/src/lib.rs
index 65bf59eedf19e..fc5ae9ff19854 100644
--- a/src/compute/src/lib.rs
+++ b/src/compute/src/lib.rs
@@ -13,14 +13,14 @@
// limitations under the License.
#![feature(trait_alias)]
-#![feature(generators)]
+#![feature(coroutines)]
#![feature(type_alias_impl_trait)]
#![feature(let_chains)]
#![feature(result_option_inspect)]
#![feature(lint_reasons)]
#![feature(impl_trait_in_assoc_type)]
#![feature(lazy_cell)]
-#![cfg_attr(coverage, feature(no_coverage))]
+#![cfg_attr(coverage, feature(coverage_attribute))]
#[macro_use]
extern crate tracing;
diff --git a/src/compute/src/rpc/service/exchange_service.rs b/src/compute/src/rpc/service/exchange_service.rs
index b59cc39587c2f..6225cef2a7e30 100644
--- a/src/compute/src/rpc/service/exchange_service.rs
+++ b/src/compute/src/rpc/service/exchange_service.rs
@@ -49,7 +49,7 @@ impl ExchangeService for ExchangeServiceImpl {
type GetDataStream = BatchDataStream;
type GetStreamStream = StreamDataStream;
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn get_data(
&self,
request: Request<GetDataRequest>,
diff --git a/src/compute/src/rpc/service/monitor_service.rs b/src/compute/src/rpc/service/monitor_service.rs
index 97a0b80773791..8fc24664ec016 100644
--- a/src/compute/src/rpc/service/monitor_service.rs
+++ b/src/compute/src/rpc/service/monitor_service.rs
@@ -53,7 +53,7 @@ impl MonitorServiceImpl {
#[async_trait::async_trait]
impl MonitorService for MonitorServiceImpl {
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn stack_trace(
&self,
request: Request<StackTraceRequest>,
@@ -85,7 +85,7 @@ impl MonitorService for MonitorServiceImpl {
}))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn profiling(
&self,
request: Request,
@@ -115,7 +115,7 @@ impl MonitorService for MonitorServiceImpl {
}
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn heap_profiling(
&self,
request: Request,
@@ -166,7 +166,7 @@ impl MonitorService for MonitorServiceImpl {
}
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn list_heap_profiling(
&self,
_request: Request,
@@ -206,7 +206,7 @@ impl MonitorService for MonitorServiceImpl {
}))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn analyze_heap(
&self,
request: Request,
diff --git a/src/compute/src/rpc/service/stream_service.rs b/src/compute/src/rpc/service/stream_service.rs
index 525364b60dc1c..1c1448b3d1e45 100644
--- a/src/compute/src/rpc/service/stream_service.rs
+++ b/src/compute/src/rpc/service/stream_service.rs
@@ -45,7 +45,7 @@ impl StreamServiceImpl {
#[async_trait::async_trait]
impl StreamService for StreamServiceImpl {
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn update_actors(
&self,
request: Request,
@@ -61,7 +61,7 @@ impl StreamService for StreamServiceImpl {
}
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn build_actors(
&self,
request: Request,
@@ -85,7 +85,7 @@ impl StreamService for StreamServiceImpl {
}
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn broadcast_actor_info_table(
&self,
request: Request,
@@ -104,7 +104,7 @@ impl StreamService for StreamServiceImpl {
}
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn drop_actors(
&self,
request: Request,
@@ -118,7 +118,7 @@ impl StreamService for StreamServiceImpl {
}))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn force_stop_actors(
&self,
request: Request,
@@ -132,7 +132,7 @@ impl StreamService for StreamServiceImpl {
}))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn inject_barrier(
&self,
request: Request,
@@ -173,7 +173,7 @@ impl StreamService for StreamServiceImpl {
}))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn barrier_complete(
&self,
request: Request,
@@ -243,7 +243,7 @@ impl StreamService for StreamServiceImpl {
}))
}
- #[cfg_attr(coverage, no_coverage)]
+ #[cfg_attr(coverage, coverage(off))]
async fn wait_epoch_commit(
&self,
request: Request,
diff --git a/src/compute/tests/cdc_tests.rs b/src/compute/tests/cdc_tests.rs
index b3e39ece95002..6a50b8410bbd4 100644
--- a/src/compute/tests/cdc_tests.rs
+++ b/src/compute/tests/cdc_tests.rs
@@ -13,7 +13,7 @@
// limitations under the License.
#![feature(let_chains)]
-#![feature(generators)]
+#![feature(coroutines)]
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
diff --git a/src/compute/tests/integration_tests.rs b/src/compute/tests/integration_tests.rs
index a43ae2e5762da..6d7e93365c275 100644
--- a/src/compute/tests/integration_tests.rs
+++ b/src/compute/tests/integration_tests.rs
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#![feature(generators)]
+#![feature(coroutines)]
#![feature(proc_macro_hygiene, stmt_expr_attributes)]
use std::sync::atomic::AtomicU64;
diff --git a/src/connector/Cargo.toml b/src/connector/Cargo.toml
index 45ccba627af85..d8ba8f7c6d4a7 100644
--- a/src/connector/Cargo.toml
+++ b/src/connector/Cargo.toml
@@ -114,7 +114,7 @@ strum = "0.25"
strum_macros = "0.25"
tempfile = "3"
thiserror = "1"
-time = "0.3.28"
+time = "0.3.30"
tokio = { version = "0.2", package = "madsim-tokio", features = [
"rt",
"rt-multi-thread",
diff --git a/src/connector/src/lib.rs b/src/connector/src/lib.rs
index 8ccf62486ce65..aa613b4043c23 100644
--- a/src/connector/src/lib.rs
+++ b/src/connector/src/lib.rs
@@ -14,7 +14,7 @@
#![expect(dead_code)]
#![allow(clippy::derive_partial_eq_without_eq)]
-#![feature(generators)]
+#![feature(coroutines)]
#![feature(proc_macro_hygiene)]
#![feature(stmt_expr_attributes)]
#![feature(box_patterns)]
@@ -25,11 +25,9 @@
#![feature(let_chains)]
#![feature(box_into_inner)]
#![feature(type_alias_impl_trait)]
-#![feature(return_position_impl_trait_in_trait)]
-#![feature(async_fn_in_trait)]
#![feature(associated_type_defaults)]
#![feature(impl_trait_in_assoc_type)]
-#![feature(iter_from_generator)]
+#![feature(iter_from_coroutine)]
#![feature(if_let_guard)]
#![feature(iterator_try_collect)]
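Dropping `return_position_impl_trait_in_trait` and `async_fn_in_trait` reflects that both are stable on the toolchain this change targets. A hedged standalone sketch of what no longer needs a gate; the `Fetch` trait and `Static` type are invented, and the `futures` crate is assumed only for a blocking executor:

trait Fetch {
    // Formerly gated behind #![feature(async_fn_in_trait)].
    async fn fetch(&self) -> String;
    // Formerly gated behind #![feature(return_position_impl_trait_in_trait)].
    fn ids(&self) -> impl Iterator<Item = u32>;
}

struct Static;

impl Fetch for Static {
    async fn fetch(&self) -> String {
        "ok".to_string()
    }

    fn ids(&self) -> impl Iterator<Item = u32> {
        0..3
    }
}

fn main() {
    let s = Static;
    assert_eq!(s.ids().count(), 3);
    assert_eq!(futures::executor::block_on(s.fetch()), "ok");
}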
diff --git a/src/connector/src/sink/blackhole.rs b/src/connector/src/sink/blackhole.rs
index 1f1ace3b0d104..60b0506604c97 100644
--- a/src/connector/src/sink/blackhole.rs
+++ b/src/connector/src/sink/blackhole.rs
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use async_trait::async_trait;
+
use crate::sink::log_store::{LogReader, LogStoreReadItem, TruncateOffset};
use crate::sink::{
DummySinkCommitCoordinator, LogSinker, Result, Sink, SinkError, SinkParam, SinkWriterParam,
@@ -45,6 +47,7 @@ impl Sink for BlackHoleSink {
}
}
+#[async_trait]
impl LogSinker for BlackHoleSink {
async fn consume_log_and_sink(self, mut log_reader: impl LogReader) -> Result<()> {
log_reader.init().await?;
diff --git a/src/connector/src/sink/catalog/mod.rs b/src/connector/src/sink/catalog/mod.rs
index c18dd7d10a92c..ca3a09e7f2eda 100644
--- a/src/connector/src/sink/catalog/mod.rs
+++ b/src/connector/src/sink/catalog/mod.rs
@@ -132,6 +132,7 @@ pub enum SinkEncode {
Json,
Protobuf,
Avro,
+ Template,
}
impl SinkFormatDesc {
@@ -177,6 +178,7 @@ impl SinkFormatDesc {
SinkEncode::Json => E::Json,
SinkEncode::Protobuf => E::Protobuf,
SinkEncode::Avro => E::Avro,
+ SinkEncode::Template => E::Template,
};
let options = self
.options
@@ -212,6 +214,7 @@ impl TryFrom for SinkFormatDesc {
let encode = match value.encode() {
E::Json => SinkEncode::Json,
E::Protobuf => SinkEncode::Protobuf,
+ E::Template => SinkEncode::Template,
E::Avro => SinkEncode::Avro,
e @ (E::Unspecified | E::Native | E::Csv | E::Bytes) => {
return Err(SinkError::Config(anyhow!(
diff --git a/src/connector/src/sink/clickhouse.rs b/src/connector/src/sink/clickhouse.rs
index 2bddf8026216f..fb06baf42920c 100644
--- a/src/connector/src/sink/clickhouse.rs
+++ b/src/connector/src/sink/clickhouse.rs
@@ -11,29 +11,36 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
use core::fmt::Debug;
use std::collections::{HashMap, HashSet};
use std::time::Duration;
use anyhow::anyhow;
-use clickhouse::{Client, Client as ClickHouseClient, Row as ClickHouseRow};
+use clickhouse::{Client as ClickHouseClient, Row as ClickHouseRow};
use itertools::Itertools;
-use risingwave_common::array::{Op, RowRef, StreamChunk};
+use risingwave_common::array::{Op, StreamChunk};
use risingwave_common::catalog::Schema;
use risingwave_common::row::Row;
-use risingwave_common::types::{DataType, ScalarRefImpl, Serial};
+use risingwave_common::types::{DataType, Decimal, ScalarRefImpl, Serial};
use serde::ser::{SerializeSeq, SerializeStruct};
use serde::Serialize;
use serde_derive::Deserialize;
use serde_with::serde_as;
use super::{DummySinkCommitCoordinator, SinkWriterParam};
-use crate::sink::writer::{LogSinkerOf, SinkWriter, SinkWriterExt};
+use crate::sink::catalog::desc::SinkDesc;
+use crate::sink::log_store::DeliveryFutureManagerAddFuture;
+use crate::sink::writer::{
+ AsyncTruncateLogSinkerOf, AsyncTruncateSinkWriter, AsyncTruncateSinkWriterExt,
+};
use crate::sink::{
Result, Sink, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT,
};
+const QUERY_ENGINE: &str =
+ "select distinct ?fields from system.tables where database = ? and table = ?";
+const QUERY_COLUMN: &str =
+ "select distinct ?fields from system.columns where database = ? and table = ? order by ?";
pub const CLICKHOUSE_SINK: &str = "clickhouse";
const BUFFER_SIZE: usize = 1024;
@@ -51,6 +58,75 @@ pub struct ClickHouseCommon {
pub table: String,
}
+#[allow(clippy::enum_variant_names)]
+#[derive(Debug)]
+enum ClickHouseEngine {
+ MergeTree,
+ ReplacingMergeTree,
+ SummingMergeTree,
+ AggregatingMergeTree,
+ CollapsingMergeTree(String),
+ VersionedCollapsingMergeTree(String),
+ GraphiteMergeTree,
+}
+impl ClickHouseEngine {
+ pub fn is_collapsing_engine(&self) -> bool {
+ matches!(
+ self,
+ ClickHouseEngine::CollapsingMergeTree(_)
+ | ClickHouseEngine::VersionedCollapsingMergeTree(_)
+ )
+ }
+
+    pub fn get_sign_name(&self) -> Option<String> {
+ match self {
+ ClickHouseEngine::CollapsingMergeTree(sign_name) => Some(sign_name.to_string()),
+ ClickHouseEngine::VersionedCollapsingMergeTree(sign_name) => {
+ Some(sign_name.to_string())
+ }
+ _ => None,
+ }
+ }
+
+    pub fn from_query_engine(engine_name: &ClickhouseQueryEngine) -> Result<Self> {
+ match engine_name.engine.as_str() {
+ "MergeTree" => Ok(ClickHouseEngine::MergeTree),
+ "ReplacingMergeTree" => Ok(ClickHouseEngine::ReplacingMergeTree),
+ "SummingMergeTree" => Ok(ClickHouseEngine::SummingMergeTree),
+ "AggregatingMergeTree" => Ok(ClickHouseEngine::AggregatingMergeTree),
+ "VersionedCollapsingMergeTree" => {
+ let sign_name = engine_name
+ .create_table_query
+ .split("VersionedCollapsingMergeTree(")
+ .last()
+ .ok_or_else(|| SinkError::ClickHouse("must have last".to_string()))?
+ .split(',')
+ .next()
+ .ok_or_else(|| SinkError::ClickHouse("must have next".to_string()))?
+ .to_string();
+ Ok(ClickHouseEngine::VersionedCollapsingMergeTree(sign_name))
+ }
+ "CollapsingMergeTree" => {
+ let sign_name = engine_name
+ .create_table_query
+ .split("CollapsingMergeTree(")
+ .last()
+ .ok_or_else(|| SinkError::ClickHouse("must have last".to_string()))?
+ .split(')')
+ .next()
+ .ok_or_else(|| SinkError::ClickHouse("must have next".to_string()))?
+ .to_string();
+ Ok(ClickHouseEngine::CollapsingMergeTree(sign_name))
+ }
+ "GraphiteMergeTree" => Ok(ClickHouseEngine::GraphiteMergeTree),
+ _ => Err(SinkError::ClickHouse(format!(
+ "Cannot find clickhouse engine {:?}",
+ engine_name.engine
+ ))),
+ }
+ }
+}
+
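`from_query_engine` above recovers the sign column of a (Versioned)CollapsingMergeTree by splitting the `create_table_query` text around the engine clause. A hedged sketch of that string-splitting idea on sample DDL; `sign_column` and the DDL strings are invented for illustration, and the real code branches per engine name:

fn sign_column(create_table_query: &str) -> Option<String> {
    Some(
        create_table_query
            .split("CollapsingMergeTree(") // also matches VersionedCollapsingMergeTree(
            .last()?                       // text after the engine name
            .split(')')
            .next()?                       // up to the closing parenthesis
            .split(',')
            .next()?                       // VersionedCollapsingMergeTree(sign, ver): keep the first argument
            .trim()
            .to_string(),
    )
}

fn main() {
    let ddl = "CREATE TABLE t (k Int64, sign Int8) ENGINE = CollapsingMergeTree(sign) ORDER BY k";
    let ddl_versioned = "CREATE TABLE t2 (k Int64, sign Int8, ver UInt8) \
                         ENGINE = VersionedCollapsingMergeTree(sign, ver) ORDER BY k";
    assert_eq!(sign_column(ddl).as_deref(), Some("sign"));
    assert_eq!(sign_column(ddl_versioned).as_deref(), Some("sign"));
}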
const POOL_IDLE_TIMEOUT: Duration = Duration::from_secs(5);
impl ClickHouseCommon {
@@ -194,9 +270,7 @@ impl ClickHouseSink {
}
risingwave_common::types::DataType::Float32 => Ok(ck_column.r#type.contains("Float32")),
risingwave_common::types::DataType::Float64 => Ok(ck_column.r#type.contains("Float64")),
- risingwave_common::types::DataType::Decimal => {
- Err(SinkError::ClickHouse("can not support Decimal".to_string()))
- }
+ risingwave_common::types::DataType::Decimal => Ok(ck_column.r#type.contains("Decimal")),
risingwave_common::types::DataType::Date => Ok(ck_column.r#type.contains("Date32")),
risingwave_common::types::DataType::Varchar => Ok(ck_column.r#type.contains("String")),
risingwave_common::types::DataType::Time => Err(SinkError::ClickHouse(
@@ -228,7 +302,7 @@ impl ClickHouseSink {
Ok(ck_column.r#type.contains("UInt64") | ck_column.r#type.contains("Int64"))
}
risingwave_common::types::DataType::Int256 => Err(SinkError::ClickHouse(
- "clickhouse can not support Interval".to_string(),
+ "clickhouse can not support Int256".to_string(),
)),
};
if !is_match? {
@@ -243,10 +317,14 @@ impl ClickHouseSink {
}
impl Sink for ClickHouseSink {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = LogSinkerOf;
+ type LogSinker = AsyncTruncateLogSinkerOf;
const SINK_NAME: &'static str = CLICKHOUSE_SINK;
+ fn default_sink_decouple(desc: &SinkDesc) -> bool {
+ desc.sink_type.is_append_only()
+ }
+
async fn validate(&self) -> Result<()> {
// For upsert clickhouse sink, the primary key must be defined.
if !self.is_append_only && self.pk_indices.is_empty() {
@@ -256,20 +334,15 @@ impl Sink for ClickHouseSink {
// check reachability
let client = self.config.common.build_client()?;
- let query_column = "select distinct ?fields from system.columns where database = ? and table = ? order by ?".to_string();
- let clickhouse_column = client
- .query(&query_column)
- .bind(self.config.common.database.clone())
- .bind(self.config.common.table.clone())
- .bind("position")
- .fetch_all::()
- .await?;
- if clickhouse_column.is_empty() {
- return Err(SinkError::ClickHouse(format!(
- "table {:?}.{:?} is not find in clickhouse",
- self.config.common.database, self.config.common.table
- )));
+
+ let (clickhouse_column, clickhouse_engine) =
+ query_column_engine_from_ck(client, &self.config).await?;
+
+ if !self.is_append_only && !clickhouse_engine.is_collapsing_engine() {
+ return Err(SinkError::ClickHouse(
+            "If you want to use upsert, please change your ClickHouse table engine to `VersionedCollapsingMergeTree` or `CollapsingMergeTree`".to_owned()));
}
+
self.check_column_name_and_type(&clickhouse_column)?;
if !self.is_append_only {
self.check_pk_match(&clickhouse_column)?;
@@ -277,7 +350,7 @@ impl Sink for ClickHouseSink {
Ok(())
}
- async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result {
+ async fn new_log_sinker(&self, _writer_param: SinkWriterParam) -> Result {
Ok(ClickHouseSinkWriter::new(
self.config.clone(),
self.schema.clone(),
@@ -285,24 +358,27 @@ impl Sink for ClickHouseSink {
self.is_append_only,
)
.await?
- .into_log_sinker(writer_param.sink_metrics))
+ .into_log_sinker(usize::MAX))
}
}
pub struct ClickHouseSinkWriter {
pub config: ClickHouseConfig,
schema: Schema,
pk_indices: Vec,
- client: Client,
+ client: ClickHouseClient,
is_append_only: bool,
// Save some features of the clickhouse column type
column_correct_vec: Vec,
- clickhouse_fields_name: Vec,
+    rw_fields_name_after_calibration: Vec<String>,
+ clickhouse_engine: ClickHouseEngine,
}
#[derive(Debug)]
struct ClickHouseSchemaFeature {
can_null: bool,
// Time accuracy in clickhouse for rw and ck conversions
accuracy_time: u8,
+
+ accuracy_decimal: (u8, u8),
}
impl ClickHouseSinkWriter {
@@ -312,25 +388,23 @@ impl ClickHouseSinkWriter {
pk_indices: Vec,
is_append_only: bool,
) -> Result {
- if !is_append_only {
- tracing::warn!("Update and delete are not recommended because of their impact on clickhouse performance.");
- }
let client = config.common.build_client()?;
- let query_column = "select distinct ?fields from system.columns where database = ? and table = ? order by position".to_string();
- let clickhouse_column = client
- .query(&query_column)
- .bind(config.common.database.clone())
- .bind(config.common.table.clone())
- .fetch_all::()
- .await?;
+
+ let (clickhouse_column, clickhouse_engine) =
+ query_column_engine_from_ck(client.clone(), &config).await?;
+
let column_correct_vec: Result> = clickhouse_column
.iter()
.map(Self::build_column_correct_vec)
.collect();
- let clickhouse_fields_name = build_fields_name_type_from_schema(&schema)?
+ let mut rw_fields_name_after_calibration = build_fields_name_type_from_schema(&schema)?
.iter()
.map(|(a, _)| a.clone())
.collect_vec();
+
+ if let Some(sign) = clickhouse_engine.get_sign_name() {
+ rw_fields_name_after_calibration.push(sign);
+ }
Ok(Self {
config,
schema,
@@ -338,7 +412,8 @@ impl ClickHouseSinkWriter {
client,
is_append_only,
column_correct_vec: column_correct_vec?,
- clickhouse_fields_name,
+ rw_fields_name_after_calibration,
+ clickhouse_engine,
})
}
@@ -360,159 +435,95 @@ impl ClickHouseSinkWriter {
} else {
0_u8
};
+ let accuracy_decimal = if ck_column.r#type.contains("Decimal(") {
+ let decimal_all = ck_column
+ .r#type
+ .split("Decimal(")
+ .last()
+ .ok_or_else(|| SinkError::ClickHouse("must have last".to_string()))?
+ .split(')')
+ .next()
+ .ok_or_else(|| SinkError::ClickHouse("must have next".to_string()))?
+ .split(", ")
+ .collect_vec();
+ let length = decimal_all
+ .first()
+ .ok_or_else(|| SinkError::ClickHouse("must have next".to_string()))?
+                .parse::<u8>()
+ .map_err(|e| SinkError::ClickHouse(format!("clickhouse sink error {}", e)))?;
+
+ if length > 38 {
+ return Err(SinkError::ClickHouse(
+                    "RW doesn't support Decimal256".to_string(),
+ ));
+ }
+
+ let scale = decimal_all
+ .last()
+ .ok_or_else(|| SinkError::ClickHouse("must have next".to_string()))?
+                .parse::<u8>()
+ .map_err(|e| SinkError::ClickHouse(format!("clickhouse sink error {}", e)))?;
+ (length, scale)
+ } else {
+ (0_u8, 0_u8)
+ };
Ok(ClickHouseSchemaFeature {
can_null,
accuracy_time,
+ accuracy_decimal,
})
}
- async fn append_only(&mut self, chunk: StreamChunk) -> Result<()> {
+ async fn write(&mut self, chunk: StreamChunk) -> Result<()> {
let mut insert = self.client.insert_with_fields_name(
&self.config.common.table,
- self.clickhouse_fields_name.clone(),
+ self.rw_fields_name_after_calibration.clone(),
)?;
for (op, row) in chunk.rows() {
- if op != Op::Insert {
- tracing::warn!(
- "append only click house sink receive an {:?} which will be ignored.",
- op
- );
- continue;
- }
let mut clickhouse_filed_vec = vec![];
for (index, data) in row.iter().enumerate() {
clickhouse_filed_vec.extend(ClickHouseFieldWithNull::from_scalar_ref(
data,
&self.column_correct_vec,
index,
- true,
)?);
}
- let clickhouse_column = ClickHouseColumn {
- row: clickhouse_filed_vec,
- };
- insert.write(&clickhouse_column).await?;
- }
- insert.end().await?;
- Ok(())
- }
-
- async fn upsert(&mut self, chunk: StreamChunk) -> Result<()> {
- let get_pk_names_and_data = |row: RowRef<'_>, index: usize| {
- let pk_names = self
- .schema
- .names()
- .iter()
- .cloned()
- .enumerate()
- .filter(|(index, _)| self.pk_indices.contains(index))
- .map(|(_, b)| b)
- .collect_vec();
- let mut pk_data = vec![];
- for pk_index in &self.pk_indices {
- if let ClickHouseFieldWithNull::WithoutSome(v) =
- ClickHouseFieldWithNull::from_scalar_ref(
- row.datum_at(*pk_index),
- &self.column_correct_vec,
- index,
- false,
- )?
- .pop()
- .unwrap()
- {
- pk_data.push(v)
- } else {
- return Err(SinkError::ClickHouse("pk can not be null".to_string()));
- }
- }
- Ok((pk_names, pk_data))
- };
-
- for (index, (op, row)) in chunk.rows().enumerate() {
match op {
- Op::Insert => {
- let mut insert = self.client.insert_with_fields_name(
- &self.config.common.table,
- self.clickhouse_fields_name.clone(),
- )?;
- let mut clickhouse_filed_vec = vec![];
- for (index, data) in row.iter().enumerate() {
- clickhouse_filed_vec.extend(ClickHouseFieldWithNull::from_scalar_ref(
- data,
- &self.column_correct_vec,
- index,
- true,
- )?);
+ Op::Insert | Op::UpdateInsert => {
+ if self.clickhouse_engine.get_sign_name().is_some() {
+ clickhouse_filed_vec.push(ClickHouseFieldWithNull::WithoutSome(
+ ClickHouseField::Int8(1),
+ ));
}
- let clickhouse_column = ClickHouseColumn {
- row: clickhouse_filed_vec,
- };
- insert.write(&clickhouse_column).await?;
- insert.end().await?;
- }
- Op::Delete => {
- let (delete_pk_names, delete_pk_data) = get_pk_names_and_data(row, index)?;
- self.client
- .delete(&self.config.common.table, delete_pk_names)
- .delete(delete_pk_data)
- .await?;
}
- Op::UpdateDelete => continue,
- Op::UpdateInsert => {
- let (update_pk_names, update_pk_data) = get_pk_names_and_data(row, index)?;
- let mut clickhouse_update_filed_vec = vec![];
- for (index, data) in row.iter().enumerate() {
- if !self.pk_indices.contains(&index) {
- clickhouse_update_filed_vec.extend(
- ClickHouseFieldWithNull::from_scalar_ref(
- data,
- &self.column_correct_vec,
- index,
- false,
- )?,
- );
- }
+ Op::Delete | Op::UpdateDelete => {
+ if !self.clickhouse_engine.is_collapsing_engine() {
+ return Err(SinkError::ClickHouse(
+                            "ClickHouse engine doesn't support upsert".to_string(),
+ ));
}
- // Get the names of the columns excluding pk, and use them to update.
- let fields_name_update = self
- .clickhouse_fields_name
- .iter()
- .filter(|n| !update_pk_names.contains(n))
- .map(|s| s.to_string())
- .collect_vec();
-
- let update = self.client.update(
- &self.config.common.table,
- update_pk_names,
- fields_name_update.clone(),
- );
- update
- .update_fields(clickhouse_update_filed_vec, update_pk_data)
- .await?;
+ clickhouse_filed_vec.push(ClickHouseFieldWithNull::WithoutSome(
+ ClickHouseField::Int8(-1),
+ ))
}
}
+ let clickhouse_column = ClickHouseColumn {
+ row: clickhouse_filed_vec,
+ };
+ insert.write(&clickhouse_column).await?;
}
+ insert.end().await?;
Ok(())
}
}
-#[async_trait::async_trait]
-impl SinkWriter for ClickHouseSinkWriter {
- async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> {
- if self.is_append_only {
- self.append_only(chunk).await
- } else {
- self.upsert(chunk).await
- }
- }
-
- async fn begin_epoch(&mut self, _epoch: u64) -> Result<()> {
- // clickhouse no transactional guarantees, so we do nothing here.
- Ok(())
- }
-
- async fn barrier(&mut self, _is_checkpoint: bool) -> Result<()> {
- Ok(())
+impl AsyncTruncateSinkWriter for ClickHouseSinkWriter {
+ async fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ _add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> Result<()> {
+ self.write(chunk).await
}
}
@@ -523,6 +534,48 @@ struct SystemColumn {
is_in_primary_key: u8,
}
+#[derive(ClickHouseRow, Deserialize)]
+struct ClickhouseQueryEngine {
+ name: String,
+ engine: String,
+ create_table_query: String,
+}
+
+async fn query_column_engine_from_ck(
+ client: ClickHouseClient,
+ config: &ClickHouseConfig,
+) -> Result<(Vec<SystemColumn>, ClickHouseEngine)> {
+ let query_engine = QUERY_ENGINE;
+ let query_column = QUERY_COLUMN;
+
+ let clickhouse_engine = client
+ .query(query_engine)
+ .bind(config.common.database.clone())
+ .bind(config.common.table.clone())
+        .fetch_all::<ClickhouseQueryEngine>()
+ .await?;
+ let mut clickhouse_column = client
+ .query(query_column)
+ .bind(config.common.database.clone())
+ .bind(config.common.table.clone())
+ .bind("position")
+        .fetch_all::<SystemColumn>()
+ .await?;
+ if clickhouse_engine.is_empty() || clickhouse_column.is_empty() {
+ return Err(SinkError::ClickHouse(format!(
+            "table {:?}.{:?} is not found in clickhouse",
+ config.common.database, config.common.table
+ )));
+ }
+
+ let clickhouse_engine = ClickHouseEngine::from_query_engine(clickhouse_engine.get(0).unwrap())?;
+
+ if let Some(sign) = &clickhouse_engine.get_sign_name() {
+ clickhouse_column.retain(|a| sign.ne(&a.name))
+ }
+ Ok((clickhouse_column, clickhouse_engine))
+}
+
/// Serialize this structure to simulate the `struct` call clickhouse interface
#[derive(ClickHouseRow, Debug)]
struct ClickHouseColumn {
@@ -541,6 +594,26 @@ enum ClickHouseField {
String(String),
Bool(bool),
List(Vec),
+ Int8(i8),
+ Decimal(ClickHouseDecimal),
+}
+#[derive(Debug)]
+enum ClickHouseDecimal {
+ Decimal32(i32),
+ Decimal64(i64),
+ Decimal128(i128),
+}
+impl Serialize for ClickHouseDecimal {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+ where
+ S: serde::Serializer,
+ {
+ match self {
+ ClickHouseDecimal::Decimal32(v) => serializer.serialize_i32(*v),
+ ClickHouseDecimal::Decimal64(v) => serializer.serialize_i64(*v),
+ ClickHouseDecimal::Decimal128(v) => serializer.serialize_i128(*v),
+ }
+ }
}
/// Enum that support clickhouse nullable
@@ -556,7 +629,6 @@ impl ClickHouseFieldWithNull {
data: Option>,
clickhouse_schema_feature_vec: &Vec,
clickhouse_schema_feature_index: usize,
- is_insert: bool,
) -> Result> {
let clickhouse_schema_feature = clickhouse_schema_feature_vec
.get(clickhouse_schema_feature_index)
@@ -584,8 +656,29 @@ impl ClickHouseFieldWithNull {
ScalarRefImpl::Float64(v) => ClickHouseField::Float64(v.into_inner()),
ScalarRefImpl::Utf8(v) => ClickHouseField::String(v.to_string()),
ScalarRefImpl::Bool(v) => ClickHouseField::Bool(v),
- ScalarRefImpl::Decimal(_) => {
- return Err(SinkError::ClickHouse("can not support Decimal".to_string()))
+ ScalarRefImpl::Decimal(d) => {
+ if let Decimal::Normalized(d) = d {
+ let scale =
+ clickhouse_schema_feature.accuracy_decimal.1 as i32 - d.scale() as i32;
+
+ let scale = if scale < 0 {
+ d.mantissa() / 10_i128.pow(scale.unsigned_abs())
+ } else {
+ d.mantissa() * 10_i128.pow(scale as u32)
+ };
+
+ if clickhouse_schema_feature.accuracy_decimal.0 <= 9 {
+ ClickHouseField::Decimal(ClickHouseDecimal::Decimal32(scale as i32))
+ } else if clickhouse_schema_feature.accuracy_decimal.0 <= 18 {
+ ClickHouseField::Decimal(ClickHouseDecimal::Decimal64(scale as i64))
+ } else {
+ ClickHouseField::Decimal(ClickHouseDecimal::Decimal128(scale))
+ }
+ } else {
+ return Err(SinkError::ClickHouse(
+                        "clickhouse can not support Decimal NaN, -Inf and Inf".to_string(),
+ ));
+ }
}
ScalarRefImpl::Interval(_) => {
return Err(SinkError::ClickHouse(
@@ -602,14 +695,9 @@ impl ClickHouseFieldWithNull {
))
}
ScalarRefImpl::Timestamp(v) => {
- if is_insert {
- let time = v.get_timestamp_nanos()
- / 10_i32.pow((9 - clickhouse_schema_feature.accuracy_time).into()) as i64;
- ClickHouseField::Int64(time)
- } else {
- let time = v.truncate_micros().to_string();
- ClickHouseField::String(time)
- }
+ let time = v.get_timestamp_nanos()
+ / 10_i32.pow((9 - clickhouse_schema_feature.accuracy_time).into()) as i64;
+ ClickHouseField::Int64(time)
}
ScalarRefImpl::Timestamptz(_) => {
return Err(SinkError::ClickHouse(
@@ -628,7 +716,6 @@ impl ClickHouseFieldWithNull {
field,
clickhouse_schema_feature_vec,
clickhouse_schema_feature_index + index,
- is_insert,
)?;
struct_vec.push(ClickHouseFieldWithNull::WithoutSome(ClickHouseField::List(
a,
@@ -643,7 +730,6 @@ impl ClickHouseFieldWithNull {
i,
clickhouse_schema_feature_vec,
clickhouse_schema_feature_index,
- is_insert,
)?)
}
return Ok(vec![ClickHouseFieldWithNull::WithoutSome(
@@ -656,9 +742,7 @@ impl ClickHouseFieldWithNull {
))
}
};
- // Insert needs to be serialized with `Some`, update doesn't need to be serialized with
- // `Some`
- let data = if is_insert && clickhouse_schema_feature.can_null {
+ let data = if clickhouse_schema_feature.can_null {
vec![ClickHouseFieldWithNull::WithSome(data)]
} else {
vec![ClickHouseFieldWithNull::WithoutSome(data)]
@@ -688,6 +772,8 @@ impl Serialize for ClickHouseField {
}
s.end()
}
+ ClickHouseField::Decimal(v) => v.serialize(serializer),
+ ClickHouseField::Int8(v) => serializer.serialize_i8(*v),
}
}
}
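The new Decimal support records the ClickHouse column's declared `(precision, scale)` and shifts the rust_decimal mantissa to that scale before picking a Decimal32/64/128 wire width. A hedged arithmetic sketch of the rescaling step; the function and sample values are invented for illustration:

// `mantissa` and `scale` describe the source decimal (value = mantissa * 10^-scale);
// `target_scale` is the scale declared by the ClickHouse Decimal(P, S) column.
fn rescale(mantissa: i128, scale: u32, target_scale: u32) -> i128 {
    let diff = target_scale as i32 - scale as i32;
    if diff < 0 {
        // Column keeps fewer fractional digits: truncate.
        mantissa / 10_i128.pow(diff.unsigned_abs())
    } else {
        // Column keeps more fractional digits: pad with zeros.
        mantissa * 10_i128.pow(diff as u32)
    }
}

fn main() {
    // 12.5 (mantissa 125, scale 1) into Decimal(9, 3) is sent as 12500.
    assert_eq!(rescale(125, 1, 3), 12_500);
    // 12.3456 into Decimal(9, 2) is truncated to 12.34, i.e. 1234.
    assert_eq!(rescale(123_456, 4, 2), 1_234);
    // Width selection mirrors the precision check above:
    // P <= 9 -> Decimal32, P <= 18 -> Decimal64, otherwise Decimal128 (P > 38 is rejected).
}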
diff --git a/src/connector/src/sink/encoder/template.rs b/src/connector/src/sink/encoder/template.rs
index 85f085989b6c4..97d8271f9e83a 100644
--- a/src/connector/src/sink/encoder/template.rs
+++ b/src/connector/src/sink/encoder/template.rs
@@ -12,11 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use std::collections::HashSet;
+
+use regex::Regex;
use risingwave_common::catalog::Schema;
use risingwave_common::row::Row;
use risingwave_common::types::ToText;
use super::{Result, RowEncoder};
+use crate::sink::SinkError;
/// Encode a row according to a specified string template `user_id:{user_id}`
pub struct TemplateEncoder {
@@ -34,6 +38,24 @@ impl TemplateEncoder {
template,
}
}
+
+    pub fn check_string_format(format: &str, set: &HashSet<String>) -> Result<()> {
+ // We will check if the string inside {} corresponds to a column name in rw.
+ // In other words, the content within {} should exclusively consist of column names from rw,
+ // which means '{{column_name}}' or '{{column_name1},{column_name2}}' would be incorrect.
+ let re = Regex::new(r"\{([^}]*)\}").unwrap();
+ if !re.is_match(format) {
+ return Err(SinkError::Redis(
+                "Can't find any {...} placeholder in key_format or value_format".to_string(),
+ ));
+ }
+ for capture in re.captures_iter(format) {
+ if let Some(inner_content) = capture.get(1) && !set.contains(inner_content.as_str()){
+ return Err(SinkError::Redis(format!("Can't find field({:?}) in key_format or value_format",inner_content.as_str())))
+ }
+ }
+ Ok(())
+ }
}
impl RowEncoder for TemplateEncoder {
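`check_string_format` accepts a template only if every `{...}` placeholder names a known column, which is why `'{{column_name}}'` is rejected: the inner capture still contains a brace. A hedged standalone sketch of the same check; it assumes the `regex` crate and uses invented column names:

use std::collections::HashSet;

use regex::Regex;

fn check(format: &str, columns: &HashSet<&str>) -> Result<(), String> {
    let re = Regex::new(r"\{([^}]*)\}").unwrap();
    if !re.is_match(format) {
        return Err("no {...} placeholder found in the format string".to_string());
    }
    for cap in re.captures_iter(format) {
        // Group 1 always exists for a match of this pattern.
        let name = cap.get(1).unwrap().as_str();
        if !columns.contains(name) {
            return Err(format!("unknown field {name:?} in the format string"));
        }
    }
    Ok(())
}

fn main() {
    let columns: HashSet<&str> = ["user_id", "event"].into_iter().collect();
    assert!(check("user:{user_id}", &columns).is_ok());
    assert!(check("user:{user}", &columns).is_err()); // not a column
    assert!(check("user:{{user_id}}", &columns).is_err()); // inner capture is "{user_id"
}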
diff --git a/src/connector/src/sink/formatter/append_only.rs b/src/connector/src/sink/formatter/append_only.rs
index 523a52dab91bb..f0efcc21d9009 100644
--- a/src/connector/src/sink/formatter/append_only.rs
+++ b/src/connector/src/sink/formatter/append_only.rs
@@ -40,7 +40,7 @@ impl SinkFormatter for AppendOnlyFormatter
         &self,
         chunk: &StreamChunk,
     ) -> impl Iterator<Item = Result<(Option<Self::K>, Option<Self::V>)>> {
- std::iter::from_generator(|| {
+ std::iter::from_coroutine(|| {
for (op, row) in chunk.rows() {
if op != Op::Insert {
continue;
diff --git a/src/connector/src/sink/formatter/debezium_json.rs b/src/connector/src/sink/formatter/debezium_json.rs
index 637aa23f06410..ce98daab88756 100644
--- a/src/connector/src/sink/formatter/debezium_json.rs
+++ b/src/connector/src/sink/formatter/debezium_json.rs
@@ -85,7 +85,7 @@ impl SinkFormatter for DebeziumJsonFormatter {
&self,
chunk: &StreamChunk,
    ) -> impl Iterator<Item = Result<(Option<Self::K>, Option<Self::V>)>> {
- std::iter::from_generator(|| {
+ std::iter::from_coroutine(|| {
let DebeziumJsonFormatter {
schema,
pk_indices,
diff --git a/src/connector/src/sink/formatter/mod.rs b/src/connector/src/sink/formatter/mod.rs
index a7463f7e3b306..17cb708292890 100644
--- a/src/connector/src/sink/formatter/mod.rs
+++ b/src/connector/src/sink/formatter/mod.rs
@@ -29,6 +29,7 @@ pub use upsert::UpsertFormatter;
use super::catalog::{SinkEncode, SinkFormat, SinkFormatDesc};
use super::encoder::template::TemplateEncoder;
use super::encoder::KafkaConnectParams;
+use super::redis::{KEY_FORMAT, VALUE_FORMAT};
use crate::sink::encoder::{JsonEncoder, ProtoEncoder, TimestampHandlingMode};
/// Transforms a `StreamChunk` into a sequence of key-value pairs according a specific format,
@@ -92,7 +93,7 @@ impl SinkFormatterImpl {
let key_encoder = (!pk_indices.is_empty()).then(|| {
JsonEncoder::new(
schema.clone(),
- Some(pk_indices),
+ Some(pk_indices.clone()),
TimestampHandlingMode::Milli,
)
});
@@ -115,6 +116,28 @@ impl SinkFormatterImpl {
Ok(SinkFormatterImpl::AppendOnlyProto(formatter))
}
SinkEncode::Avro => err_unsupported(),
+ SinkEncode::Template => {
+ let key_format = format_desc.options.get(KEY_FORMAT).ok_or_else(|| {
+ SinkError::Config(anyhow!(
+                            "Cannot find 'key_format', please set it or use JSON"
+ ))
+ })?;
+ let value_format =
+ format_desc.options.get(VALUE_FORMAT).ok_or_else(|| {
+ SinkError::Config(anyhow!(
+                                "Cannot find 'value_format', please set it or use JSON"
+ ))
+ })?;
+ let key_encoder = TemplateEncoder::new(
+ schema.clone(),
+ Some(pk_indices),
+ key_format.clone(),
+ );
+ let val_encoder = TemplateEncoder::new(schema, None, value_format.clone());
+ Ok(SinkFormatterImpl::AppendOnlyTemplate(
+ AppendOnlyFormatter::new(Some(key_encoder), val_encoder),
+ ))
+ }
}
}
SinkFormat::Debezium => {
@@ -131,85 +154,66 @@ impl SinkFormatterImpl {
)))
}
SinkFormat::Upsert => {
- if format_desc.encode != SinkEncode::Json {
- return err_unsupported();
- }
+ match format_desc.encode {
+ SinkEncode::Json => {
+ let mut key_encoder = JsonEncoder::new(
+ schema.clone(),
+ Some(pk_indices),
+ TimestampHandlingMode::Milli,
+ );
+ let mut val_encoder =
+ JsonEncoder::new(schema, None, TimestampHandlingMode::Milli);
- let mut key_encoder = JsonEncoder::new(
- schema.clone(),
- Some(pk_indices),
- TimestampHandlingMode::Milli,
- );
- let mut val_encoder = JsonEncoder::new(schema, None, TimestampHandlingMode::Milli);
-
- if let Some(s) = format_desc.options.get("schemas.enable") {
- match s.to_lowercase().parse::() {
- Ok(true) => {
- let kafka_connect = KafkaConnectParams {
- schema_name: format!("{}.{}", db_name, sink_from_name),
- };
- key_encoder = key_encoder.with_kafka_connect(kafka_connect.clone());
- val_encoder = val_encoder.with_kafka_connect(kafka_connect);
- }
- Ok(false) => {}
- _ => {
- return Err(SinkError::Config(anyhow!(
- "schemas.enable is expected to be `true` or `false`, got {}",
- s
- )));
- }
+ if let Some(s) = format_desc.options.get("schemas.enable") {
+ match s.to_lowercase().parse::() {
+ Ok(true) => {
+ let kafka_connect = KafkaConnectParams {
+ schema_name: format!("{}.{}", db_name, sink_from_name),
+ };
+ key_encoder =
+ key_encoder.with_kafka_connect(kafka_connect.clone());
+ val_encoder = val_encoder.with_kafka_connect(kafka_connect);
+ }
+ Ok(false) => {}
+ _ => {
+ return Err(SinkError::Config(anyhow!(
+ "schemas.enable is expected to be `true` or `false`, got {}",
+ s
+ )));
+ }
+ }
+ };
+
+ // Initialize the upsert_stream
+ let formatter = UpsertFormatter::new(key_encoder, val_encoder);
+ Ok(SinkFormatterImpl::UpsertJson(formatter))
}
- };
-
- // Initialize the upsert_stream
- let formatter = UpsertFormatter::new(key_encoder, val_encoder);
- Ok(SinkFormatterImpl::UpsertJson(formatter))
- }
- }
- }
-
- pub fn new_with_redis(
- schema: Schema,
- pk_indices: Vec,
- is_append_only: bool,
- key_format: Option,
- value_format: Option,
- ) -> Result {
- match (key_format, value_format) {
- (Some(k), Some(v)) => {
- let key_encoder = TemplateEncoder::new(
- schema.clone(),
- Some(pk_indices),
- k,
- );
- let val_encoder =
- TemplateEncoder::new(schema, None, v);
- if is_append_only {
- Ok(SinkFormatterImpl::AppendOnlyTemplate(AppendOnlyFormatter::new(Some(key_encoder), val_encoder)))
- } else {
- Ok(SinkFormatterImpl::UpsertTemplate(UpsertFormatter::new(key_encoder, val_encoder)))
- }
- }
- (None, None) => {
- let key_encoder = JsonEncoder::new(
- schema.clone(),
- Some(pk_indices),
- TimestampHandlingMode::Milli,
- );
- let val_encoder = JsonEncoder::new(
- schema,
- None,
- TimestampHandlingMode::Milli,
- );
- if is_append_only {
- Ok(SinkFormatterImpl::AppendOnlyJson(AppendOnlyFormatter::new(Some(key_encoder), val_encoder)))
- } else {
- Ok(SinkFormatterImpl::UpsertJson(UpsertFormatter::new(key_encoder, val_encoder)))
+ SinkEncode::Template => {
+ let key_format = format_desc.options.get(KEY_FORMAT).ok_or_else(|| {
+ SinkError::Config(anyhow!(
+                            "Cannot find 'key_format', please set it or use JSON"
+ ))
+ })?;
+ let value_format =
+ format_desc.options.get(VALUE_FORMAT).ok_or_else(|| {
+ SinkError::Config(anyhow!(
+                                "Cannot find 'value_format', please set it or use JSON"
+ ))
+ })?;
+ let key_encoder = TemplateEncoder::new(
+ schema.clone(),
+ Some(pk_indices),
+ key_format.clone(),
+ );
+ let val_encoder = TemplateEncoder::new(schema, None, value_format.clone());
+ Ok(SinkFormatterImpl::UpsertTemplate(UpsertFormatter::new(
+ key_encoder,
+ val_encoder,
+ )))
+ }
+ _ => err_unsupported(),
}
}
- _ => {
- Err(SinkError::Encode("Please provide template formats for both key and value, or choose the JSON format.".to_string()))
- }
}
}
}
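The refactored constructor above is essentially a two-level dispatch on `(format, encode)`, with the Template arm additionally requiring the `key_format`/`value_format` options. A hedged sketch of that dispatch shape with toy enums (none of these names are the real types):

#[derive(Debug)]
enum Format { AppendOnly, Upsert }

#[derive(Debug)]
enum Encode { Json, Template }

fn pick(format: Format, encode: Encode, key_format: Option<&str>) -> Result<String, String> {
    match (format, encode) {
        (Format::AppendOnly, Encode::Json) => Ok("AppendOnlyJson".to_string()),
        (Format::Upsert, Encode::Json) => Ok("UpsertJson".to_string()),
        (f, Encode::Template) => {
            // Template encoding needs an explicit key format, mirroring the
            // `options.get(KEY_FORMAT).ok_or_else(..)` lookups above.
            let _key_format = key_format
                .ok_or_else(|| "Cannot find 'key_format', please set it or use JSON".to_string())?;
            Ok(format!("{f:?}Template"))
        }
    }
}

fn main() {
    assert_eq!(pick(Format::Upsert, Encode::Json, None).unwrap(), "UpsertJson");
    assert!(pick(Format::Upsert, Encode::Template, None).is_err());
    assert_eq!(
        pick(Format::AppendOnly, Encode::Template, Some("id:{id}")).unwrap(),
        "AppendOnlyTemplate"
    );
}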
diff --git a/src/connector/src/sink/formatter/upsert.rs b/src/connector/src/sink/formatter/upsert.rs
index 6ef2b5f2ca333..af8e70ff92850 100644
--- a/src/connector/src/sink/formatter/upsert.rs
+++ b/src/connector/src/sink/formatter/upsert.rs
@@ -40,7 +40,7 @@ impl SinkFormatter for UpsertFormatter {
&self,
chunk: &StreamChunk,
    ) -> impl Iterator<Item = Result<(Option<Self::K>, Option<Self::V>)>> {
- std::iter::from_generator(|| {
+ std::iter::from_coroutine(|| {
for (op, row) in chunk.rows() {
let event_key_object = Some(tri!(self.key_encoder.encode(row)));
diff --git a/src/connector/src/sink/kafka.rs b/src/connector/src/sink/kafka.rs
index a204a8d121706..f77b2b0a88c36 100644
--- a/src/connector/src/sink/kafka.rs
+++ b/src/connector/src/sink/kafka.rs
@@ -14,20 +14,18 @@
use std::collections::HashMap;
use std::fmt::Debug;
-use std::pin::pin;
use std::sync::Arc;
use std::time::Duration;
use anyhow::anyhow;
-use futures::future::{select, Either};
use futures::{Future, FutureExt, TryFuture};
use rdkafka::error::KafkaError;
use rdkafka::message::ToBytes;
use rdkafka::producer::{DeliveryFuture, FutureProducer, FutureRecord};
use rdkafka::types::RDKafkaErrorCode;
use rdkafka::ClientConfig;
+use risingwave_common::array::StreamChunk;
use risingwave_common::catalog::Schema;
-use risingwave_common::util::drop_either_future;
use serde_derive::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use strum_macros::{Display, EnumString};
@@ -37,11 +35,11 @@ use super::{Sink, SinkError, SinkParam};
use crate::common::KafkaCommon;
use crate::sink::catalog::desc::SinkDesc;
use crate::sink::formatter::SinkFormatterImpl;
-use crate::sink::log_store::{
- DeliveryFutureManager, DeliveryFutureManagerAddFuture, LogReader, LogStoreReadItem,
+use crate::sink::log_store::DeliveryFutureManagerAddFuture;
+use crate::sink::writer::{
+ AsyncTruncateLogSinkerOf, AsyncTruncateSinkWriter, AsyncTruncateSinkWriterExt, FormattedSink,
};
-use crate::sink::writer::FormattedSink;
-use crate::sink::{DummySinkCommitCoordinator, LogSinker, Result, SinkWriterParam};
+use crate::sink::{DummySinkCommitCoordinator, Result, SinkWriterParam};
use crate::source::kafka::{KafkaProperties, KafkaSplitEnumerator, PrivateLinkProducerContext};
use crate::source::{SourceEnumeratorContext, SplitEnumerator};
use crate::{
@@ -299,7 +297,7 @@ impl TryFrom for KafkaSink {
impl Sink for KafkaSink {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = KafkaLogSinker;
+ type LogSinker = AsyncTruncateLogSinkerOf;
const SINK_NAME: &'static str = KAFKA_SINK;
@@ -316,7 +314,18 @@ impl Sink for KafkaSink {
self.sink_from_name.clone(),
)
.await?;
- KafkaLogSinker::new(self.config.clone(), formatter).await
+ let max_delivery_buffer_size = (self
+ .config
+ .rdkafka_properties
+ .queue_buffering_max_messages
+ .as_ref()
+ .cloned()
+ .unwrap_or(KAFKA_WRITER_MAX_QUEUE_SIZE) as f32
+ * KAFKA_WRITER_MAX_QUEUE_SIZE_RATIO) as usize;
+
+ Ok(KafkaSinkWriter::new(self.config.clone(), formatter)
+ .await?
+ .into_log_sinker(max_delivery_buffer_size))
}
async fn validate(&self) -> Result<()> {
@@ -370,16 +379,15 @@ struct KafkaPayloadWriter<'a> {
config: &'a KafkaConfig,
}
-type KafkaSinkDeliveryFuture = impl TryFuture + Unpin + 'static;
+pub type KafkaSinkDeliveryFuture = impl TryFuture<Ok = (), Error = SinkError> + Unpin + 'static;
-pub struct KafkaLogSinker {
+pub struct KafkaSinkWriter {
formatter: SinkFormatterImpl,
inner: FutureProducer,
- future_manager: DeliveryFutureManager,
config: KafkaConfig,
}
-impl KafkaLogSinker {
+impl KafkaSinkWriter {
async fn new(config: KafkaConfig, formatter: SinkFormatterImpl) -> Result {
let inner: FutureProducer = {
let mut c = ClientConfig::new();
@@ -403,19 +411,29 @@ impl KafkaLogSinker {
c.create_with_context(producer_ctx).await?
};
- let max_delivery_buffer_size = (config
- .rdkafka_properties
- .queue_buffering_max_messages
- .as_ref()
- .cloned()
- .unwrap_or(KAFKA_WRITER_MAX_QUEUE_SIZE) as f32
- * KAFKA_WRITER_MAX_QUEUE_SIZE_RATIO) as usize;
-
- Ok(KafkaLogSinker {
+ Ok(KafkaSinkWriter {
formatter,
inner,
config: config.clone(),
- future_manager: DeliveryFutureManager::new(max_delivery_buffer_size),
+ })
+ }
+}
+
+impl AsyncTruncateSinkWriter for KafkaSinkWriter {
+ type DeliveryFuture = KafkaSinkDeliveryFuture;
+
+ async fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> Result<()> {
+ let mut payload_writer = KafkaPayloadWriter {
+ inner: &mut self.inner,
+ add_future,
+ config: &self.config,
+ };
+ dispatch_sink_formatter_impl!(&self.formatter, formatter, {
+ payload_writer.write_chunk(chunk, formatter).await
})
}
}
@@ -537,50 +555,6 @@ impl<'a> FormattedSink for KafkaPayloadWriter<'a> {
}
}
-impl LogSinker for KafkaLogSinker {
- async fn consume_log_and_sink(mut self, mut log_reader: impl LogReader) -> Result<()> {
- log_reader.init().await?;
- loop {
- let select_result = drop_either_future(
- select(
- pin!(log_reader.next_item()),
- pin!(self.future_manager.next_truncate_offset()),
- )
- .await,
- );
- match select_result {
- Either::Left(item_result) => {
- let (epoch, item) = item_result?;
- match item {
- LogStoreReadItem::StreamChunk { chunk_id, chunk } => {
- dispatch_sink_formatter_impl!(&self.formatter, formatter, {
- let mut writer = KafkaPayloadWriter {
- inner: &self.inner,
- add_future: self
- .future_manager
- .start_write_chunk(epoch, chunk_id),
- config: &self.config,
- };
- writer.write_chunk(chunk, formatter).await?;
- })
- }
- LogStoreReadItem::Barrier {
- is_checkpoint: _is_checkpoint,
- } => {
- self.future_manager.add_barrier(epoch);
- }
- LogStoreReadItem::UpdateVnodeBitmap(_) => {}
- }
- }
- Either::Right(offset_result) => {
- let offset = offset_result?;
- log_reader.truncate(offset).await?;
- }
- }
- }
- }
-}
-
#[cfg(test)]
mod test {
use maplit::hashmap;
@@ -748,7 +722,7 @@ mod test {
let kafka_config = KafkaConfig::from_hashmap(properties)?;
// Create the actual sink writer to Kafka
- let mut sink = KafkaLogSinker::new(
+ let sink = KafkaSinkWriter::new(
kafka_config.clone(),
SinkFormatterImpl::AppendOnlyJson(AppendOnlyFormatter::new(
// We do not specify primary key for this schema
@@ -759,12 +733,16 @@ mod test {
.await
.unwrap();
+ use crate::sink::log_store::DeliveryFutureManager;
+
+ let mut future_manager = DeliveryFutureManager::new(usize::MAX);
+
for i in 0..10 {
println!("epoch: {}", i);
for j in 0..100 {
let mut writer = KafkaPayloadWriter {
inner: &sink.inner,
- add_future: sink.future_manager.start_write_chunk(i, j),
+ add_future: future_manager.start_write_chunk(i, j),
config: &sink.config,
};
match writer
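The delivery-buffer bound that used to live in `KafkaLogSinker::new` is now computed in `new_log_sinker`: the configured `queue.buffering.max.messages` (or a default) scaled by a ratio. A hedged arithmetic sketch; the two constants below are placeholders chosen for illustration, not the crate's actual values:

// Both constants are illustrative placeholders, not the crate's actual values.
const KAFKA_WRITER_MAX_QUEUE_SIZE: usize = 100_000;
const KAFKA_WRITER_MAX_QUEUE_SIZE_RATIO: f32 = 1.2;

fn max_delivery_buffer_size(queue_buffering_max_messages: Option<usize>) -> usize {
    (queue_buffering_max_messages.unwrap_or(KAFKA_WRITER_MAX_QUEUE_SIZE) as f32
        * KAFKA_WRITER_MAX_QUEUE_SIZE_RATIO) as usize
}

fn main() {
    assert_eq!(max_delivery_buffer_size(None), 120_000);
    assert_eq!(max_delivery_buffer_size(Some(10_000)), 12_000);
}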
diff --git a/src/connector/src/sink/kinesis.rs b/src/connector/src/sink/kinesis.rs
index dd8518af39948..605edde3b1eb0 100644
--- a/src/connector/src/sink/kinesis.rs
+++ b/src/connector/src/sink/kinesis.rs
@@ -30,8 +30,12 @@ use super::catalog::SinkFormatDesc;
use super::SinkParam;
use crate::common::KinesisCommon;
use crate::dispatch_sink_formatter_impl;
+use crate::sink::catalog::desc::SinkDesc;
use crate::sink::formatter::SinkFormatterImpl;
-use crate::sink::writer::{FormattedSink, LogSinkerOf, SinkWriter, SinkWriterExt};
+use crate::sink::log_store::DeliveryFutureManagerAddFuture;
+use crate::sink::writer::{
+ AsyncTruncateLogSinkerOf, AsyncTruncateSinkWriter, AsyncTruncateSinkWriterExt, FormattedSink,
+};
use crate::sink::{DummySinkCommitCoordinator, Result, Sink, SinkError, SinkWriterParam};
pub const KINESIS_SINK: &str = "kinesis";
@@ -67,10 +71,14 @@ impl TryFrom for KinesisSink {
impl Sink for KinesisSink {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = LogSinkerOf;
+ type LogSinker = AsyncTruncateLogSinkerOf;
const SINK_NAME: &'static str = KINESIS_SINK;
+ fn default_sink_decouple(desc: &SinkDesc) -> bool {
+ desc.sink_type.is_append_only()
+ }
+
async fn validate(&self) -> Result<()> {
// Kinesis requires partition key. There is no builtin support for round-robin as in kafka/pulsar.
// https://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#Streams-PutRecord-request-PartitionKey
@@ -103,7 +111,7 @@ impl Sink for KinesisSink {
Ok(())
}
- async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result {
+ async fn new_log_sinker(&self, _writer_param: SinkWriterParam) -> Result {
Ok(KinesisSinkWriter::new(
self.config.clone(),
self.schema.clone(),
@@ -113,7 +121,7 @@ impl Sink for KinesisSink {
self.sink_from_name.clone(),
)
.await?
- .into_log_sinker(writer_param.sink_metrics))
+ .into_log_sinker(usize::MAX))
}
}
@@ -214,20 +222,16 @@ impl FormattedSink for KinesisSinkPayloadWriter {
}
}
-#[async_trait::async_trait]
-impl SinkWriter for KinesisSinkWriter {
- async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> {
- dispatch_sink_formatter_impl!(&self.formatter, formatter, {
+impl AsyncTruncateSinkWriter for KinesisSinkWriter {
+ async fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ _add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> Result<()> {
+ dispatch_sink_formatter_impl!(
+ &self.formatter,
+ formatter,
self.payload_writer.write_chunk(chunk, formatter).await
- })
- }
-
- async fn begin_epoch(&mut self, _epoch: u64) -> Result<()> {
- // Kinesis offers no transactional guarantees, so we do nothing here.
- Ok(())
- }
-
- async fn barrier(&mut self, _is_checkpoint: bool) -> Result<()> {
- Ok(())
+ )
}
}
diff --git a/src/connector/src/sink/mod.rs b/src/connector/src/sink/mod.rs
index 639fc8d734758..fc590d2fa6935 100644
--- a/src/connector/src/sink/mod.rs
+++ b/src/connector/src/sink/mod.rs
@@ -35,7 +35,6 @@ pub mod utils;
pub mod writer;
use std::collections::HashMap;
-use std::future::Future;
use ::clickhouse::error::Error as ClickHouseError;
use ::redis::RedisError;
@@ -280,11 +279,9 @@ pub trait Sink: TryFrom {
}
}
-pub trait LogSinker: Send + 'static {
- fn consume_log_and_sink(
- self,
- log_reader: impl LogReader,
- ) -> impl Future> + Send + 'static;
+#[async_trait]
+pub trait LogSinker: 'static {
+ async fn consume_log_and_sink(self, log_reader: impl LogReader) -> Result<()>;
}
#[async_trait]
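`LogSinker` now goes through `#[async_trait]`, which rewrites the `async fn` into a method returning a boxed Send future so the trait stays easy to implement and name. A hedged sketch of that mechanism with invented stand-ins (not the real `LogSinker`); it assumes the `async-trait` and `futures` crates:

use async_trait::async_trait;

#[async_trait]
trait LogSinkerLike: 'static {
    // The macro desugars this into a method returning a boxed Send future.
    async fn consume(self, items: Vec<u32>) -> Result<(), String>;
}

struct Printer;

#[async_trait]
impl LogSinkerLike for Printer {
    async fn consume(self, items: Vec<u32>) -> Result<(), String> {
        for item in items {
            println!("{item}");
        }
        Ok(())
    }
}

fn main() -> Result<(), String> {
    futures::executor::block_on(Printer.consume(vec![1, 2, 3]))
}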
diff --git a/src/connector/src/sink/nats.rs b/src/connector/src/sink/nats.rs
index 8e3f3e2c18022..2f810eed786a9 100644
--- a/src/connector/src/sink/nats.rs
+++ b/src/connector/src/sink/nats.rs
@@ -25,10 +25,14 @@ use tokio_retry::strategy::{jitter, ExponentialBackoff};
use tokio_retry::Retry;
use super::utils::chunk_to_json;
-use super::{DummySinkCommitCoordinator, SinkWriter, SinkWriterParam};
+use super::{DummySinkCommitCoordinator, SinkWriterParam};
use crate::common::NatsCommon;
+use crate::sink::catalog::desc::SinkDesc;
use crate::sink::encoder::{JsonEncoder, TimestampHandlingMode};
-use crate::sink::writer::{LogSinkerOf, SinkWriterExt};
+use crate::sink::log_store::DeliveryFutureManagerAddFuture;
+use crate::sink::writer::{
+ AsyncTruncateLogSinkerOf, AsyncTruncateSinkWriter, AsyncTruncateSinkWriterExt,
+};
use crate::sink::{Result, Sink, SinkError, SinkParam, SINK_TYPE_APPEND_ONLY};
pub const NATS_SINK: &str = "nats";
@@ -88,10 +92,14 @@ impl TryFrom for NatsSink {
impl Sink for NatsSink {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = LogSinkerOf;
+ type LogSinker = AsyncTruncateLogSinkerOf;
const SINK_NAME: &'static str = NATS_SINK;
+ fn default_sink_decouple(desc: &SinkDesc) -> bool {
+ desc.sink_type.is_append_only()
+ }
+
async fn validate(&self) -> Result<()> {
if !self.is_append_only {
return Err(SinkError::Nats(anyhow!(
@@ -110,11 +118,11 @@ impl Sink for NatsSink {
Ok(())
}
- async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result {
+ async fn new_log_sinker(&self, _writer_param: SinkWriterParam) -> Result {
Ok(
NatsSinkWriter::new(self.config.clone(), self.schema.clone())
.await?
- .into_log_sinker(writer_param.sink_metrics),
+ .into_log_sinker(usize::MAX),
)
}
}
@@ -153,17 +161,12 @@ impl NatsSinkWriter {
}
}
-#[async_trait::async_trait]
-impl SinkWriter for NatsSinkWriter {
- async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> {
+impl AsyncTruncateSinkWriter for NatsSinkWriter {
+ async fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ _add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> Result<()> {
self.append_only(chunk).await
}
-
- async fn begin_epoch(&mut self, _epoch_id: u64) -> Result<()> {
- Ok(())
- }
-
- async fn barrier(&mut self, _is_checkpoint: bool) -> Result<()> {
- Ok(())
- }
}
diff --git a/src/connector/src/sink/pulsar.rs b/src/connector/src/sink/pulsar.rs
index f980b2ad9f9b1..9eb57c1ae0771 100644
--- a/src/connector/src/sink/pulsar.rs
+++ b/src/connector/src/sink/pulsar.rs
@@ -12,14 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use std::collections::{HashMap, VecDeque};
+use std::collections::HashMap;
use std::fmt::Debug;
use std::time::Duration;
use anyhow::anyhow;
-use async_trait::async_trait;
-use futures::future::try_join_all;
-use futures::TryFutureExt;
+use futures::{FutureExt, TryFuture, TryFutureExt};
use pulsar::producer::{Message, SendFuture};
use pulsar::{Producer, ProducerOptions, Pulsar, TokioExecutor};
use risingwave_common::array::StreamChunk;
@@ -28,10 +26,15 @@ use serde::Deserialize;
use serde_with::{serde_as, DisplayFromStr};
use super::catalog::{SinkFormat, SinkFormatDesc};
-use super::{Sink, SinkError, SinkParam, SinkWriter, SinkWriterParam};
+use super::{Sink, SinkError, SinkParam, SinkWriterParam};
use crate::common::PulsarCommon;
-use crate::sink::formatter::SinkFormatterImpl;
-use crate::sink::writer::{FormattedSink, LogSinkerOf, SinkWriterExt};
+use crate::sink::catalog::desc::SinkDesc;
+use crate::sink::encoder::SerTo;
+use crate::sink::formatter::{SinkFormatter, SinkFormatterImpl};
+use crate::sink::log_store::DeliveryFutureManagerAddFuture;
+use crate::sink::writer::{
+ AsyncTruncateLogSinkerOf, AsyncTruncateSinkWriter, AsyncTruncateSinkWriterExt, FormattedSink,
+};
use crate::sink::{DummySinkCommitCoordinator, Result};
use crate::{deserialize_duration_from_string, dispatch_sink_formatter_impl};
@@ -155,11 +158,15 @@ impl TryFrom for PulsarSink {
impl Sink for PulsarSink {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = LogSinkerOf;
+ type LogSinker = AsyncTruncateLogSinkerOf;
const SINK_NAME: &'static str = PULSAR_SINK;
- async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result {
+ fn default_sink_decouple(desc: &SinkDesc) -> bool {
+ desc.sink_type.is_append_only()
+ }
+
+ async fn new_log_sinker(&self, _writer_param: SinkWriterParam) -> Result {
Ok(PulsarSinkWriter::new(
self.config.clone(),
self.schema.clone(),
@@ -169,7 +176,7 @@ impl Sink for PulsarSink {
self.sink_from_name.clone(),
)
.await?
- .into_log_sinker(writer_param.sink_metrics))
+ .into_log_sinker(PULSAR_SEND_FUTURE_BUFFER_MAX_SIZE))
}
async fn validate(&self) -> Result<()> {
@@ -199,15 +206,26 @@ impl Sink for PulsarSink {
}
pub struct PulsarSinkWriter {
- payload_writer: PulsarPayloadWriter,
formatter: SinkFormatterImpl,
-}
-
-struct PulsarPayloadWriter {
pulsar: Pulsar,
producer: Producer,
config: PulsarConfig,
- send_future_buffer: VecDeque,
+}
+
+struct PulsarPayloadWriter<'w> {
+ producer: &'w mut Producer,
+ config: &'w PulsarConfig,
+ add_future: DeliveryFutureManagerAddFuture<'w, PulsarDeliveryFuture>,
+}
+
+pub type PulsarDeliveryFuture = impl TryFuture<Ok = (), Error = SinkError> + Unpin + 'static;
+
+fn may_delivery_future(future: SendFuture) -> PulsarDeliveryFuture {
+ future.map(|result| {
+ result
+ .map(|_| ())
+ .map_err(|e: pulsar::Error| SinkError::Pulsar(anyhow!(e)))
+ })
}
impl PulsarSinkWriter {
@@ -226,17 +244,14 @@ impl PulsarSinkWriter {
let producer = build_pulsar_producer(&pulsar, &config).await?;
Ok(Self {
formatter,
- payload_writer: PulsarPayloadWriter {
- pulsar,
- producer,
- config,
- send_future_buffer: VecDeque::new(),
- },
+ pulsar,
+ producer,
+ config,
})
}
}
-impl PulsarPayloadWriter {
+impl<'w> PulsarPayloadWriter<'w> {
async fn send_message(&mut self, message: Message) -> Result<()> {
let mut success_flag = false;
let mut connection_err = None;
@@ -247,17 +262,10 @@ impl PulsarPayloadWriter {
// a SendFuture holding the message receipt
// or error after sending is returned
Ok(send_future) => {
- // Check if send_future_buffer is greater than the preset limit
- while self.send_future_buffer.len() >= PULSAR_SEND_FUTURE_BUFFER_MAX_SIZE {
- self.send_future_buffer
- .pop_front()
- .expect("Expect the SendFuture not to be None")
- .map_err(|e| SinkError::Pulsar(anyhow!(e)))
- .await?;
- }
-
+ self.add_future
+ .add_future_may_await(may_delivery_future(send_future))
+ .await?;
success_flag = true;
- self.send_future_buffer.push_back(send_future);
break;
}
// error upon sending
@@ -295,24 +303,9 @@ impl PulsarPayloadWriter {
self.send_message(message).await?;
Ok(())
}
-
- async fn commit_inner(&mut self) -> Result<()> {
- self.producer
- .send_batch()
- .map_err(pulsar_to_sink_err)
- .await?;
- try_join_all(
- self.send_future_buffer
- .drain(..)
- .map(|send_future| send_future.map_err(|e| SinkError::Pulsar(anyhow!(e)))),
- )
- .await?;
-
- Ok(())
- }
}
-impl FormattedSink for PulsarPayloadWriter {
+impl<'w> FormattedSink for PulsarPayloadWriter<'w> {
type K = String;
type V = Vec;
@@ -321,23 +314,33 @@ impl FormattedSink for PulsarPayloadWriter {
}
}
-#[async_trait]
-impl SinkWriter for PulsarSinkWriter {
- async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> {
+impl AsyncTruncateSinkWriter for PulsarSinkWriter {
+ type DeliveryFuture = PulsarDeliveryFuture;
+
+ async fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> Result<()> {
dispatch_sink_formatter_impl!(&self.formatter, formatter, {
- self.payload_writer.write_chunk(chunk, formatter).await
+ let mut payload_writer = PulsarPayloadWriter {
+ producer: &mut self.producer,
+ add_future,
+ config: &self.config,
+ };
+            // TODO: we could call `payload_writer.write_chunk(chunk, formatter)` here,
+            // but for an unknown reason this increases the compile time by nearly 4x.
+            // May investigate it later.
+ for r in formatter.format_chunk(&chunk) {
+ let (key, value) = r?;
+ payload_writer
+ .write_inner(
+ key.map(SerTo::ser_to).transpose()?,
+ value.map(SerTo::ser_to).transpose()?,
+ )
+ .await?;
+ }
+ Ok(())
})
}
-
- async fn begin_epoch(&mut self, _epoch: u64) -> Result<()> {
- Ok(())
- }
-
- async fn barrier(&mut self, is_checkpoint: bool) -> Result {
- if is_checkpoint {
- self.payload_writer.commit_inner().await?;
- }
-
- Ok(())
- }
}
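The deleted `send_future_buffer` code was a hand-rolled bound on in-flight deliveries: once the queue is full, await the oldest receipt before sending more. `DeliveryFutureManager` now supplies that behaviour; the hedged standalone sketch below only illustrates the backpressure idea and is not the manager's API (the `futures` crate is assumed for the executor):

use std::collections::VecDeque;
use std::future::Future;

async fn push_bounded<F>(buf: &mut VecDeque<F>, delivery: F, cap: usize) -> Result<(), String>
where
    F: Future<Output = Result<(), String>> + Unpin,
{
    // Backpressure: when the buffer is full, await the oldest receipt first.
    while buf.len() >= cap {
        buf.pop_front().expect("buffer is non-empty").await?;
    }
    buf.push_back(delivery);
    Ok(())
}

fn main() {
    futures::executor::block_on(async {
        let mut buf = VecDeque::new();
        for _ in 0..10 {
            let delivery = Box::pin(async { Ok::<(), String>(()) });
            push_bounded(&mut buf, delivery, 4).await.unwrap();
            assert!(buf.len() <= 4);
        }
        // Drain whatever is still in flight.
        while let Some(receipt) = buf.pop_front() {
            receipt.await.unwrap();
        }
    });
}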
diff --git a/src/connector/src/sink/redis.rs b/src/connector/src/sink/redis.rs
index cc8ff74d0c9c5..af3ec3b981620 100644
--- a/src/connector/src/sink/redis.rs
+++ b/src/connector/src/sink/redis.rs
@@ -18,29 +18,30 @@ use anyhow::anyhow;
use async_trait::async_trait;
use redis::aio::Connection;
use redis::{Client as RedisClient, Pipeline};
-use regex::Regex;
use risingwave_common::array::StreamChunk;
use risingwave_common::catalog::Schema;
use serde_derive::{Deserialize, Serialize};
use serde_with::serde_as;
+use super::catalog::SinkFormatDesc;
+use super::encoder::template::TemplateEncoder;
use super::formatter::SinkFormatterImpl;
use super::writer::FormattedSink;
-use super::{SinkError, SinkParam, SINK_TYPE_APPEND_ONLY, SINK_TYPE_OPTION, SINK_TYPE_UPSERT};
+use super::{SinkError, SinkParam};
use crate::dispatch_sink_formatter_impl;
-use crate::sink::writer::{LogSinkerOf, SinkWriterExt};
-use crate::sink::{DummySinkCommitCoordinator, Result, Sink, SinkWriter, SinkWriterParam};
+use crate::sink::log_store::DeliveryFutureManagerAddFuture;
+use crate::sink::writer::{
+ AsyncTruncateLogSinkerOf, AsyncTruncateSinkWriter, AsyncTruncateSinkWriterExt,
+};
+use crate::sink::{DummySinkCommitCoordinator, Result, Sink, SinkWriterParam};
pub const REDIS_SINK: &str = "redis";
-
+pub const KEY_FORMAT: &str = "key_format";
+pub const VALUE_FORMAT: &str = "value_format";
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct RedisCommon {
#[serde(rename = "redis.url")]
pub url: String,
- #[serde(rename = "redis.keyformat")]
- pub key_format: Option,
- #[serde(rename = "redis.valueformat")]
- pub value_format: Option,
}
impl RedisCommon {
@@ -54,23 +55,13 @@ impl RedisCommon {
pub struct RedisConfig {
#[serde(flatten)]
pub common: RedisCommon,
-
- pub r#type: String, // accept "append-only" or "upsert"
}
impl RedisConfig {
pub fn from_hashmap(properties: HashMap) -> Result {
let config =
serde_json::from_value::(serde_json::to_value(properties).unwrap())
- .map_err(|e| SinkError::Config(anyhow!(e)))?;
- if config.r#type != SINK_TYPE_APPEND_ONLY && config.r#type != SINK_TYPE_UPSERT {
- return Err(SinkError::Config(anyhow!(
- "`{}` must be {}, or {}",
- SINK_TYPE_OPTION,
- SINK_TYPE_APPEND_ONLY,
- SINK_TYPE_UPSERT
- )));
- }
+ .map_err(|e| SinkError::Config(anyhow!("{:?}", e)))?;
Ok(config)
}
}
@@ -79,28 +70,10 @@ impl RedisConfig {
pub struct RedisSink {
config: RedisConfig,
schema: Schema,
- is_append_only: bool,
pk_indices: Vec,
-}
-
-fn check_string_format(format: &Option, set: &HashSet) -> Result<()> {
- if let Some(format) = format {
- // We will check if the string inside {} corresponds to a column name in rw.
- // In other words, the content within {} should exclusively consist of column names from rw,
- // which means '{{column_name}}' or '{{column_name1},{column_name2}}' would be incorrect.
- let re = Regex::new(r"\{([^}]*)\}").unwrap();
- if !re.is_match(format) {
- return Err(SinkError::Redis(
- "Can't find {} in key_format or value_format".to_string(),
- ));
- }
- for capture in re.captures_iter(format) {
- if let Some(inner_content) = capture.get(1) && !set.contains(inner_content.as_str()){
- return Err(SinkError::Redis(format!("Can't find field({:?}) in key_format or value_format",inner_content.as_str())))
- }
- }
- }
- Ok(())
+ format_desc: SinkFormatDesc,
+ db_name: String,
+ sink_from_name: String,
}
#[async_trait]
@@ -117,27 +90,33 @@ impl TryFrom for RedisSink {
Ok(Self {
config,
schema: param.schema(),
- is_append_only: param.sink_type.is_append_only(),
pk_indices: param.downstream_pk,
+ format_desc: param
+ .format_desc
+ .ok_or_else(|| SinkError::Config(anyhow!("missing FORMAT ... ENCODE ...")))?,
+ db_name: param.db_name,
+ sink_from_name: param.sink_from_name,
})
}
}
impl Sink for RedisSink {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = LogSinkerOf;
+ type LogSinker = AsyncTruncateLogSinkerOf;
const SINK_NAME: &'static str = "redis";
- async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result {
+ async fn new_log_sinker(&self, _writer_param: SinkWriterParam) -> Result {
Ok(RedisSinkWriter::new(
self.config.clone(),
self.schema.clone(),
self.pk_indices.clone(),
- self.is_append_only,
+ &self.format_desc,
+ self.db_name.clone(),
+ self.sink_from_name.clone(),
)
.await?
- .into_log_sinker(writer_param.sink_metrics))
+ .into_log_sinker(usize::MAX))
}
async fn validate(&self) -> Result<()> {
@@ -157,8 +136,23 @@ impl Sink for RedisSink {
.filter(|(k, _)| self.pk_indices.contains(k))
.map(|(_, v)| v.name.clone())
.collect();
- check_string_format(&self.config.common.key_format, &pk_set)?;
- check_string_format(&self.config.common.value_format, &all_set)?;
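+ // For `ENCODE TEMPLATE`, `key_format` and `value_format` are required options;
+ // `key_format` may only reference primary-key columns, `value_format` any column.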
+ if matches!(
+ self.format_desc.encode,
+ super::catalog::SinkEncode::Template
+ ) {
+ let key_format = self.format_desc.options.get(KEY_FORMAT).ok_or_else(|| {
+ SinkError::Config(anyhow!(
+ "Cannot find 'key_format',please set it or use JSON"
+ ))
+ })?;
+ let value_format = self.format_desc.options.get(VALUE_FORMAT).ok_or_else(|| {
+ SinkError::Config(anyhow!(
+ "Cannot find 'value_format',please set it or use JSON"
+ ))
+ })?;
+ TemplateEncoder::check_string_format(key_format, &pk_set)?;
+ TemplateEncoder::check_string_format(value_format, &all_set)?;
+ }
Ok(())
}
}
@@ -166,7 +160,6 @@ impl Sink for RedisSink {
pub struct RedisSinkWriter {
epoch: u64,
schema: Schema,
- is_append_only: bool,
pk_indices: Vec<usize>,
formatter: SinkFormatterImpl,
payload_writer: RedisSinkPayloadWriter,
@@ -220,21 +213,23 @@ impl RedisSinkWriter {
config: RedisConfig,
schema: Schema,
pk_indices: Vec<usize>,
- is_append_only: bool,
+ format_desc: &SinkFormatDesc,
+ db_name: String,
+ sink_from_name: String,
) -> Result<Self> {
let payload_writer = RedisSinkPayloadWriter::new(config.clone()).await?;
- let formatter = SinkFormatterImpl::new_with_redis(
+ let formatter = SinkFormatterImpl::new(
+ format_desc,
schema.clone(),
pk_indices.clone(),
- is_append_only,
- config.common.key_format,
- config.common.value_format,
- )?;
+ db_name,
+ sink_from_name,
+ )
+ .await?;
Ok(Self {
schema,
pk_indices,
- is_append_only,
epoch: 0,
formatter,
payload_writer,
@@ -242,24 +237,22 @@ impl RedisSinkWriter {
}
#[cfg(test)]
- pub fn mock(
+ pub async fn mock(
schema: Schema,
pk_indices: Vec<usize>,
- is_append_only: bool,
- key_format: Option<String>,
- value_format: Option<String>,
+ format_desc: &SinkFormatDesc,
) -> Result<Self> {
- let formatter = SinkFormatterImpl::new_with_redis(
+ let formatter = SinkFormatterImpl::new(
+ format_desc,
schema.clone(),
pk_indices.clone(),
- is_append_only,
- key_format,
- value_format,
- )?;
+ "d1".to_string(),
+ "t1".to_string(),
+ )
+ .await?;
Ok(Self {
schema,
pk_indices,
- is_append_only,
epoch: 0,
formatter,
payload_writer: RedisSinkPayloadWriter::mock(),
@@ -267,29 +260,22 @@ impl RedisSinkWriter {
}
}
-#[async_trait]
-impl SinkWriter for RedisSinkWriter {
- async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()> {
+impl AsyncTruncateSinkWriter for RedisSinkWriter {
+ async fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ _add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> Result<()> {
dispatch_sink_formatter_impl!(&self.formatter, formatter, {
self.payload_writer.write_chunk(chunk, formatter).await
})
}
-
- async fn begin_epoch(&mut self, epoch: u64) -> Result<()> {
- self.epoch = epoch;
- Ok(())
- }
-
- async fn barrier(&mut self, is_checkpoint: bool) -> Result<()> {
- if is_checkpoint {
- self.payload_writer.commit().await?;
- }
- Ok(())
- }
}
#[cfg(test)]
mod test {
+ use std::collections::BTreeMap;
+
use rdkafka::message::FromBytes;
use risingwave_common::array::{Array, I32Array, Op, StreamChunk, Utf8Array};
use risingwave_common::catalog::{Field, Schema};
@@ -297,6 +283,8 @@ mod test {
use risingwave_common::util::iter_util::ZipEqDebug;
use super::*;
+ use crate::sink::catalog::{SinkEncode, SinkFormat};
+ use crate::sink::log_store::DeliveryFutureManager;
#[tokio::test]
async fn test_write() {
@@ -315,8 +303,15 @@ mod test {
},
]);
- let mut redis_sink_writer =
- RedisSinkWriter::mock(schema, vec![0], true, None, None).unwrap();
+ let format_desc = SinkFormatDesc {
+ format: SinkFormat::AppendOnly,
+ encode: SinkEncode::Json,
+ options: BTreeMap::default(),
+ };
+
+ let mut redis_sink_writer = RedisSinkWriter::mock(schema, vec![0], &format_desc)
+ .await
+ .unwrap();
let chunk_a = StreamChunk::new(
vec![Op::Insert, Op::Insert, Op::Insert],
@@ -326,8 +321,10 @@ mod test {
],
);
+ let mut manager = DeliveryFutureManager::new(0);
+
redis_sink_writer
- .write_batch(chunk_a)
+ .write_chunk(chunk_a, manager.start_write_chunk(0, 0))
.await
.expect("failed to write batch");
let expected_a =
@@ -367,14 +364,23 @@ mod test {
},
]);
- let mut redis_sink_writer = RedisSinkWriter::mock(
- schema,
- vec![0],
- true,
- Some("key-{id}".to_string()),
- Some("values:{id:{id},name:{name}}".to_string()),
- )
- .unwrap();
+ let mut btree_map = BTreeMap::default();
+ btree_map.insert(KEY_FORMAT.to_string(), "key-{id}".to_string());
+ btree_map.insert(
+ VALUE_FORMAT.to_string(),
+ "values:{id:{id},name:{name}}".to_string(),
+ );
+ let format_desc = SinkFormatDesc {
+ format: SinkFormat::AppendOnly,
+ encode: SinkEncode::Template,
+ options: btree_map,
+ };
+
+ let mut redis_sink_writer = RedisSinkWriter::mock(schema, vec![0], &format_desc)
+ .await
+ .unwrap();
+
+ let mut future_manager = DeliveryFutureManager::new(0);
let chunk_a = StreamChunk::new(
vec![Op::Insert, Op::Insert, Op::Insert],
@@ -385,7 +391,7 @@ mod test {
);
redis_sink_writer
- .write_batch(chunk_a)
+ .write_chunk(chunk_a, future_manager.start_write_chunk(0, 0))
.await
.expect("failed to write batch");
let expected_a = vec![
diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs
index ad182e734a33a..3c52cb720dbd4 100644
--- a/src/connector/src/sink/remote.rs
+++ b/src/connector/src/sink/remote.rs
@@ -13,17 +13,23 @@
// limitations under the License.
use std::collections::HashMap;
+use std::fmt::Formatter;
+use std::future::Future;
use std::marker::PhantomData;
use std::ops::Deref;
+use std::time::Instant;
use anyhow::anyhow;
use async_trait::async_trait;
+use futures::stream::Peekable;
+use futures::{StreamExt, TryFutureExt, TryStreamExt};
use itertools::Itertools;
use jni::objects::{JByteArray, JValue, JValueOwned};
use prost::Message;
use risingwave_common::array::StreamChunk;
use risingwave_common::error::anyhow_error;
use risingwave_common::types::DataType;
+use risingwave_common::util::await_future_with_monitor_error_stream;
use risingwave_jni_core::jvm_runtime::JVM;
use risingwave_pb::connector_service::sink_coordinator_stream_request::{
CommitMetadata, StartCoordinator,
@@ -43,15 +49,17 @@ use risingwave_pb::connector_service::{
};
use tokio::sync::mpsc;
use tokio::sync::mpsc::{Receiver, Sender};
+use tokio_stream::wrappers::ReceiverStream;
use tracing::warn;
use super::encoder::{JsonEncoder, RowEncoder};
use crate::sink::coordinate::CoordinatedSinkWriter;
use crate::sink::encoder::TimestampHandlingMode;
+use crate::sink::log_store::{LogReader, LogStoreReadItem, TruncateOffset};
use crate::sink::writer::{LogSinkerOf, SinkWriter, SinkWriterExt};
use crate::sink::{
- DummySinkCommitCoordinator, Result, Sink, SinkCommitCoordinator, SinkError, SinkMetrics,
- SinkParam, SinkWriterParam,
+ DummySinkCommitCoordinator, LogSinker, Result, Sink, SinkCommitCoordinator, SinkError,
+ SinkMetrics, SinkParam, SinkWriterParam,
};
use crate::ConnectorParams;
@@ -101,18 +109,12 @@ impl<R: RemoteSinkTrait> TryFrom<SinkParam> for RemoteSink<R> {
impl<R: RemoteSinkTrait> Sink for RemoteSink<R> {
type Coordinator = DummySinkCommitCoordinator;
- type LogSinker = LogSinkerOf<RemoteSinkWriter<R>>;
+ type LogSinker = RemoteLogSinker<R>;
const SINK_NAME: &'static str = R::SINK_NAME;
async fn new_log_sinker(&self, writer_param: SinkWriterParam) -> Result<Self::LogSinker> {
- Ok(RemoteSinkWriter::new(
- self.param.clone(),
- writer_param.connector_params,
- writer_param.sink_metrics.clone(),
- )
- .await?
- .into_log_sinker(writer_param.sink_metrics))
+ RemoteLogSinker::new(self.param.clone(), writer_param).await
}
async fn validate(&self) -> Result<()> {
@@ -192,6 +194,140 @@ impl Sink for RemoteSink {
}
}
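+/// `LogSinker` for remote (Java connector) sinks. Unlike the generic `LogSinkerOf`, it keeps
+/// watching the JNI response stream while waiting for the next log item, so an error reported
+/// by the connector surfaces immediately (see the note in `consume_log_and_sink`).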
+pub struct RemoteLogSinker<R: RemoteSinkTrait> {
+ writer: RemoteSinkWriter<R>,
+ sink_metrics: SinkMetrics,
+}
+
+impl<R: RemoteSinkTrait> RemoteLogSinker<R> {
+ async fn new(sink_param: SinkParam, writer_param: SinkWriterParam) -> Result<Self> {
+ let writer = RemoteSinkWriter::new(
+ sink_param,
+ writer_param.connector_params,
+ writer_param.sink_metrics.clone(),
+ )
+ .await?;
+ let sink_metrics = writer_param.sink_metrics;
+ Ok(RemoteLogSinker {
+ writer,
+ sink_metrics,
+ })
+ }
+}
+
+/// Await the given future while monitoring on error of the receiver stream.
+async fn await_future_with_monitor_receiver_err<O, F: Future<Output = Result<O>>>(
+ receiver: &mut SinkWriterStreamJniReceiver,
+ future: F,
+) -> Result<O> {
+ match await_future_with_monitor_error_stream(&mut receiver.response_stream, future).await {
+ Ok(result) => result,
+ Err(None) => Err(SinkError::Remote(anyhow!("end of remote receiver stream"))),
+ Err(Some(err)) => Err(SinkError::Internal(err)),
+ }
+}
+
+#[async_trait]
+impl<R: RemoteSinkTrait> LogSinker for RemoteLogSinker<R> {
+ async fn consume_log_and_sink(self, mut log_reader: impl LogReader) -> Result<()> {
+ // Note: this is a total copy of the implementation of LogSinkerOf,
+ // except that we monitor the future of `log_reader.next_item` with await_future_with_monitor_receiver_err
+ // to monitor the error in the response stream.
+
+ let mut sink_writer = self.writer;
+ let sink_metrics = self.sink_metrics;
+ #[derive(Debug)]
+ enum LogConsumerState {
+ /// Mark that the log consumer is not initialized yet
+ Uninitialized,
+
+ /// Mark that a new epoch has begun.
+ EpochBegun { curr_epoch: u64 },
+
+ /// Mark that the consumer has just received a barrier
+ BarrierReceived { prev_epoch: u64 },
+ }
+
+ let mut state = LogConsumerState::Uninitialized;
+
+ log_reader.init().await?;
+
+ loop {
+ let (epoch, item): (u64, LogStoreReadItem) = await_future_with_monitor_receiver_err(
+ &mut sink_writer.stream_handle.response_rx,
+ log_reader.next_item().map_err(SinkError::Internal),
+ )
+ .await?;
+ if let LogStoreReadItem::UpdateVnodeBitmap(_) = &item {
+ match &state {
+ LogConsumerState::BarrierReceived { .. } => {}
+ _ => unreachable!(
+ "update vnode bitmap can be accepted only right after \
+ barrier, but current state is {:?}",
+ state
+ ),
+ }
+ }
+ // begin_epoch when not previously began
+ state = match state {
+ LogConsumerState::Uninitialized => {
+ sink_writer.begin_epoch(epoch).await?;
+ LogConsumerState::EpochBegun { curr_epoch: epoch }
+ }
+ LogConsumerState::EpochBegun { curr_epoch } => {
+ assert!(
+ epoch >= curr_epoch,
+ "new epoch {} should not be below the current epoch {}",
+ epoch,
+ curr_epoch
+ );
+ LogConsumerState::EpochBegun { curr_epoch: epoch }
+ }
+ LogConsumerState::BarrierReceived { prev_epoch } => {
+ assert!(
+ epoch > prev_epoch,
+ "new epoch {} should be greater than prev epoch {}",
+ epoch,
+ prev_epoch
+ );
+ sink_writer.begin_epoch(epoch).await?;
+ LogConsumerState::EpochBegun { curr_epoch: epoch }
+ }
+ };
+ match item {
+ LogStoreReadItem::StreamChunk { chunk, .. } => {
+ if let Err(e) = sink_writer.write_batch(chunk).await {
+ sink_writer.abort().await?;
+ return Err(e);
+ }
+ }
+ LogStoreReadItem::Barrier { is_checkpoint } => {
+ let prev_epoch = match state {
+ LogConsumerState::EpochBegun { curr_epoch } => curr_epoch,
+ _ => unreachable!("epoch must have begun before handling barrier"),
+ };
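+ // Only checkpoint barriers commit the remote sink; after a successful
+ // commit, the log can be truncated up to this barrier.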
+ if is_checkpoint {
+ let start_time = Instant::now();
+ sink_writer.barrier(true).await?;
+ sink_metrics
+ .sink_commit_duration_metrics
+ .observe(start_time.elapsed().as_millis() as f64);
+ log_reader
+ .truncate(TruncateOffset::Barrier { epoch })
+ .await?;
+ } else {
+ sink_writer.barrier(false).await?;
+ }
+ state = LogConsumerState::BarrierReceived { prev_epoch }
+ }
+ LogStoreReadItem::UpdateVnodeBitmap(vnode_bitmap) => {
+ sink_writer.update_vnode_bitmap(vnode_bitmap).await?;
+ }
+ }
+ }
+ }
+}
+
#[derive(Debug)]
pub struct CoordinatedRemoteSink<R: RemoteSinkTrait>(pub RemoteSink<R>);
@@ -286,14 +422,11 @@ impl SinkCoordinatorStreamJniHandle {
}
}
-const DEFAULT_CHANNEL_SIZE: usize = 16;
-#[derive(Debug)]
-pub struct SinkWriterStreamJniHandle {
+struct SinkWriterStreamJniSender {
request_tx: Sender<SinkWriterStreamRequest>,
- response_rx: Receiver<SinkWriterStreamResponse>,
}
-impl SinkWriterStreamJniHandle {
+impl SinkWriterStreamJniSender {
pub async fn start_epoch(&mut self, epoch: u64) -> Result<()> {
self.request_tx
.send(SinkWriterStreamRequest {
@@ -316,33 +449,29 @@ impl SinkWriterStreamJniHandle {
.map_err(|err| SinkError::Internal(err.into()))
}
- pub async fn barrier(&mut self, epoch: u64) -> Result<()> {
+ pub async fn barrier(&mut self, epoch: u64, is_checkpoint: bool) -> Result<()> {
self.request_tx
.send(SinkWriterStreamRequest {
request: Some(SinkRequest::Barrier(Barrier {
epoch,
- is_checkpoint: false,
+ is_checkpoint,
})),
})
.await
.map_err(|err| SinkError::Internal(err.into()))
}
+}
- pub async fn commit(&mut self, epoch: u64) -> Result<CommitResponse> {
- self.request_tx
- .send(SinkWriterStreamRequest {
- request: Some(SinkRequest::Barrier(Barrier {
- epoch,
- is_checkpoint: true,
- })),
- })
- .await
- .map_err(|err| SinkError::Internal(err.into()))?;
+struct SinkWriterStreamJniReceiver {
+ response_stream: Peekable<ReceiverStream<anyhow::Result<SinkWriterStreamResponse>>>,
+}
- match self.response_rx.recv().await {
- Some(SinkWriterStreamResponse {
+impl SinkWriterStreamJniReceiver {
+ async fn next_commit_response(&mut self) -> Result<CommitResponse> {
+ match self.response_stream.try_next().await {
+ Ok(Some(SinkWriterStreamResponse {
response: Some(sink_writer_stream_response::Response::Commit(rsp)),
- }) => Ok(rsp),
+ })) => Ok(rsp),
msg => Err(SinkError::Internal(anyhow!(
"should get Sync response but get {:?}",
msg
@@ -351,6 +480,53 @@ impl SinkWriterStreamJniHandle {
}
}
+const DEFAULT_CHANNEL_SIZE: usize = 16;
+struct SinkWriterStreamJniHandle {
+ request_tx: SinkWriterStreamJniSender,
+ response_rx: SinkWriterStreamJniReceiver,
+}
+
+impl std::fmt::Debug for SinkWriterStreamJniHandle {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ f.debug_struct("SinkWriterStreamJniHandle").finish()
+ }
+}
+
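+// Each request is awaited together with a watch on the response stream, so a failure reported
+// by the Java side is returned right away instead of the request side waiting indefinitely.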
+impl SinkWriterStreamJniHandle {
+ async fn start_epoch(&mut self, epoch: u64) -> Result<()> {
+ await_future_with_monitor_receiver_err(
+ &mut self.response_rx,
+ self.request_tx.start_epoch(epoch),
+ )
+ .await
+ }
+
+ async fn write_batch(&mut self, epoch: u64, batch_id: u64, payload: Payload) -> Result<()> {
+ await_future_with_monitor_receiver_err(
+ &mut self.response_rx,
+ self.request_tx.write_batch(epoch, batch_id, payload),
+ )
+ .await
+ }
+
+ async fn barrier(&mut self, epoch: u64) -> Result<()> {
+ await_future_with_monitor_receiver_err(
+ &mut self.response_rx,
+ self.request_tx.barrier(epoch, false),
+ )
+ .await
+ }
+
+ async fn commit(&mut self, epoch: u64) -> Result<CommitResponse> {
+ await_future_with_monitor_receiver_err(
+ &mut self.response_rx,
+ self.request_tx.barrier(epoch, true),
+ )
+ .await?;
+ self.response_rx.next_commit_response().await
+ }
+}
+
pub type RemoteSinkWriter<R> = RemoteSinkWriterInner<(), R>;
pub type CoordinatedRemoteSinkWriter<R> = RemoteSinkWriterInner<Option<SinkMetadata>, R>;
@@ -374,10 +550,7 @@ impl RemoteSinkWriterInner {
let (request_tx, request_rx) = mpsc::channel(DEFAULT_CHANNEL_SIZE);
let (response_tx, response_rx) = mpsc::channel(DEFAULT_CHANNEL_SIZE);
- let mut stream_handle = SinkWriterStreamJniHandle {
- request_tx,
- response_rx,
- };
+ let mut response_stream = ReceiverStream::new(response_rx).peekable();
std::thread::spawn(move || {
let mut env = JVM.get_or_init().unwrap().attach_current_thread().unwrap();
@@ -388,7 +561,10 @@ impl RemoteSinkWriterInner {
"(JJ)V",
&[
JValue::from(&request_rx as *const Receiver<SinkWriterStreamRequest> as i64),
- JValue::from(&response_tx as *const Sender<SinkWriterStreamResponse> as i64),
+ JValue::from(
+ &response_tx as *const Sender<anyhow::Result<SinkWriterStreamResponse>>
+ as i64,
+ ),
],
);
@@ -410,8 +586,7 @@ impl RemoteSinkWriterInner {
};
// First request
- stream_handle
- .request_tx
+ request_tx
.send(sink_writer_stream_request)
.await
.map_err(|err| {
@@ -423,17 +598,18 @@ impl RemoteSinkWriterInner {
})?;
// First response
- match stream_handle.response_rx.recv().await {
- Some(SinkWriterStreamResponse {
+ match response_stream.try_next().await {
+ Ok(Some(SinkWriterStreamResponse {
response: Some(sink_writer_stream_response::Response::Start(_)),
- }) => {}
- msg => {
+ })) => {}
+ Ok(msg) => {
return Err(SinkError::Internal(anyhow!(
"should get start response for connector `{}` but get {:?}",
R::SINK_NAME,
msg
)));
}
+ Err(e) => return Err(SinkError::Internal(e)),
};
tracing::trace!(
@@ -444,6 +620,11 @@ impl RemoteSinkWriterInner {
let schema = param.schema();
+ let stream_handle = SinkWriterStreamJniHandle {
+ request_tx: SinkWriterStreamJniSender { request_tx },
+ response_rx: SinkWriterStreamJniReceiver { response_stream },
+ };
+
Ok(Self {
properties: param.properties,
epoch: None,
@@ -458,7 +639,7 @@ impl RemoteSinkWriterInner {
#[cfg(test)]
fn for_test(
- response_receiver: Receiver<SinkWriterStreamResponse>,
+ response_receiver: Receiver<anyhow::Result<SinkWriterStreamResponse>>,
request_sender: Sender<SinkWriterStreamRequest>,
) -> RemoteSinkWriter<R> {
use risingwave_common::catalog::{Field, Schema};
@@ -480,8 +661,12 @@ impl RemoteSinkWriterInner {
]);
let stream_handle = SinkWriterStreamJniHandle {
- request_tx: request_sender,
- response_rx: response_receiver,
+ request_tx: SinkWriterStreamJniSender {
+ request_tx: request_sender,
+ },
+ response_rx: SinkWriterStreamJniReceiver {
+ response_stream: ReceiverStream::new(response_receiver).peekable(),
+ },
};
RemoteSinkWriter {
@@ -828,12 +1013,12 @@ mod test {
// test commit
response_sender
- .send(SinkWriterStreamResponse {
+ .send(Ok(SinkWriterStreamResponse {
response: Some(Response::Commit(CommitResponse {
epoch: 2022,
metadata: None,
})),
- })
+ }))
.await
.expect("test failed: failed to sync epoch");
sink.barrier(true).await.unwrap();
diff --git a/src/connector/src/sink/writer.rs b/src/connector/src/sink/writer.rs
index 37ad452831b2e..64261bb42ab48 100644
--- a/src/connector/src/sink/writer.rs
+++ b/src/connector/src/sink/writer.rs
@@ -12,17 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use std::future::{Future, Ready};
+use std::pin::pin;
use std::sync::Arc;
use std::time::Instant;
use async_trait::async_trait;
+use futures::future::{select, Either};
+use futures::TryFuture;
use risingwave_common::array::StreamChunk;
use risingwave_common::buffer::Bitmap;
+use risingwave_common::util::drop_either_future;
use crate::sink::encoder::SerTo;
use crate::sink::formatter::SinkFormatter;
-use crate::sink::log_store::{LogReader, LogStoreReadItem, TruncateOffset};
-use crate::sink::{LogSinker, Result, SinkMetrics};
+use crate::sink::log_store::{
+ DeliveryFutureManager, DeliveryFutureManagerAddFuture, LogReader, LogStoreReadItem,
+ TruncateOffset,
+};
+use crate::sink::{LogSinker, Result, SinkError, SinkMetrics};
#[async_trait]
pub trait SinkWriter: Send + 'static {
@@ -48,22 +56,17 @@ pub trait SinkWriter: Send + 'static {
}
}
-// TODO: remove this trait after KafkaSinkWriter implements SinkWriter
-#[async_trait]
-// An old version of SinkWriter for backward compatibility
-pub trait SinkWriterV1: Send + 'static {
- async fn write_batch(&mut self, chunk: StreamChunk) -> Result<()>;
-
- // the following interface is for transactions, if not supported, return Ok(())
- // start a transaction with epoch number. Note that epoch number should be increasing.
- async fn begin_epoch(&mut self, epoch: u64) -> Result<()>;
+pub type DummyDeliveryFuture = Ready<Result<()>>;
- // commits the current transaction and marks all messages in the transaction success.
- async fn commit(&mut self) -> Result<()>;
+pub trait AsyncTruncateSinkWriter: Send + 'static {
+ type DeliveryFuture: TryFuture<Ok = (), Error = SinkError> + Unpin + Send + 'static =
+ DummyDeliveryFuture;
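+ // Writers whose deliveries complete within `write_chunk` can keep this default; only
+ // writers with truly asynchronous delivery need to supply their own future type.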
- // aborts the current transaction because some error happens. we should rollback to the last
- // commit point.
- async fn abort(&mut self) -> Result<()>;
+ fn write_chunk<'a>(
+ &'a mut self,
+ chunk: StreamChunk,
+ add_future: DeliveryFutureManagerAddFuture<'a, Self::DeliveryFuture>,
+ ) -> impl Future<Output = Result<()>> + Send + 'a;
}
/// A free-form sink that may output in multiple formats and encodings. Examples include kafka,
@@ -104,12 +107,12 @@ pub trait FormattedSink {
}
}
-pub struct LogSinkerOf<W: SinkWriter<CommitMetadata = ()>> {
+pub struct LogSinkerOf<W> {
writer: W,
sink_metrics: SinkMetrics,
}
-impl<W: SinkWriter<CommitMetadata = ()>> LogSinkerOf<W> {
+impl<W> LogSinkerOf<W> {
pub fn new(writer: W, sink_metrics: SinkMetrics) -> Self {
LogSinkerOf {
writer,
@@ -118,6 +121,7 @@ impl<W: SinkWriter<CommitMetadata = ()>> LogSinkerOf<W> {
}
}
+#[async_trait]
impl<W: SinkWriter<CommitMetadata = ()>> LogSinker for LogSinkerOf<W> {
async fn consume_log_and_sink(self, mut log_reader: impl LogReader) -> Result<()> {
let mut sink_writer = self.writer;
@@ -222,3 +226,64 @@ where
}
}
}
+
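+/// A `LogSinker` for `AsyncTruncateSinkWriter`s: the log is truncated as the writer's queued
+/// delivery futures (tracked by `DeliveryFutureManager`) resolve, rather than only at
+/// checkpoint barriers.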
+pub struct AsyncTruncateLogSinkerOf<W: AsyncTruncateSinkWriter> {
+ writer: W,
+ future_manager: DeliveryFutureManager<W::DeliveryFuture>,
+}
+
+impl<W: AsyncTruncateSinkWriter> AsyncTruncateLogSinkerOf<W> {
+ pub fn new(writer: W, max_future_count: usize) -> Self {
+ AsyncTruncateLogSinkerOf {
+ writer,
+ future_manager: DeliveryFutureManager::new(max_future_count),
+ }
+ }
+}
+
+#[async_trait]
+impl<W: AsyncTruncateSinkWriter> LogSinker for AsyncTruncateLogSinkerOf<W> {
+ async fn consume_log_and_sink(mut self, mut log_reader: impl LogReader) -> Result<()> {
+ log_reader.init().await?;
+ loop {
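+ // Race the next log item against completion of already-queued delivery futures so
+ // that slow deliveries do not block log consumption, and vice versa.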
+ let select_result = drop_either_future(
+ select(
+ pin!(log_reader.next_item()),
+ pin!(self.future_manager.next_truncate_offset()),
+ )
+ .await,
+ );
+ match select_result {
+ Either::Left(item_result) => {
+ let (epoch, item) = item_result?;
+ match item {
+ LogStoreReadItem::StreamChunk { chunk_id, chunk } => {
+ let add_future = self.future_manager.start_write_chunk(epoch, chunk_id);
+ self.writer.write_chunk(chunk, add_future).await?;
+ }
+ LogStoreReadItem::Barrier {
+ is_checkpoint: _is_checkpoint,
+ } => {
+ self.future_manager.add_barrier(epoch);
+ }
+ LogStoreReadItem::UpdateVnodeBitmap(_) => {}
+ }
+ }
+ Either::Right(offset_result) => {
+ let offset = offset_result?;
+ log_reader.truncate(offset).await?;
+ }
+ }
+ }
+ }
+}
+
+#[easy_ext::ext(AsyncTruncateSinkWriterExt)]
+impl<T> T
+where
+ T: AsyncTruncateSinkWriter + Sized,
+{
+ pub fn into_log_sinker(self, max_future_count: usize) -> AsyncTruncateLogSinkerOf<Self> {
+ AsyncTruncateLogSinkerOf::new(self, max_future_count)
+ }
+}
diff --git a/src/expr/core/src/aggregate/def.rs b/src/expr/core/src/aggregate/def.rs
index f71bfd454a415..964ec46c9f9c4 100644
--- a/src/expr/core/src/aggregate/def.rs
+++ b/src/expr/core/src/aggregate/def.rs
@@ -233,6 +233,9 @@ pub enum AggKind {
PercentileDisc,
Mode,
Grouping,
+
+ /// Return the last seen input value.
+ InternalLastSeenValue,
}
impl AggKind {
@@ -264,6 +267,7 @@ impl AggKind {
PbType::PercentileDisc => Ok(AggKind::PercentileDisc),
PbType::Mode => Ok(AggKind::Mode),
PbType::Grouping => Ok(AggKind::Grouping),
+ PbType::InternalLastSeenValue => Ok(AggKind::InternalLastSeenValue),
PbType::Unspecified => bail!("Unrecognized agg."),
}
}
@@ -294,8 +298,9 @@ impl AggKind {
Self::VarSamp => PbType::VarSamp,
Self::PercentileCont => PbType::PercentileCont,
Self::PercentileDisc => PbType::PercentileDisc,
- Self::Grouping => PbType::Grouping,
Self::Mode => PbType::Mode,
+ Self::Grouping => PbType::Grouping,
+ Self::InternalLastSeenValue => PbType::InternalLastSeenValue,
}
}
}
@@ -422,6 +427,7 @@ pub mod agg_kinds {
| AggKind::BoolAnd
| AggKind::BoolOr
| AggKind::ApproxCountDistinct
+ | AggKind::InternalLastSeenValue
};
}
pub use single_value_state;
@@ -450,7 +456,11 @@ impl AggKind {
/// Get the total phase agg kind from the partial phase agg kind.
pub fn partial_to_total(self) -> Option