diff --git a/.config/hakari.toml b/.config/hakari.toml index 4d9ec4b357346..c1355e0892334 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -34,6 +34,9 @@ workspace-members = [ ] third-party = [ { name = "opendal" }, + # For some reason, tikv-jemalloc-sys would be compiled twice if added to `workspace-hack` + { name = "tikv-jemalloc-sys", git = "https://github.com/risingwavelabs/jemallocator.git", rev = "64a2d9" }, + { name = "tikv-jemallocator", git = "https://github.com/risingwavelabs/jemallocator.git", rev = "64a2d9" }, # These are solely dev-dependencies. Unifying them may slow down build. { name = "criterion" }, { name = "console" }, diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index bae692a579559..c3a80429ee84d 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -33,3 +33,6 @@ f8266748dcb70541da944664552c1944ff8362e4 # feat(risedev): add check for trailing spaces in `risedev check` (#11294) f2a3fd021059e680b35b24c63cff5f8dbe9f9d5f + +# chore(rustfmt): format let-chains and let-else #9409 +d70dba827c303373f3220c9733f7c7443e5c2d37 \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 2dd2d9347b96b..b2d58279b5290 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -7,7 +7,10 @@ updates: open-pull-requests-limit: 10 # Disable auto rebase to reduce cost. Use `@dependabot rebase` manually instead. rebase-strategy: "disabled" - + # Ignore patch updates to reduce spam. Manually run `cargo update` regularly instead. + ignore: + - dependency-name: "*" + update-types: ["version-update:semver-patch"] # Create a group of dependencies to be updated together in one pull request groups: arrow: @@ -16,3 +19,7 @@ updates: aws: patterns: - "aws*" + tonic: + patterns: + - "tonic*" + - "prost*" diff --git a/.github/workflows/cherry-pick-to-release-branch.yml b/.github/workflows/cherry-pick-to-release-branch.yml index e98e1769630b9..026b2313d8353 100644 --- a/.github/workflows/cherry-pick-to-release-branch.yml +++ b/.github/workflows/cherry-pick-to-release-branch.yml @@ -6,8 +6,8 @@ on: types: ["closed", "labeled"] jobs: - release_pull_request_1_1: - if: "contains(github.event.pull_request.labels.*.name, 'need-cherry-pick-v1.1') && github.event.pull_request.merged == true" + release_pull_request_1_3: + if: "contains(github.event.pull_request.labels.*.name, 'need-cherry-pick-v1.3') && github.event.pull_request.merged == true" runs-on: ubuntu-latest name: release_pull_request steps: @@ -16,9 +16,9 @@ jobs: - name: Create PR to branch uses: risingwavelabs/github-action-cherry-pick@master with: - pr_branch: 'v1.1-rc' + pr_branch: 'v1.3-rc' pr_labels: 'cherry-pick' - pr_body: ${{ format('Cherry picking \#{0} onto branch v1.1-rc', github.event.number) }} + pr_body: ${{ format('Cherry picking \#{0} onto branch v1.3-rc', github.event.number) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index 66f740666e2a7..3e181eda27fce 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -14,6 +14,16 @@ jobs: build: runs-on: ubuntu-latest steps: + - name: Maximize build space + uses: easimon/maximize-build-space@master + with: + remove-dotnet: 'true' + remove-android: 'true' + remove-haskell: 'true' + remove-codeql: 'true' + remove-docker-images: 'true' + root-reserve-mb: 10240 + temp-reserve-mb: 10240 - uses: actions/checkout@v3 - name: Setup Rust toolchain run: rustup show @@ -30,6 +40,8 @@ jobs: mkdir artifact cp -R target/doc/*
artifact + - name: Show available storage + run: df -h - name: Install cargo-docset uses: taiki-e/install-action@v2 with: @@ -49,7 +61,8 @@ jobs: uses: actions/upload-pages-artifact@v1 with: path: artifact - + - name: Show available storage + run: df -h deploy: needs: build permissions: diff --git a/.github/workflows/hakari_fix.yml b/.github/workflows/hakari_fix.yml index 670ca38cccc27..b8ded582c36e9 100644 --- a/.github/workflows/hakari_fix.yml +++ b/.github/workflows/hakari_fix.yml @@ -15,7 +15,8 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.head_ref }} + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} - name: Install cargo-hakari uses: taiki-e/install-action@v2 diff --git a/.github/workflows/nightly-rust.yml b/.github/workflows/nightly-rust.yml index e6afb6970daec..5219b4805c74d 100644 --- a/.github/workflows/nightly-rust.yml +++ b/.github/workflows/nightly-rust.yml @@ -19,7 +19,15 @@ jobs: remove-haskell: 'true' remove-codeql: 'true' remove-docker-images: 'true' + root-reserve-mb: 2048 - uses: actions/checkout@v3 + if: ${{ github.event_name == 'schedule' }} + with: + # For daily scheduled run, we use a fixed branch, so that we can apply patches to fix compile errors earlier. + # We can also ensure the regression is due to new rust instead of new RisingWave code. + ref: xxchan/latest-nightly-rust + - uses: actions/checkout@v3 + if: ${{ !(github.event_name == 'schedule') }} - name: Setup Rust toolchain run: | rustup override set nightly diff --git a/.licenserc.yaml b/.licenserc.yaml index e596f12143b13..c1745a4d1ad74 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -17,5 +17,6 @@ header: - "**/*.d.ts" - "src/sqlparser/**/*.rs" - "java/connector-node/risingwave-source-cdc/src/main/java/com/risingwave/connector/cdc/debezium/internal/*.java" + - "src/meta/src/model_v2/migration/**/*.rs" comment: on-failure diff --git a/Cargo.lock b/Cargo.lock index 1df7d4f81c014..123834525d751 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,13 +49,19 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430fa9661f457981503dd5bf0" +checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" dependencies = [ "memchr", ] +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "alloc-no-stdlib" version = "2.0.4" @@ -207,7 +213,7 @@ dependencies = [ "uuid", "xz2", "zerocopy", - "zstd", + "zstd 0.12.4", ] [[package]] @@ -220,7 +226,7 @@ dependencies = [ "proc-macro2", "quote", "serde_json", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -237,9 +243,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow-arith" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "895263144bd4a69751cbe6a34a53f26626e19770b313a9fa792c415cd0e78f11" +checksum = "bc1d4e368e87ad9ee64f28b9577a3834ce10fe2703a26b28417d485bbbdff956" dependencies = [ "arrow-array", "arrow-buffer", @@ -252,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"226fdc6c3a4ae154a74c24091d36a90b514f0ed7112f5b8322c1d8f354d8e20d" +checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d" dependencies = [ "ahash 0.8.3", "arrow-buffer", @@ -268,9 +274,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4843af4dd679c2f35b69c572874da8fde33be53eb549a5fb128e7a4b763510" +checksum = "fda119225204141138cb0541c692fbfef0e875ba01bfdeaed09e9d354f9d6195" dependencies = [ "bytes", "half 2.3.1", @@ -279,9 +285,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e8b9990733a9b635f656efda3c9b8308c7a19695c9ec2c7046dd154f9b144b" +checksum = "1d825d51b9968868d50bc5af92388754056796dbc62a4e25307d588a1fc84dee" dependencies = [ "arrow-array", "arrow-buffer", @@ -296,9 +302,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da900f31ff01a0a84da0572209be72b2b6f980f3ea58803635de47913191c188" +checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a" dependencies = [ "arrow-buffer", "arrow-schema", @@ -308,29 +314,29 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e40372d37a860a742f248d4d7c137950cd793f1b46f2b99a5116c55efbe2699f" +checksum = "cd938ea4a0e8d0db2b9f47ebba792f73f6188f4289707caeaf93a3be705e5ed5" dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", "arrow-ipc", "arrow-schema", - "base64 0.21.3", + "base64 0.21.4", "bytes", "futures", "paste", - "prost", + "prost 0.12.1", "tokio", - "tonic", + "tonic 0.10.2", ] [[package]] name = "arrow-ipc" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2707a8d7ee2d345d045283ece3ae43416175873483e5d96319c929da542a0b1f" +checksum = "1248005c8ac549f869b7a840859d942bf62471479c1a2d82659d453eebcd166a" dependencies = [ "arrow-array", "arrow-buffer", @@ -340,11 +346,26 @@ dependencies = [ "flatbuffers", ] +[[package]] +name = "arrow-ord" +version = "47.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03b87aa408ea6a6300e49eb2eba0c032c88ed9dc19e0a9948489c55efdca71f4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half 2.3.1", + "num", +] + [[package]] name = "arrow-row" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e32afc1329f7b372463b21c6ca502b07cf237e1ed420d87706c1770bb0ebd38" +checksum = "114a348ab581e7c9b6908fcab23cb39ff9f060eb19e72b13f8fb8eaa37f65d22" dependencies = [ "ahash 0.8.3", "arrow-array", @@ -357,16 +378,17 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b104f5daa730f00fde22adc03a12aa5a2ae9ccbbf99cbd53d284119ddc90e03d" +checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b" [[package]] name = "arrow-select" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b3ca55356d1eae07cf48808d8c462cea674393ae6ad1e0b120f40b422eb2b4" +checksum = 
"d5c71e003202e67e9db139e5278c79f5520bb79922261dfe140e4637ee8b6108" dependencies = [ + "ahash 0.8.3", "arrow-array", "arrow-buffer", "arrow-data", @@ -380,6 +402,16 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn 1.0.109", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -404,6 +436,56 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-executor" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb" +dependencies = [ + "async-lock", + "async-task", + "concurrent-queue", + "fastrand 1.9.0", + "futures-lite", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", + "tokio", +] + +[[package]] +name = "async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock", + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-lite", + "log", + "parking", + "polling", + "rustix 0.37.23", + "slab", + "socket2 0.4.9", + "waker-fn", +] + [[package]] name = "async-lock" version = "2.8.0" @@ -415,15 +497,14 @@ dependencies = [ [[package]] name = "async-nats" -version = "0.31.0" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8257238e2a3629ee5618502a75d1b91f8017c24638c75349fc8d2d80cf1f7c4c" +checksum = "0e45b67ea596bb94741ef15ba1d90b72c92bdc07553d8033734cb620a2b39f1c" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "bytes", "futures", "http", - "itoa", "memchr", "nkeys", "nuid", @@ -431,6 +512,7 @@ dependencies = [ "rand", "regex", "ring", + "rustls 0.21.7", "rustls-native-certs", "rustls-pemfile", "rustls-webpki 0.101.4", @@ -455,7 +537,34 @@ checksum = "5fd55a5ba1179988837d24ab4c7cc8ed6efdeff578ede0416b4225a5fca35bd0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-attributes", + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", ] [[package]] @@ -477,7 +586,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -494,7 +603,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -512,6 +621,12 @@ 
version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" +[[package]] +name = "atomic-waker" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" + [[package]] name = "auto_enums" version = "0.8.2" @@ -521,7 +636,7 @@ dependencies = [ "derive_utils", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -1051,9 +1166,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] name = "base64" -version = "0.21.3" +version = "0.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" +checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" [[package]] name = "base64-simd" @@ -1071,7 +1186,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c5b0a88aa36e9f095ee2e2b13fb8c5e4313e022783aedacc123328c0084916d" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", ] [[package]] @@ -1225,6 +1340,21 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65" +dependencies = [ + "async-channel", + "async-lock", + "async-task", + "atomic-waker", + "fastrand 1.9.0", + "futures-lite", + "log", +] + [[package]] name = "borsh" version = "0.10.3" @@ -1342,9 +1472,9 @@ checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" @@ -1471,7 +1601,7 @@ checksum = "bc7cb2538d4ecc42b6c3b57a83094d8c69894e74468d18cd045a09fdea807358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -1582,7 +1712,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -1644,9 +1774,9 @@ dependencies = [ [[package]] name = "cmsketch" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "467e460587e81453bf9aeb43cd534e9c5ad670042023bd6c3f377c23b76cc2f0" +checksum = "93710598b87c37ea250ab17a36f9f79dbaf3bd20e55806cf09345103bc26d60e" dependencies = [ "paste", ] @@ -1686,7 +1816,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" dependencies = [ "bytes", + "futures-core", "memchr", + "pin-project-lite", + "tokio", + "tokio-util", ] [[package]] @@ -1725,35 +1859,36 @@ dependencies = [ [[package]] name = "console-api" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2895653b4d9f1538a83970077cb01dfc77a4810524e51a110944688e916b18e" +checksum = "fd326812b3fd01da5bb1af7d340d0d555fd3d4b641e7f1dfcf5962a902952787" dependencies = [ - "prost", - "prost-types", - "tonic", + "futures-core", + "prost 0.12.1", + "prost-types 0.12.1", + "tonic 0.10.2", "tracing-core", ] [[package]] name = "console-subscriber" 
-version = "0.1.10" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4cf42660ac07fcebed809cfe561dd8730bcd35b075215e6479c516bcd0d11cb" +checksum = "7481d4c57092cd1c19dd541b92bdce883de840df30aa5d03fd48a3935c01842e" dependencies = [ "console-api", "crossbeam-channel", "crossbeam-utils", - "futures", + "futures-task", "hdrhistogram", "humantime", - "prost-types", + "prost-types 0.12.1", "serde", "serde_json", "thread_local", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", "tracing", "tracing-core", "tracing-subscriber", @@ -1879,7 +2014,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2046,9 +2181,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.2.2" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" dependencies = [ "csv-core", "itoa", @@ -2058,9 +2193,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" dependencies = [ "memchr", ] @@ -2072,7 +2207,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f34ba9a9bcb8645379e9de8cb3ecfcf4d1c85ba66d90deb3259206fa5aa193b" dependencies = [ "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2099,7 +2234,7 @@ checksum = "83fdaf97f4804dcebfa5862639bc9ce4121e82140bec2a987ac5140294865b5b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2126,7 +2261,7 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2143,7 +2278,7 @@ checksum = "2fa16a70dd58129e4dfffdff535fb1bce66673f7bbeec4a5a1765a504e1ccd84" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2215,7 +2350,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2248,7 +2383,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core 0.20.3", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2320,6 +2455,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "derive_builder" version = "0.12.0" @@ -2359,18 +2505,19 @@ checksum = "9abcad25e9720609ccb3dcdb795d845e37d8ce34183330a9f48b03a1a71c8e21" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] name = "dialoguer" -version = "0.10.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c6f2989294b9a498d3ad5491a79c6deb604617378e1cdc4bfc1c1361fe2f87" +checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de" dependencies = [ "console", "shell-words", "tempfile", + "thiserror", "zeroize", ] @@ -2445,9 +2592,9 @@ dependencies = [ [[package]] name = "duration-str" -version = "0.5.1" +version = "0.7.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f037c488d179e21c87ef5fa9c331e8e62f5dddfa84618b41bb197da03edff1" +checksum = "5e172e85f305d6a442b250bf40667ffcb91a24f52c9a1ca59e2fa991ac9b7790" dependencies = [ "chrono", "nom", @@ -2535,7 +2682,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2575,7 +2722,7 @@ checksum = "eecf8589574ce9b895052fa12d69af7a233f99e6107f5cb8dd1044f2a17bfdcb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2588,7 +2735,7 @@ dependencies = [ "num-traits", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -2642,14 +2789,14 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.11.1" -source = "git+https://github.com/risingwavelabs/etcd-client.git?rev=d55550a#d55550a182f2119e39e64858771468e1b26f6777" +version = "0.12.1" +source = "git+https://github.com/risingwavelabs/etcd-client.git?rev=4e84d40#4e84d40a84b35718d814cc2afccc9274c9d78e1e" dependencies = [ "http", - "prost", + "prost 0.12.1", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", "tonic-build", "tower", "tower-service", @@ -2871,7 +3018,7 @@ dependencies = [ [[package]] name = "foyer" version = "0.1.0" -source = "git+https://github.com/mrcroxx/foyer?rev=41b1d39#41b1d3934cc92976737a9296273b4c5bee6422a0" +source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41" dependencies = [ "foyer-common", "foyer-intrusive", @@ -2882,21 +3029,21 @@ dependencies = [ [[package]] name = "foyer-common" version = "0.1.0" -source = "git+https://github.com/mrcroxx/foyer?rev=41b1d39#41b1d3934cc92976737a9296273b4c5bee6422a0" +source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41" dependencies = [ "bytes", "foyer-workspace-hack", + "madsim-tokio", "parking_lot 0.12.1", "paste", "rand", - "tokio", "tracing", ] [[package]] name = "foyer-intrusive" version = "0.1.0" -source = "git+https://github.com/mrcroxx/foyer?rev=41b1d39#41b1d3934cc92976737a9296273b4c5bee6422a0" +source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41" dependencies = [ "bytes", "cmsketch", @@ -2913,11 +3060,10 @@ dependencies = [ [[package]] name = "foyer-storage" version = "0.1.0" -source = "git+https://github.com/mrcroxx/foyer?rev=41b1d39#41b1d3934cc92976737a9296273b4c5bee6422a0" +source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41" dependencies = [ "anyhow", "async-channel", - "async-trait", "bitflags 2.4.0", "bitmaps", "bytes", @@ -2928,6 +3074,7 @@ dependencies = [ "futures", "itertools 0.11.0", "libc", + "madsim-tokio", "memoffset", "nix 0.27.1", "parking_lot 0.12.1", @@ -2935,7 +3082,6 @@ dependencies = [ "prometheus", "rand", "thiserror", - "tokio", "tracing", "twox-hash", ] @@ -2943,7 +3089,7 @@ dependencies = [ [[package]] name = "foyer-workspace-hack" version = "0.1.0" -source = "git+https://github.com/mrcroxx/foyer?rev=41b1d39#41b1d3934cc92976737a9296273b4c5bee6422a0" +source = "git+https://github.com/mrcroxx/foyer?rev=438eec8#438eec87e90c7a80cb53a06b711c6ea1ad7a0f41" dependencies = [ "crossbeam-utils", "either", @@ -2954,7 +3100,6 @@ dependencies = [ "hyper", "itertools 0.10.5", "libc", - "lock_api", "memchr", "parking_lot 0.12.1", "parking_lot_core 0.9.8", @@ -2962,9 +3107,9 @@ dependencies = [ "quote", "rand", "regex", - "regex-automata 0.3.8", - "regex-syntax 0.7.5", - "syn 2.0.33", + "regex-automata 0.4.1", 
+ "regex-syntax 0.8.0", + "syn 2.0.37", "tokio", "tracing", "tracing-core", @@ -3001,7 +3146,7 @@ checksum = "b0fa992f1656e1707946bbba340ad244f0814009ef8c0118eb7b658395f19a2e" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -3013,7 +3158,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -3025,7 +3170,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -3146,6 +3291,21 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.28" @@ -3154,7 +3314,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -3249,6 +3409,18 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "gloo-timers" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "google-cloud-auth" version = "0.12.0" @@ -3256,7 +3428,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "931bedb2264cb00f914b0a6a5c304e34865c34306632d3932e0951a073e4a67d" dependencies = [ "async-trait", - "base64 0.21.3", + "base64 0.21.4", "google-cloud-metadata", "google-cloud-token", "home", @@ -3282,7 +3454,7 @@ dependencies = [ "thiserror", "tokio", "tokio-retry", - "tonic", + "tonic 0.9.2", "tower", "tracing", ] @@ -3293,9 +3465,9 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5453af21ac0cc1f3b2cfb5b687c174e701c10ec2d5c286aff7ca8cbbf08d31b4" dependencies = [ - "prost", - "prost-types", - "tonic", + "prost 0.11.9", + "prost-types 0.11.9", + "tonic 0.9.2", ] [[package]] @@ -3321,7 +3493,7 @@ dependencies = [ "google-cloud-gax", "google-cloud-googleapis", "google-cloud-token", - "prost-types", + "prost-types 0.11.9", "thiserror", "tokio", "tokio-util", @@ -3682,7 +3854,7 @@ dependencies = [ [[package]] name = "icelake" version = "0.0.10" -source = "git+https://github.com/icelake-io/icelake?rev=166a36b1a40a64086db09a0e0f2ed6791cec548b#166a36b1a40a64086db09a0e0f2ed6791cec548b" +source = "git+https://github.com/icelake-io/icelake?rev=16dab0e36ab337e58ee8002d828def2d212fa116#16dab0e36ab337e58ee8002d828def2d212fa116" dependencies = [ "anyhow", "apache-avro 0.15.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -3690,6 +3862,7 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", + "arrow-ord", "arrow-row", "arrow-schema", "arrow-select", @@ -3705,7 +3878,7 @@ dependencies = [ "log", "murmur3", "once_cell", - "opendal", + "opendal 0.40.0", "ordered-float 3.9.1", "parquet", "regex", @@ -3714,7 +3887,7 @@ dependencies = [ 
"serde", "serde_bytes", "serde_json", - "serde_with 3.3.0", + "serde_with 3.4.0", "tokio", "toml 0.7.8", "url", @@ -3797,6 +3970,17 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inherent" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce243b1bfa62ffc028f1cc3b6034ec63d649f3031bc8a4fbbb004e1ac17d1f68" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.37", +] + [[package]] name = "inquire" version = "0.6.2" @@ -3856,12 +4040,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "iter-chunks" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7abddfc4e19bc38f3922e41b341fedb4e1470e922f024c4e5ae5922f56c7593" - [[package]] name = "itertools" version = "0.10.5" @@ -3959,7 +4137,7 @@ version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "pem 1.1.1", "ring", "serde", @@ -3976,6 +4154,15 @@ dependencies = [ "duct", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -4161,6 +4348,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + [[package]] name = "linux-raw-sys" version = "0.4.5" @@ -4189,6 +4382,9 @@ name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +dependencies = [ + "value-bag", +] [[package]] name = "loom" @@ -4312,21 +4508,21 @@ dependencies = [ [[package]] name = "madsim-etcd-client" -version = "0.3.0+0.11.1" +version = "0.4.0+0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c26d21c8d69c25db9d461ab7dfa4b09bd982687546c8ca2c43d743533a8f1c3f" +checksum = "02b4b5de48bb7f3f7eae0bca62b3ed0b7d714b1b273d7347329b92c3a2eef113" dependencies = [ "etcd-client", "futures-util", "http", "madsim", "serde", - "serde_with 2.3.3", + "serde_with 3.4.0", "spin 0.9.8", "thiserror", "tokio", - "toml 0.7.8", - "tonic", + "toml 0.8.2", + "tonic 0.10.2", "tracing", ] @@ -4344,8 +4540,9 @@ dependencies = [ [[package]] name = "madsim-rdkafka" -version = "0.2.22" -source = "git+https://github.com/madsim-rs/madsim.git?rev=fedb1e3#fedb1e3a0a8758650c9e15076941c999150bdb31" +version = "0.3.0+0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00f9ab2d0545a55e4f209fc72c180a7e7b45a4e7baee7b4994c4628a877c5525" dependencies = [ "async-channel", "async-trait", @@ -4378,29 +4575,31 @@ dependencies = [ [[package]] name = "madsim-tonic" -version = "0.3.1+0.9.2" +version = "0.4.0+0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66177cce816367f8358a4dc482eabff8f979cf8a1d3288d3aa8dd822fb327c69" +checksum = "3b4d847e67d6f8319d7c5393121556e2a987f5b744967a0f9b84e502020239d3" dependencies = [ "async-stream", "chrono", "futures-util", "madsim", - "tonic", + "tokio", + "tonic 
0.10.2", + "tower", "tracing", ] [[package]] name = "madsim-tonic-build" -version = "0.3.1+0.9.2" +version = "0.4.2+0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55f6b2947243e5ae6a37c7992da07cf3ed60ebeb6a3d2c1e95574a2a2697b0c0" +checksum = "4a2ad2776ba20221ccbe4e136e2fa0f7ab90eebd608373177f3e74a198a288ec" dependencies = [ - "prettyplease", + "prettyplease 0.2.15", "proc-macro2", - "prost-build", + "prost-build 0.12.1", "quote", - "syn 1.0.109", + "syn 2.0.37", "tonic-build", ] @@ -4558,6 +4757,15 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "model_migration" +version = "0.1.0" +dependencies = [ + "async-std", + "sea-orm-migration", + "uuid", +] + [[package]] name = "moka" version = "0.12.0" @@ -4593,6 +4801,12 @@ name = "multimap" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "multimap" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70db9248a93dc36a36d9a47898caa007a32755c7ad140ec64eeeb50d5a730631" dependencies = [ "serde", ] @@ -4616,7 +4830,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", "termcolor", "thiserror", ] @@ -4660,7 +4874,7 @@ version = "0.30.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57349d5a326b437989b6ee4dc8f2f34b0cc131202748414712a8e7d98952fc8c" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "bigdecimal", "bindgen", "bitflags 2.4.0", @@ -4824,11 +5038,10 @@ dependencies = [ [[package]] name = "nuid" -version = "0.3.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20c1bb65186718d348306bf1afdeb20d9ab45b2ab80fb793c0fdcf59ffbb4f38" +checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83" dependencies = [ - "lazy_static", "rand", ] @@ -5033,7 +5246,40 @@ dependencies = [ "async-compat", "async-trait", "backon", - "base64 0.21.3", + "base64 0.21.4", + "bytes", + "chrono", + "flagset", + "futures", + "http", + "hyper", + "log", + "md-5", + "once_cell", + "parking_lot 0.12.1", + "percent-encoding", + "pin-project", + "quick-xml 0.29.0", + "reqsign", + "reqwest", + "serde", + "serde_json", + "sha2", + "tokio", + "uuid", +] + +[[package]] +name = "opendal" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddba7299bab261d3ae2f37617fb7f45b19ed872752bb4e22cf93a69d979366c5" +dependencies = [ + "anyhow", + "async-compat", + "async-trait", + "backon", + "base64 0.21.4", "bytes", "chrono", "flagset", @@ -5046,6 +5292,7 @@ dependencies = [ "parking_lot 0.12.1", "percent-encoding", "pin-project", + "prometheus", "quick-xml 0.29.0", "reqsign", "reqwest", @@ -5106,7 +5353,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -5160,10 +5407,10 @@ dependencies = [ "opentelemetry-semantic-conventions", "opentelemetry_api", "opentelemetry_sdk", - "prost", + "prost 0.11.9", "thiserror", "tokio", - "tonic", + "tonic 0.9.2", ] [[package]] @@ -5174,8 +5421,8 @@ checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" dependencies = [ "opentelemetry_api", "opentelemetry_sdk", - "prost", - "tonic", + "prost 0.11.9", + "tonic 0.9.2", ] [[package]] @@ -5264,6 +5511,30 @@ dependencies = [ "windows-sys 0.48.0", ] 
+[[package]] +name = "ouroboros" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2ba07320d39dfea882faa70554b4bd342a5f273ed59ba7c1c6b4c840492c954" +dependencies = [ + "aliasable", + "ouroboros_macro", + "static_assertions", +] + +[[package]] +name = "ouroboros_macro" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec4c6225c69b4ca778c0aea097321a64c421cf4577b331c61b229267edabb6f8" +dependencies = [ + "heck 0.4.1", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.37", +] + [[package]] name = "outref" version = "0.5.1" @@ -5288,6 +5559,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384e52fd8fbd4cbe3c317e8216260c21a0f9134de108cea8a4dd4e7e152c472d" +[[package]] +name = "parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" + [[package]] name = "parking_lot" version = "0.11.2" @@ -5341,9 +5618,9 @@ dependencies = [ [[package]] name = "parquet" -version = "46.0.0" +version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad2cba786ae07da4d73371a88b9e0f9d3ffac1a9badc83922e0e15814f5c5fa" +checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" dependencies = [ "ahash 0.8.3", "arrow-array", @@ -5353,7 +5630,7 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64 0.21.3", + "base64 0.21.4", "brotli", "bytes", "chrono", @@ -5369,7 +5646,7 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd", + "zstd 0.12.4", ] [[package]] @@ -5395,7 +5672,7 @@ dependencies = [ "regex", "regex-syntax 0.7.5", "structmeta", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -5437,7 +5714,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "serde", ] @@ -5449,8 +5726,8 @@ checksum = "bdbb7b706f2afc610f3853550cdbbf6372fd324824a087806bd4480ea4996e24" dependencies = [ "heck 0.4.1", "itertools 0.10.5", - "prost", - "prost-types", + "prost 0.11.9", + "prost-types 0.11.9", ] [[package]] @@ -5474,7 +5751,7 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "serde", ] @@ -5591,7 +5868,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -5668,12 +5945,28 @@ dependencies = [ ] [[package]] -name = "portable-atomic" -version = "1.4.3" +name = "polling" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31114a898e107c51bb1609ffaf55a0e011cf6a4d7f1170d0015a165082c0338b" - -[[package]] +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" +dependencies = [ + "autocfg", + "bitflags 1.3.2", + "cfg-if", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys 0.48.0", +] + +[[package]] +name = "portable-atomic" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31114a898e107c51bb1609ffaf55a0e011cf6a4d7f1170d0015a165082c0338b" + +[[package]] name = "postgres" version = "0.19.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" @@ -5696,7 +5989,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -5705,7 +5998,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "byteorder", "bytes", "fallible-iterator", @@ -5734,9 +6027,9 @@ dependencies = [ [[package]] name = "pprof" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978385d59daf9269189d052ca8a84c1acfd0715c0599a5d5188d4acc078ca46a" +checksum = "ef5c97c51bd34c7e742402e216abdeb44d415fbe6ae41d56b114723e953711cb" dependencies = [ "backtrace", "cfg-if", @@ -5831,6 +6124,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn 2.0.37", +] + [[package]] name = "priority-queue" version = "1.3.2" @@ -5857,7 +6160,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" dependencies = [ "once_cell", - "toml_edit", + "toml_edit 0.19.15", ] [[package]] @@ -5909,7 +6212,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix 0.36.15", + "rustix 0.36.16", ] [[package]] @@ -5922,7 +6225,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix 0.36.15", + "rustix 0.36.16", ] [[package]] @@ -5944,13 +6247,13 @@ dependencies = [ [[package]] name = "prometheus-http-query" -version = "0.6.6" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7970fd6e91b5cb87e9a093657572a896d133879ced7752d2c7635beae29eaba0" +checksum = "8e7c6186f0b66203811641c88ca4e5817182caa7553868359bafa5b17d97f37f" dependencies = [ + "mime", "reqwest", "serde", - "serde_json", "time", "url", ] @@ -5962,7 +6265,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.11.9", +] + +[[package]] +name = "prost" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4fdd22f3b9c31b53c060df4a0613a1c7f062d4115a2b984dd15b1858f7e340d" +dependencies = [ + "bytes", + "prost-derive 0.12.1", ] [[package]] @@ -5976,17 +6289,39 @@ dependencies = [ "itertools 0.10.5", "lazy_static", "log", - "multimap", + "multimap 0.8.3", "petgraph", - "prettyplease", - "prost", - "prost-types", + "prettyplease 0.1.25", + "prost 0.11.9", + "prost-types 0.11.9", "regex", "syn 1.0.109", "tempfile", "which", ] +[[package]] +name = "prost-build" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bdf592881d821b83d471f8af290226c8d51402259e9bb5be7f9f8bdebbb11ac" +dependencies = [ + "bytes", + "heck 0.4.1", + "itertools 0.11.0", + "log", + "multimap 0.8.3", + "once_cell", + "petgraph", + "prettyplease 0.2.15", + "prost 0.12.1", + "prost-types 0.12.1", + "regex", + "syn 2.0.37", + "tempfile", + "which", +] + [[package]] name = "prost-derive" version = "0.11.9" @@ -6000,25 +6335,37 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prost-derive" +version = "0.12.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "265baba7fabd416cf5078179f7d2cbeca4ce7a9041111900675ea7c4cb8a4c32" +dependencies = [ + "anyhow", + "itertools 0.11.0", + "proc-macro2", + "quote", + "syn 2.0.37", +] + [[package]] name = "prost-helpers" version = "0.1.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", - "workspace-hack", + "syn 2.0.37", ] [[package]] name = "prost-reflect" -version = "0.11.5" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b823de344848e011658ac981009100818b322421676740546f8b52ed5249428" +checksum = "057237efdb71cf4b3f9396302a3d6599a92fa94063ba537b66130980ea9909f3" dependencies = [ "once_cell", - "prost", - "prost-types", + "prost 0.12.1", + "prost-types 0.12.1", ] [[package]] @@ -6027,7 +6374,16 @@ version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" dependencies = [ - "prost", + "prost 0.11.9", +] + +[[package]] +name = "prost-types" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e081b29f63d83a4bc75cfc9f3fe424f9156cf92d8a4f0c9407cce9a1b67327cf" +dependencies = [ + "prost 0.12.1", ] [[package]] @@ -6111,9 +6467,9 @@ dependencies = [ "oauth2", "openidconnect", "pem 1.1.1", - "prost", - "prost-build", - "prost-derive", + "prost 0.11.9", + "prost-build 0.11.9", + "prost-derive 0.11.9", "rand", "regex", "serde", @@ -6259,8 +6615,9 @@ dependencies = [ [[package]] name = "rdkafka-sys" -version = "4.3.0+1.9.2" -source = "git+https://github.com/MaterializeInc/rust-rdkafka?rev=8ea07c4#8ea07c4d2b96636ff093e670bc921892aee0d56a" +version = "4.6.0+2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad63c279fca41a27c231c450a2d2ad18288032e9cbb159ad16c9d96eba35aaaf" dependencies = [ "cmake", "libc", @@ -6278,12 +6635,19 @@ version = "0.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f49cdc0bb3f412bf8e7d1bd90fe1d9eb10bc5c399ba90973c14662a27b3f8ba" dependencies = [ + "async-std", + "async-trait", + "bytes", "combine", + "futures-util", "itoa", "percent-encoding", + "pin-project-lite", "ryu", "sha1_smol", "socket2 0.4.9", + "tokio", + "tokio-util", "url", ] @@ -6307,14 +6671,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.8", - "regex-syntax 0.7.5", + "regex-automata 0.4.1", + "regex-syntax 0.8.0", ] [[package]] @@ -6328,13 +6692,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.8" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.5", + "regex-syntax 0.8.0", ] [[package]] @@ -6349,6 +6713,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "regex-syntax" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "c3cbb081b9784b07cceb8824c8583f86db4814d172ab043f3c23f7dc600bf83d" + [[package]] name = "rend" version = "0.4.0" @@ -6366,7 +6736,7 @@ checksum = "3228e570df74d69d3d3236a71371f1edd748a3e4eb728ea1f29d403bc10fc727" dependencies = [ "anyhow", "async-trait", - "base64 0.21.3", + "base64 0.21.4", "chrono", "form_urlencoded", "hex", @@ -6395,7 +6765,7 @@ version = "0.11.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "bytes", "encoding_rs", "futures-core", @@ -6478,7 +6848,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "serde_with 3.3.0", + "serde_with 3.4.0", "serde_yaml", "tempfile", "tracing", @@ -6509,7 +6879,7 @@ dependencies = [ "bytes", "itertools 0.11.0", "parking_lot 0.12.1", - "prost", + "prost 0.12.1", "risingwave_common", "risingwave_hummock_sdk", "risingwave_object_store", @@ -6527,7 +6897,6 @@ dependencies = [ "anyhow", "assert_matches", "async-recursion", - "async-stream", "async-trait", "criterion", "either", @@ -6546,6 +6915,7 @@ dependencies = [ "risingwave_common", "risingwave_connector", "risingwave_expr", + "risingwave_expr_impl", "risingwave_hummock_sdk", "risingwave_pb", "risingwave_rpc_client", @@ -6560,7 +6930,6 @@ dependencies = [ "tokio-metrics", "tokio-stream", "tracing", - "uuid", "workspace-hack", ] @@ -6591,7 +6960,7 @@ dependencies = [ "risingwave_storage", "serde", "tokio-stream", - "toml 0.7.8", + "toml 0.8.2", "tracing", "tracing-subscriber", "workspace-hack", @@ -6608,8 +6977,9 @@ dependencies = [ "risingwave_compactor", "risingwave_compute", "risingwave_ctl", + "risingwave_expr_impl", "risingwave_frontend", - "risingwave_meta", + "risingwave_meta_node", "risingwave_rt", "task_stats_alloc", "tikv-jemallocator", @@ -6633,8 +7003,9 @@ dependencies = [ "risingwave_compactor", "risingwave_compute", "risingwave_ctl", + "risingwave_expr_impl", "risingwave_frontend", - "risingwave_meta", + "risingwave_meta_node", "risingwave_rt", "shell-words", "strum 0.25.0", @@ -6656,9 +7027,11 @@ dependencies = [ "arc-swap", "arrow-array", "arrow-buffer", + "arrow-cast", "arrow-schema", "async-trait", "auto_enums", + "auto_impl", "bitflags 2.4.0", "byteorder", "bytes", @@ -6705,7 +7078,7 @@ dependencies = [ "pretty_assertions", "procfs 0.15.1", "prometheus", - "prost", + "prost 0.12.1", "rand", "regex", "reqwest", @@ -6718,7 +7091,7 @@ dependencies = [ "serde_bytes", "serde_default", "serde_json", - "serde_with 3.3.0", + "serde_with 3.4.0", "smallbitset", "speedate", "static_assertions", @@ -6727,9 +7100,8 @@ dependencies = [ "sysinfo", "tempfile", "thiserror", - "tikv-jemalloc-ctl", "tinyvec", - "toml 0.7.8", + "toml 0.8.2", "tower-layer", "tower-service", "tracing", @@ -6741,6 +7113,19 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "risingwave_common_heap_profiling" +version = "1.3.0-alpha" +dependencies = [ + "anyhow", + "chrono", + "madsim-tokio", + "parking_lot 0.12.1", + "risingwave_common", + "tikv-jemalloc-ctl", + "tracing", +] + [[package]] name = "risingwave_common_proc_macro" version = "1.3.0-alpha" @@ -6750,7 +7135,6 @@ dependencies = [ "proc-macro2", "quote", "syn 1.0.109", - "workspace-hack", ] [[package]] @@ -6789,6 +7173,7 @@ dependencies = [ "risingwave_hummock_sdk", "risingwave_hummock_test", "risingwave_meta", + "risingwave_meta_node", "risingwave_object_store", "risingwave_pb", "risingwave_rpc_client", @@ -6802,22 +7187,20 @@ dependencies = [ name = "risingwave_compactor" 
version = "1.3.0-alpha" dependencies = [ - "anyhow", "async-trait", "await-tree", "clap", "madsim-tokio", "madsim-tonic", "parking_lot 0.12.1", - "prometheus", "risingwave_common", + "risingwave_common_heap_profiling", "risingwave_common_service", "risingwave_object_store", "risingwave_pb", "risingwave_rpc_client", "risingwave_storage", "serde", - "serde_json", "tracing", "workspace-hack", ] @@ -6844,9 +7227,11 @@ dependencies = [ "rand", "risingwave_batch", "risingwave_common", + "risingwave_common_heap_profiling", "risingwave_common_service", "risingwave_connector", "risingwave_hummock_sdk", + "risingwave_jni_core", "risingwave_pb", "risingwave_rpc_client", "risingwave_source", @@ -6880,8 +7265,7 @@ dependencies = [ "aws-sdk-s3", "aws-smithy-http", "aws-types", - "base64 0.21.3", - "bincode 1.3.3", + "base64 0.21.4", "byteorder", "bytes", "chrono", @@ -6889,6 +7273,7 @@ dependencies = [ "criterion", "csv", "duration-str", + "easy-ext", "enum-as-inner", "futures", "futures-async-stream", @@ -6904,25 +7289,24 @@ dependencies = [ "jsonschema-transpiler", "madsim-rdkafka", "madsim-tokio", - "madsim-tonic", "maplit", "moka", "mysql_async", "mysql_common", "nexmark", - "nkeys", "num-bigint", - "opendal", "parking_lot 0.12.1", "paste", "prometheus", - "prost", - "prost-build", + "prost 0.12.1", + "prost-build 0.12.1", "prost-reflect", - "prost-types", + "prost-types 0.12.1", "protobuf-native", "pulsar", "rand", + "redis", + "regex", "reqwest", "risingwave_common", "risingwave_jni_core", @@ -6932,15 +7316,20 @@ dependencies = [ "serde", "serde_derive", "serde_json", - "serde_with 3.3.0", + "serde_with 3.4.0", "simd-json", + "strum 0.25.0", + "strum_macros 0.25.2", "tempfile", "thiserror", "time", "tokio-retry", "tokio-stream", "tokio-util", + "tonic 0.9.2", "tracing", + "tracing-futures", + "tracing-test", "url", "urlencoding", "workspace-hack", @@ -6999,46 +7388,66 @@ dependencies = [ name = "risingwave_expr" version = "1.3.0-alpha" dependencies = [ - "aho-corasick", "anyhow", "arrow-array", "arrow-schema", "async-trait", - "auto_enums", + "auto_impl", "await-tree", "cfg-or-panic", "chrono", - "chrono-tz", - "criterion", "ctor", "downcast-rs", "easy-ext", "either", "expect-test", - "fancy-regex", - "futures", "futures-async-stream", "futures-util", - "hex", "itertools 0.11.0", "madsim-tokio", - "md5", "num-traits", "parse-display", "paste", - "regex", "risingwave_common", "risingwave_expr_macro", "risingwave_pb", "risingwave_udf", + "smallvec", + "static_assertions", + "thiserror", + "tracing", + "workspace-hack", +] + +[[package]] +name = "risingwave_expr_impl" +version = "1.3.0-alpha" +dependencies = [ + "aho-corasick", + "anyhow", + "async-trait", + "auto_enums", + "chrono", + "criterion", + "expect-test", + "fancy-regex", + "futures-async-stream", + "futures-util", + "hex", + "itertools 0.11.0", + "madsim-tokio", + "md5", + "num-traits", + "regex", + "risingwave_common", + "risingwave_expr", + "risingwave_pb", "rust_decimal", "self_cell", "serde", "serde_json", "sha1", "sha2", - "smallvec", - "static_assertions", "thiserror", "tracing", "workspace-hack", @@ -7051,7 +7460,7 @@ dependencies = [ "itertools 0.11.0", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -7082,7 +7491,6 @@ dependencies = [ "madsim-tonic", "maplit", "md5", - "more-asserts", "num-integer", "parking_lot 0.12.1", "parse-display", @@ -7100,6 +7508,7 @@ dependencies = [ "risingwave_common_service", "risingwave_connector", "risingwave_expr", + "risingwave_expr_impl", "risingwave_pb", 
"risingwave_rpc_client", "risingwave_source", @@ -7180,7 +7589,7 @@ dependencies = [ "madsim-tokio", "mockall", "parking_lot 0.12.1", - "prost", + "prost 0.12.1", "risingwave_common", "risingwave_hummock_sdk", "risingwave_pb", @@ -7192,7 +7601,8 @@ dependencies = [ name = "risingwave_java_binding" version = "0.1.0" dependencies = [ - "prost", + "jni", + "prost 0.12.1", "risingwave_common", "risingwave_expr", "risingwave_jni_core", @@ -7206,11 +7616,13 @@ name = "risingwave_jni_core" version = "0.1.0" dependencies = [ "bytes", + "cfg-or-panic", "futures", "itertools 0.11.0", "jni", "madsim-tokio", - "prost", + "paste", + "prost 0.12.1", "risingwave_common", "risingwave_expr", "risingwave_hummock_sdk", @@ -7253,18 +7665,18 @@ dependencies = [ "maplit", "memcomparable", "mime_guess", + "model_migration", "num-integer", "num-traits", "parking_lot 0.12.1", "prometheus", "prometheus-http-query", - "prost", + "prost 0.12.1", "rand", - "regex", "reqwest", "risingwave_backup", "risingwave_common", - "risingwave_common_service", + "risingwave_common_heap_profiling", "risingwave_connector", "risingwave_hummock_sdk", "risingwave_object_store", @@ -7273,12 +7685,10 @@ dependencies = [ "risingwave_sqlparser", "risingwave_test_runner", "scopeguard", + "sea-orm", "serde", "serde_json", - "sqlx", - "static_assertions", "sync-point", - "tempfile", "thiserror", "tokio-retry", "tokio-stream", @@ -7290,6 +7700,56 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "risingwave_meta_node" +version = "1.3.0-alpha" +dependencies = [ + "anyhow", + "clap", + "either", + "futures", + "itertools 0.11.0", + "madsim-etcd-client", + "madsim-tokio", + "madsim-tonic", + "model_migration", + "prometheus-http-query", + "regex", + "risingwave_common", + "risingwave_common_heap_profiling", + "risingwave_common_service", + "risingwave_meta", + "risingwave_meta_service", + "risingwave_pb", + "risingwave_rpc_client", + "sea-orm", + "tracing", + "workspace-hack", +] + +[[package]] +name = "risingwave_meta_service" +version = "1.3.0-alpha" +dependencies = [ + "anyhow", + "async-trait", + "either", + "futures", + "itertools 0.11.0", + "madsim-tokio", + "madsim-tonic", + "regex", + "risingwave_common", + "risingwave_connector", + "risingwave_meta", + "risingwave_pb", + "sea-orm", + "sync-point", + "tokio-stream", + "tracing", + "workspace-hack", +] + [[package]] name = "risingwave_object_store" version = "1.3.0-alpha" @@ -7309,7 +7769,7 @@ dependencies = [ "itertools 0.11.0", "madsim-aws-sdk-s3", "madsim-tokio", - "opendal", + "opendal 0.39.0", "prometheus", "risingwave_common", "spin 0.9.8", @@ -7328,9 +7788,10 @@ dependencies = [ "madsim-tonic-build", "pbjson", "pbjson-build", - "prost", + "prost 0.12.1", "prost-helpers", "serde", + "strum 0.25.0", "walkdir", "workspace-hack", ] @@ -7345,10 +7806,11 @@ dependencies = [ "libtest-mimic", "madsim-tokio", "paste", + "risingwave_expr_impl", "risingwave_frontend", "risingwave_sqlparser", "serde", - "serde_with 3.3.0", + "serde_with 3.4.0", "serde_yaml", "tempfile", "walkdir", @@ -7403,7 +7865,6 @@ name = "risingwave_rt" version = "1.3.0-alpha" dependencies = [ "await-tree", - "chrono", "console", "console-subscriber", "either", @@ -7415,7 +7876,6 @@ dependencies = [ "opentelemetry-semantic-conventions", "parking_lot 0.12.1", "pprof", - "prometheus", "risingwave_common", "risingwave_variables", "rlimit", @@ -7455,8 +7915,9 @@ dependencies = [ "risingwave_connector", "risingwave_ctl", "risingwave_e2e_extended_mode_test", + "risingwave_expr_impl", "risingwave_frontend", - 
"risingwave_meta", + "risingwave_meta_node", "risingwave_pb", "risingwave_rpc_client", "risingwave_sqlparser", @@ -7468,6 +7929,7 @@ dependencies = [ "tempfile", "tikv-jemallocator", "tokio-postgres", + "tokio-stream", "tracing", "tracing-subscriber", ] @@ -7479,7 +7941,6 @@ dependencies = [ "anyhow", "assert_matches", "criterion", - "easy-ext", "futures", "futures-async-stream", "itertools 0.11.0", @@ -7516,7 +7977,7 @@ dependencies = [ "madsim-tokio", "risingwave_sqlparser", "serde", - "serde_with 3.3.0", + "serde_with 3.4.0", "serde_yaml", "walkdir", "workspace-hack", @@ -7538,6 +7999,7 @@ dependencies = [ "regex", "risingwave_common", "risingwave_expr", + "risingwave_expr_impl", "risingwave_frontend", "risingwave_pb", "risingwave_sqlparser", @@ -7560,10 +8022,10 @@ dependencies = [ "regex", "risingwave_rt", "serde", - "serde_with 3.3.0", + "serde_with 3.4.0", "tokio-postgres", "tokio-stream", - "toml 0.7.8", + "toml 0.8.2", "tracing", "workspace-hack", ] @@ -7572,7 +8034,6 @@ dependencies = [ name = "risingwave_storage" version = "1.3.0-alpha" dependencies = [ - "anyhow", "arc-swap", "async-trait", "auto_enums", @@ -7596,6 +8057,7 @@ dependencies = [ "lz4", "mach2", "madsim-tokio", + "madsim-tonic", "memcomparable", "moka", "more-asserts", @@ -7604,7 +8066,7 @@ dependencies = [ "parking_lot 0.12.1", "procfs 0.15.1", "prometheus", - "prost", + "prost 0.12.1", "rand", "risingwave_backup", "risingwave_common", @@ -7629,7 +8091,7 @@ dependencies = [ "workspace-hack", "xorf", "xxhash-rust", - "zstd", + "zstd 0.13.0", ] [[package]] @@ -7644,7 +8106,6 @@ dependencies = [ "await-tree", "bytes", "criterion", - "dyn-clone", "educe", "either", "enum-as-inner", @@ -7653,7 +8114,6 @@ dependencies = [ "futures-async-stream", "governor", "hytra", - "iter-chunks", "itertools 0.11.0", "local_stats_alloc", "lru 0.7.6", @@ -7661,18 +8121,16 @@ dependencies = [ "madsim-tonic", "maplit", "memcomparable", - "multimap", - "num-traits", + "multimap 0.9.0", "parking_lot 0.12.1", - "parse-display", "pin-project", "prometheus", - "prost", + "prost 0.12.1", "rand", "risingwave_common", "risingwave_connector", "risingwave_expr", - "risingwave_frontend", + "risingwave_expr_impl", "risingwave_hummock_sdk", "risingwave_hummock_test", "risingwave_pb", @@ -7683,7 +8141,6 @@ dependencies = [ "serde_json", "serde_yaml", "smallvec", - "spin 0.9.8", "static_assertions", "task_stats_alloc", "thiserror", @@ -7842,9 +8299,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.15" +version = "0.36.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c37f1bd5ef1b5422177b7646cba67430579cfe2ace80f284fee876bca52ad941" +checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" dependencies = [ "bitflags 1.3.2", "errno", @@ -7854,6 +8311,20 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "rustix" +version = "0.37.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +dependencies = [ + "bitflags 1.3.2", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.8", + "windows-sys 0.48.0", +] + [[package]] name = "rustix" version = "0.38.11" @@ -7909,7 +8380,7 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", ] [[package]] @@ -8021,6 +8492,165 @@ dependencies = [ "untrusted", ] +[[package]] +name 
= "sea-bae" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd3534a9978d0aa7edd2808dc1f8f31c4d0ecd31ddf71d997b3c98e9f3c9114" +dependencies = [ + "heck 0.4.1", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.37", +] + +[[package]] +name = "sea-orm" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61f6c7daef05dde3476d97001e11fca7a52b655aa3bf4fd610ab2da1176a2ed5" +dependencies = [ + "async-stream", + "async-trait", + "bigdecimal", + "chrono", + "futures", + "log", + "ouroboros", + "rust_decimal", + "sea-orm-macros", + "sea-query", + "sea-query-binder", + "serde", + "serde_json", + "sqlx", + "strum 0.25.0", + "thiserror", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "sea-orm-cli" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e3f0ff2fa5672e2e7314d107c6498a18e469beeb340a0ed84e3075fce73c2cd" +dependencies = [ + "chrono", + "clap", + "dotenvy", + "glob", + "regex", + "sea-schema", + "tracing", + "tracing-subscriber", + "url", +] + +[[package]] +name = "sea-orm-macros" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd90e73d5f5b184bad525767da29fbfec132b4e62ebd6f60d2f2737ec6468f62" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "sea-bae", + "syn 2.0.37", + "unicode-ident", +] + +[[package]] +name = "sea-orm-migration" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21f673fcefb3a7e7b89a12b6c0e854ec0be14367635ac3435369c8ad7f11e09e" +dependencies = [ + "async-trait", + "clap", + "dotenvy", + "futures", + "sea-orm", + "sea-orm-cli", + "sea-schema", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "sea-query" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28c05a5bf6403834be253489bbe95fa9b1e5486bc843b61f60d26b5c9c1e244b" +dependencies = [ + "bigdecimal", + "chrono", + "derivative", + "inherent", + "ordered-float 3.9.1", + "rust_decimal", + "sea-query-derive", + "serde_json", + "time", + "uuid", +] + +[[package]] +name = "sea-query-binder" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36bbb68df92e820e4d5aeb17b4acd5cc8b5d18b2c36a4dd6f4626aabfa7ab1b9" +dependencies = [ + "bigdecimal", + "chrono", + "rust_decimal", + "sea-query", + "serde_json", + "sqlx", + "time", + "uuid", +] + +[[package]] +name = "sea-query-derive" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd78f2e0ee8e537e9195d1049b752e0433e2cac125426bccb7b5c3e508096117" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", + "thiserror", +] + +[[package]] +name = "sea-schema" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cd9561232bd1b82ea748b581f15909d11de0db6563ddcf28c5d908aee8282f1" +dependencies = [ + "futures", + "sea-query", + "sea-schema-derive", +] + +[[package]] +name = "sea-schema-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f686050f76bffc4f635cda8aea6df5548666b830b52387e8bc7de11056d11e" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "seahash" version = "4.1.0" @@ -8131,7 +8761,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -8193,7 +8823,7 @@ checksum = "8725e1dfadb3a50f7e5ce0b1a540466f6ed3fe7a0fca2ac2b8b831d31316bd00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -8229,34 +8859,18 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" -dependencies = [ - "base64 0.13.1", - "chrono", - "hex", - "indexmap 1.9.3", - "serde", - "serde_json", - "serde_with_macros 2.3.3", - "time", -] - -[[package]] -name = "serde_with" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ca3b16a3d82c4088f343b7480a93550b3eabe1a358569c2dfe38bbcead07237" +checksum = "64cd236ccc1b7a29e7e2739f27c0b2dd199804abc4290e32f59f3b68d6405c23" dependencies = [ - "base64 0.21.3", + "base64 0.21.4", "chrono", "hex", "indexmap 1.9.3", "indexmap 2.0.0", "serde", "serde_json", - "serde_with_macros 3.3.0", + "serde_with_macros 3.4.0", "time", ] @@ -8274,26 +8888,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.3.3" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f" +checksum = "93634eb5f75a2323b16de4748022ac4297f9e76b6dced2be287a099f41b5e788" dependencies = [ "darling 0.20.3", "proc-macro2", "quote", - "syn 2.0.33", -] - -[[package]] -name = "serde_with_macros" -version = "3.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e6be15c453eb305019bfa438b1593c731f36a289a7853f7707ee29e870b3b3c" -dependencies = [ - "darling 0.20.3", - "proc-macro2", - "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -8331,7 +8933,7 @@ checksum = "91d129178576168c589c9ec973feedf7d3126c01ac2bf08795109aa35b69fb8f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -8447,10 +9049,11 @@ dependencies = [ [[package]] name = "simd-json" -version = "0.10.6" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de7f1293f0e4e11d52e588766fe9de8caa2857ff63809d40de83245452ca7c5c" +checksum = "f0f07a84c7456b901b8dd2c1d44caca8b0fd2c2616206ee5acc9d9da61e8d9ec" dependencies = [ + "getrandom", "halfbrown", "lexical-core", "serde", @@ -8467,9 +9070,9 @@ checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" [[package]] name = "similar" -version = "2.2.1" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420acb44afdae038210c99e69aae24109f32f15500aa708e81d46c9f29d55fcf" +checksum = "2aeaf503862c419d66959f5d7ca015337d864e9c49485d771b732e2a20453597" [[package]] name = "simple_asn1" @@ -8546,9 +9149,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" dependencies = [ "serde", ] @@ -8670,6 +9273,7 @@ checksum = "dd4cef4251aabbae751a3710927945901ee1d97ee96d757f6880ebb9a79bfd53" dependencies = [ "ahash 0.8.3", "atoi", + "bigdecimal", "byteorder", "bytes", "chrono", @@ -8688,19 +9292,23 @@ dependencies = [ "indexmap 2.0.0", "log", "memchr", + "native-tls", "once_cell", "paste", 
"percent-encoding", + "rust_decimal", "serde", "serde_json", "sha2", "smallvec", "sqlformat", "thiserror", + "time", "tokio", "tokio-stream", "tracing", "url", + "uuid", ] [[package]] @@ -8749,7 +9357,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ca69bf415b93b60b80dc8fda3cb4ef52b2336614d8da2de5456cc942a110482" dependencies = [ "atoi", - "base64 0.21.3", + "base64 0.21.4", + "bigdecimal", "bitflags 2.4.0", "byteorder", "bytes", @@ -8774,6 +9383,7 @@ dependencies = [ "percent-encoding", "rand", "rsa", + "rust_decimal", "serde", "sha1", "sha2", @@ -8781,7 +9391,9 @@ dependencies = [ "sqlx-core", "stringprep", "thiserror", + "time", "tracing", + "uuid", "whoami", ] @@ -8792,7 +9404,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0db2df1b8731c3651e204629dd55e52adbae0462fa1bdcbed56a2302c18181e" dependencies = [ "atoi", - "base64 0.21.3", + "base64 0.21.4", + "bigdecimal", "bitflags 2.4.0", "byteorder", "chrono", @@ -8811,8 +9424,10 @@ dependencies = [ "log", "md-5", "memchr", + "num-bigint", "once_cell", "rand", + "rust_decimal", "serde", "serde_json", "sha1", @@ -8821,7 +9436,9 @@ dependencies = [ "sqlx-core", "stringprep", "thiserror", + "time", "tracing", + "uuid", "whoami", ] @@ -8844,8 +9461,10 @@ dependencies = [ "percent-encoding", "serde", "sqlx-core", + "time", "tracing", "url", + "uuid", ] [[package]] @@ -8892,7 +9511,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -8903,7 +9522,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -8944,7 +9563,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -9009,9 +9628,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.33" +version = "2.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9caece70c63bfba29ec2fed841a09851b14a235c60010fa4de58089b6c025668" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" dependencies = [ "proc-macro2", "quote", @@ -9113,7 +9732,7 @@ checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -9287,7 +9906,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -9422,7 +10041,19 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_edit 0.19.15", +] + +[[package]] +name = "toml" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "185d8ab0dfbb35cf1399a6344d8484209c088f75f8f68230da55d48d95d43e3d" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.20.2", ] [[package]] @@ -9447,6 +10078,19 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_edit" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "396e4d48bbb2b7554c944bde63101b5ae446cff6ec4a24227428f15eb72ef338" +dependencies = [ + "indexmap 2.0.0", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tonic" version = "0.9.2" @@ -9456,7 +10100,7 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64 0.21.3", + "base64 0.21.4", "bytes", "flate2", 
"futures-core", @@ -9468,7 +10112,7 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost", + "prost 0.11.9", "rustls-pemfile", "tokio", "tokio-rustls 0.24.1", @@ -9480,17 +10124,44 @@ dependencies = [ "webpki-roots 0.23.1", ] +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.21.4", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost 0.12.1", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" dependencies = [ - "prettyplease", + "prettyplease 0.2.15", "proc-macro2", - "prost-build", + "prost-build 0.12.1", "quote", - "syn 1.0.109", + "syn 2.0.37", ] [[package]] @@ -9571,7 +10242,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -9820,9 +10491,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc" dependencies = [ "getrandom", "rand", @@ -9835,6 +10506,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "value-bag" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d92ccd67fb88503048c01b59152a04effd0782d035a83a6d256ce6085f08f4a3" + [[package]] name = "value-trait" version = "0.6.1" @@ -9891,6 +10568,12 @@ dependencies = [ "libc", ] +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + [[package]] name = "walkdir" version = "2.4.0" @@ -9937,7 +10620,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", "wasm-bindgen-shared", ] @@ -9971,7 +10654,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -10007,9 +10690,9 @@ dependencies = [ [[package]] name = "webpki" -version = "0.22.1" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" +checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" dependencies = [ "ring", "untrusted", @@ -10261,11 +10944,12 @@ dependencies = [ "ahash 0.8.3", "allocator-api2", "anyhow", + "async-std", "auto_enums", "aws-credential-types", "aws-sdk-s3", "aws-smithy-client", - "base64 0.21.3", + "base64 0.21.4", "bit-vec", "bitflags 2.4.0", "byteorder", @@ 
-10294,10 +10978,10 @@ dependencies = [ "futures-util", "hashbrown 0.12.3", "hashbrown 0.14.0", - "heck 0.4.1", "hyper", "indexmap 1.9.3", "itertools 0.10.5", + "itertools 0.11.0", "jni", "lazy_static", "lexical-core", @@ -10313,7 +10997,6 @@ dependencies = [ "madsim-tokio", "md-5", "mio", - "multimap", "nom", "num-bigint", "num-integer", @@ -10321,6 +11004,7 @@ dependencies = [ "num-traits", "opentelemetry_api", "opentelemetry_sdk", + "ordered-float 3.9.1", "parking_lot 0.12.1", "parking_lot_core 0.9.8", "petgraph", @@ -10329,31 +11013,40 @@ dependencies = [ "postgres-types", "proc-macro2", "prometheus", - "prost", + "prost 0.11.9", + "prost 0.12.1", + "prost-types 0.12.1", "rand", "rand_chacha", "rand_core", + "redis", "regex", - "regex-automata 0.3.8", - "regex-syntax 0.7.5", + "regex-automata 0.4.1", + "regex-syntax 0.8.0", "reqwest", "ring", "rust_decimal", + "rustls 0.21.7", "scopeguard", + "sea-orm", + "sea-query", + "sea-query-binder", "serde", "serde_json", - "serde_with 3.3.0", + "serde_with 3.4.0", "sha1", "sha2", "signature", "smallvec", + "sqlx", "sqlx-core", "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", + "strum 0.25.0", "subtle", "syn 1.0.109", - "syn 2.0.33", + "syn 2.0.37", "time", "time-macros", "tinyvec", @@ -10362,8 +11055,9 @@ dependencies = [ "tokio-stream", "tokio-util", "toml_datetime", - "toml_edit", - "tonic", + "toml_edit 0.19.15", + "tonic 0.10.2", + "tonic 0.9.2", "tower", "tracing", "tracing-core", @@ -10392,9 +11086,9 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" [[package]] name = "xorf" -version = "0.8.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57901b00e3f8e14f4d20b8955bf8087ecb545cfe2ed8741c2a2dbc89847a1a29" +checksum = "7d36478bcf71152a2f9f6cf9bc48273333f32780c769ef90e13d464ab778db5f" dependencies = [ "libm", "rand", @@ -10448,7 +11142,7 @@ checksum = "56097d5b91d711293a42be9289403896b68654625021732067eac7a4ca388a1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.33", + "syn 2.0.37", ] [[package]] @@ -10463,7 +11157,16 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe", + "zstd-safe 6.0.6", +] + +[[package]] +name = "zstd" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" +dependencies = [ + "zstd-safe 7.0.0", ] [[package]] @@ -10476,6 +11179,15 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "zstd-safe" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" +dependencies = [ + "zstd-sys", +] + [[package]] name = "zstd-sys" version = "2.0.8+zstd.1.5.5" diff --git a/Cargo.toml b/Cargo.toml index fe6a516e2dada..ef09221b818a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,16 +7,21 @@ members = [ "src/cmd_all", "src/common", "src/common/common_service", + "src/common/heap_profiling", "src/compute", "src/connector", "src/ctl", - "src/expr", + "src/expr/core", + "src/expr/impl", "src/expr/macro", "src/frontend", "src/frontend/planner_test", "src/java_binding", "src/jni_core", "src/meta", + "src/meta/node", + "src/meta/service", + "src/meta/src/model_v2/migration", "src/object_store", "src/prost", "src/prost/helpers", @@ -91,10 +96,10 @@ aws-smithy-http = "0.55" aws-smithy-types = "0.55" 
aws-endpoint = "0.55" aws-types = "0.55" -etcd-client = { package = "madsim-etcd-client", version = "0.3" } +etcd-client = { package = "madsim-etcd-client", version = "0.4" } futures-async-stream = "0.2" hytra = "0.1" -rdkafka = { package = "madsim-rdkafka", git = "https://github.com/madsim-rs/madsim.git", rev = "fedb1e3", features = [ +rdkafka = { package = "madsim-rdkafka", version = "0.3.0", features = [ "cmake-build", ] } hashbrown = { version = "0.14.0", features = [ @@ -103,14 +108,18 @@ hashbrown = { version = "0.14.0", features = [ "nightly", ] } criterion = { version = "0.5", features = ["async_futures"] } -tonic = { package = "madsim-tonic", version = "0.3.1" } -tonic-build = { package = "madsim-tonic-build", version = "0.3.1" } -icelake = { git = "https://github.com/icelake-io/icelake", rev = "166a36b1a40a64086db09a0e0f2ed6791cec548b" } -arrow-array = "46" -arrow-schema = "46" -arrow-buffer = "46" -arrow-flight = "46" -arrow-select = "46" +tonic = { package = "madsim-tonic", version = "0.4.0" } +tonic-build = { package = "madsim-tonic-build", version = "0.4.2" } +prost = { version = "0.12" } +icelake = { git = "https://github.com/icelake-io/icelake", rev = "16dab0e36ab337e58ee8002d828def2d212fa116" } +arrow-array = "47" +arrow-cast = "47" +arrow-schema = "47" +arrow-buffer = "47" +arrow-flight = "47" +arrow-select = "47" +arrow-ord = "47" +tikv-jemalloc-ctl = { git = "https://github.com/risingwavelabs/jemallocator.git", rev = "64a2d9" } tikv-jemallocator = { git = "https://github.com/risingwavelabs/jemallocator.git", features = [ "profiling", "stats", @@ -121,16 +130,20 @@ risingwave_batch = { path = "./src/batch" } risingwave_cmd = { path = "./src/cmd" } risingwave_common = { path = "./src/common" } risingwave_common_service = { path = "./src/common/common_service" } +risingwave_common_heap_profiling = { path = "./src/common/heap_profiling" } risingwave_compactor = { path = "./src/storage/compactor" } risingwave_compute = { path = "./src/compute" } risingwave_ctl = { path = "./src/ctl" } risingwave_connector = { path = "./src/connector" } -risingwave_expr = { path = "./src/expr" } +risingwave_expr = { path = "./src/expr/core" } +risingwave_expr_impl = { path = "./src/expr/impl" } risingwave_frontend = { path = "./src/frontend" } risingwave_hummock_sdk = { path = "./src/storage/hummock_sdk" } risingwave_hummock_test = { path = "./src/storage/hummock_test" } risingwave_hummock_trace = { path = "./src/storage/hummock_trace" } risingwave_meta = { path = "./src/meta" } +risingwave_meta_service = { path = "./src/meta/service" } +risingwave_meta_node = { path = "./src/meta/node" } risingwave_object_store = { path = "./src/object_store" } risingwave_pb = { path = "./src/prost" } risingwave_rpc_client = { path = "./src/rpc_client" } @@ -224,4 +237,4 @@ tokio-stream = { git = "https://github.com/madsim-rs/tokio.git", rev = "fe39bb8e tokio-retry = { git = "https://github.com/madsim-rs/rust-tokio-retry.git", rev = "95e2fd3" } tokio-postgres = { git = "https://github.com/madsim-rs/rust-postgres.git", rev = "ac00d88" } # patch: unlimit 4MB message size for grpc client -etcd-client = { git = "https://github.com/risingwavelabs/etcd-client.git", rev = "d55550a" } +etcd-client = { git = "https://github.com/risingwavelabs/etcd-client.git", rev = "4e84d40" } diff --git a/Makefile.toml b/Makefile.toml index 3f10deb424465..86e9cea136bb5 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -24,6 +24,7 @@ env_scripts = [ #!@duckscript # only duckscript can modify env variables in cargo-make +# duckscript 
doc: https://github.com/sagiegurari/duckscript/blob/master/docs/sdk.md set_env ENABLE_TELEMETRY "false" @@ -70,6 +71,26 @@ else set_env BUILD_HUMMOCK_TRACE_CMD "" end +is_ci = get_env RISINGWAVE_CI +is_not_ci = not ${is_ci} + +if ${is_not_ci} + query_log_path = get_env RW_QUERY_LOG_PATH + no_query_log_path = not ${query_log_path} + + if ${no_query_log_path} + set_env RW_QUERY_LOG_PATH "${PREFIX_LOG}" + fi + + rust_log = get_env RUST_LOG + no_rust_log = not ${rust_log} + + if ${no_rust_log} + set_env RUST_LOG "pgwire_query_log=info" + else + set_env RUST_LOG "pgwire_query_log=info,${rust_log}" + end +end ''', ] @@ -385,6 +406,7 @@ condition = { env_set = [ "ENABLE_BUILD_DASHBOARD", ], files_modified = { input = [ "./dashboard/**/*.js", + "./dashboard/**/*.ts*", "./dashboard/package.json", "./dashboard/next.config.js", ], output = [ @@ -440,6 +462,12 @@ cargo build -p risingwave_cmd_all \ [tasks.clean] private = true category = "RiseDev - Build" +description = "Clean all build targets" +dependencies = ["clean-rust", "clean-java"] + +[tasks.clean-rust] +private = true +category = "RiseDev - Build" description = "Clean Rust targets" condition = { env_set = ["ENABLE_BUILD_RUST"] } script = ''' @@ -448,6 +476,25 @@ set -e cargo clean ''' +[tasks.clean-java] +private = true +category = "RiseDev - Build" +description = "Clean Java targets" +condition = { env_set = ["ENABLE_RW_CONNECTOR", "ENABLE_BUILD_RW_CONNECTOR"] } +script = ''' +#!/usr/bin/env bash +set -e + +if command -v mvn &> /dev/null; then + MAVEN_PATH="$(command -v mvn)" +else + MAVEN_PATH="${PREFIX_BIN}/maven/bin/mvn" +fi + +cd "${JAVA_DIR}" +"${MAVEN_PATH}" clean +''' + [tasks.build-docs] private = true category = "RiseDev - Build" @@ -855,7 +902,7 @@ TARGET_PATH="${JAVA_DIR}/connector-node/assembly/target/${ARTIFACT}" echo "Building connector node..." cd "${JAVA_DIR}" -"${MAVEN_PATH}" --batch-mode --update-snapshots package -Dmaven.test.skip +"${MAVEN_PATH}" --batch-mode --update-snapshots package -Dmaven.test.skip -Dno-build-rust rm -rf ${PREFIX_BIN}/connector-node mkdir -p "${PREFIX_BIN}/connector-node" @@ -1270,7 +1317,6 @@ dependencies = ["k", "l", "check-logs", "wait-processes-exit"] description = "Kill cluster, dump logs and check logs" [tasks.wait-processes-exit] -private = true category = "Misc" description = "Wait for RisingWave processes to exit" script = """ @@ -1292,9 +1338,12 @@ echo "All processes has exited." [tasks.slt] category = "RiseDev - SQLLogicTest" -install_crate = { version = "0.17.0", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [ +install_crate = { version = "0.17.1", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [ "--help", ], install_command = "binstall" } +dependencies = ["check-risedev-env-file"] +env_files = ["${PREFIX_CONFIG}/risedev-env"] +env = { SLT_HOST = "${RW_FRONTEND_LISTEN_ADDRESS}", SLT_PORT = "${RW_FRONTEND_PORT}", SLT_DB = "dev" } command = "sqllogictest" args = ["${@}"] description = "🌟 Run SQLLogicTest" @@ -1397,3 +1446,15 @@ cargo run -p risingwave_common --bin example-config >> src/config/example.toml category = "RiseDev - Backwards Compatibility Test" description = "Run backwards compatibility test" script = "./backwards-compat-tests/scripts/run_local.sh" + +# For debugging. +# To show the env for a specific task, use `run_task = "show-env"` for that task.
+[tasks.show-env] +private = true +description = "Show cargo-make runtime environment variables" +script = """ +#!@duckscript +# https://github.com/sagiegurari/cargo-make/issues/889 +vars = dump_variables +echo ${vars} +""" diff --git a/README.md b/README.md index d63369b0200a6..c1878a2717159 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@

-[![Slack](https://badgen.net/badge/Slack/Join%20RisingWave/0abd59?icon=slack)](https://join.slack.com/t/risingwave-community/shared_invite/zt-120rft0mr-d8uGk3d~NZiZAQWPnElOfw) +[![Slack](https://badgen.net/badge/Slack/Join%20RisingWave/0abd59?icon=slack)](https://risingwave.com/slack) [![Build status](https://badge.buildkite.com/9394d2bca0f87e2e97aa78b25f765c92d4207c0b65e7f6648f.svg)](https://buildkite.com/risingwavelabs/main) [![codecov](https://codecov.io/gh/risingwavelabs/risingwave/branch/main/graph/badge.svg?token=EB44K9K38B)](https://codecov.io/gh/risingwavelabs/risingwave) @@ -45,7 +45,7 @@ To learn about how to use RisingWave, refer to [RisingWave User Documentation](h ## Community -Looking for help, discussions, collaboration opportunities, or a casual afternoon chat with our fellow engineers and community members? Join our [Slack workspace](https://join.slack.com/t/risingwave-community/shared_invite/zt-120rft0mr-d8uGk3d~NZiZAQWPnElOfw)! +Looking for help, discussions, collaboration opportunities, or a casual afternoon chat with our fellow engineers and community members? Join our [Slack workspace](https://risingwave.com/slack)! ## License diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index ddef3984d3bd4..6fe7cfbfdeca2 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -87,6 +87,7 @@ services: - db - elasticsearch - clickhouse-server + - pulsar volumes: - ..:/risingwave @@ -182,3 +183,19 @@ services: KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_INTERNAL:PLAINTEXT KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9093,PLAINTEXT_INTERNAL://localhost:29093 KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + + pulsar: + container_name: pulsar + image: apachepulsar/pulsar:latest + command: bin/pulsar standalone + ports: + - "6650:6650" + - "6651:8080" + expose: + - "8080" + - "6650" + healthcheck: + test: [ "CMD-SHELL", "bin/pulsar-admin brokers healthcheck"] + interval: 5s + timeout: 5s + retries: 5 diff --git a/ci/scripts/backfill-test.sh b/ci/scripts/backfill-test.sh index 039e8bee94865..c0b95da958fed 100755 --- a/ci/scripts/backfill-test.sh +++ b/ci/scripts/backfill-test.sh @@ -30,13 +30,6 @@ git config --global --add safe.directory /risingwave download_and_prepare_rw "$profile" common -echo "--- e2e, ci-backfill, build" -cargo make ci-start ci-backfill - ################ TESTS -echo "--- e2e, ci-backfill, run backfill test" ./ci/scripts/run-backfill-tests.sh - -echo "--- Kill cluster" -cargo make kill diff --git a/ci/scripts/check.sh b/ci/scripts/check.sh index fe0b79dcf30ef..2d194c40e2e99 100755 --- a/ci/scripts/check.sh +++ b/ci/scripts/check.sh @@ -42,7 +42,7 @@ sccache --show-stats sccache --zero-stats echo "--- Run doctest" -cargo test --doc +RUSTDOCFLAGS="-Clink-arg=-fuse-ld=lld" cargo test --doc echo "--- Show sccache stats" sccache --show-stats diff --git a/ci/scripts/cron-fuzz-test.sh b/ci/scripts/cron-fuzz-test.sh index f12e3063a5a3b..c58f074decdf1 100755 --- a/ci/scripts/cron-fuzz-test.sh +++ b/ci/scripts/cron-fuzz-test.sh @@ -4,8 +4,13 @@ set -euo pipefail source ci/scripts/common.sh -export RUN_SQLSMITH=0 -export RUN_SQLSMITH_FRONTEND=1 + +# NOTE(kwannoel): Disabled because there's some breakage after #12485, +# see https://github.com/risingwavelabs/risingwave/issues/12577. +# Frontend is relatively stable, e2e fuzz test will cover the same cases also, +# so we can just disable it. 
+export RUN_SQLSMITH_FRONTEND=0 +export RUN_SQLSMITH=1 export SQLSMITH_COUNT=1000 export TEST_NUM=100 source ci/scripts/run-fuzz-test.sh diff --git a/ci/scripts/deterministic-recovery-test.sh b/ci/scripts/deterministic-recovery-test.sh index 0d3a7b3fabed4..6514fe1f7c0c3 100755 --- a/ci/scripts/deterministic-recovery-test.sh +++ b/ci/scripts/deterministic-recovery-test.sh @@ -9,11 +9,19 @@ echo "--- Download artifacts" download-and-decompress-artifact risingwave_simulation . chmod +x ./risingwave_simulation -export RUST_LOG="info,risingwave_meta::barrier::recovery=debug" +export RUST_LOG="info,\ +risingwave_meta::barrier::recovery=debug,\ +risingwave_meta::rpc::ddl_controller=debug,\ +risingwave_meta::barrier::mod=debug,\ +risingwave_simulation=debug" export LOGDIR=.risingwave/log mkdir -p $LOGDIR +# FIXME(kwannoel): Why is this failing? +# echo "--- deterministic simulation e2e, ci-3cn-2fe-3meta, recovery, background_ddl" +# seq $TEST_NUM | parallel MADSIM_TEST_SEED={} './risingwave_simulation --kill --kill-rate=${KILL_RATE} ./e2e_test/background_ddl/sim/basic.slt 2> $LOGDIR/recovery-ddl-{}.log && rm $LOGDIR/recovery-ddl-{}.log' + echo "--- deterministic simulation e2e, ci-3cn-2fe-3meta, recovery, ddl" seq $TEST_NUM | parallel MADSIM_TEST_SEED={} './risingwave_simulation --kill --kill-rate=${KILL_RATE} ./e2e_test/ddl/\*\*/\*.slt 2> $LOGDIR/recovery-ddl-{}.log && rm $LOGDIR/recovery-ddl-{}.log' diff --git a/ci/scripts/e2e-iceberg-sink-test.sh b/ci/scripts/e2e-iceberg-sink-test.sh index 41f1ee2a80f26..1a12225ab5435 100755 --- a/ci/scripts/e2e-iceberg-sink-test.sh +++ b/ci/scripts/e2e-iceberg-sink-test.sh @@ -5,6 +5,9 @@ set -euo pipefail source ci/scripts/common.sh +# prepare environment +export CONNECTOR_LIBS_PATH="./connector-node/libs" + while getopts 'p:' opt; do case ${opt} in p ) diff --git a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index 83c0d187d6b3b..0e8054a4946af 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -38,7 +38,10 @@ bash ./start_spark_connect_server.sh # Don't remove the `--quiet` option since poetry has a bug when printing output, see # https://github.com/python-poetry/poetry/issues/3412 "$HOME"/.local/bin/poetry update --quiet -"$HOME"/.local/bin/poetry run python main.py +"$HOME"/.local/bin/poetry run python main.py -t ./test_case/no_partition_append_only.toml +"$HOME"/.local/bin/poetry run python main.py -t ./test_case/no_partition_upsert.toml +"$HOME"/.local/bin/poetry run python main.py -t ./test_case/partition_append_only.toml +"$HOME"/.local/bin/poetry run python main.py -t ./test_case/partition_upsert.toml echo "--- Kill cluster" diff --git a/ci/scripts/e2e-kafka-sink-test.sh b/ci/scripts/e2e-kafka-sink-test.sh index 85aad20749d61..06ef185f46e8b 100755 --- a/ci/scripts/e2e-kafka-sink-test.sh +++ b/ci/scripts/e2e-kafka-sink-test.sh @@ -5,6 +5,7 @@ set -euo pipefail ./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only --create > /dev/null 2>&1 ./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --create > /dev/null 2>&1 +./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --create > /dev/null 2>&1 ./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --create > /dev/null 2>&1 sqllogictest -p 4566 -d dev 'e2e_test/sink/kafka/create_sink.slt' @@ -28,6 
+29,15 @@ if [ $? -ne 0 ]; then exit 1 fi +# test upsert kafka sink with schema +echo "testing upsert kafka sink with schema" +diff ./e2e_test/sink/kafka/upsert_schema1.result \ +<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 10 | sort) 2> /dev/null) +if [ $? -ne 0 ]; then + echo "The output for upsert sink with schema is not as expected." + exit 1 +fi + # test debezium kafka sink echo "testing debezium kafka sink" (./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 10 | sort) > ./e2e_test/sink/kafka/debezium1.tmp.result 2> /dev/null @@ -62,6 +72,15 @@ if [ $? -ne 0 ]; then exit 1 fi +# test upsert kafka sink with schema after update +echo "testing upsert kafka sink with schema after updating data" +diff ./e2e_test/sink/kafka/upsert_schema2.result \ +<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 11 | sort) 2> /dev/null) +if [ $? -ne 0 ]; then + echo "The output for upsert sink with schema is not as expected." + exit 1 +fi + # test debezium kafka sink after update echo "testing debezium kafka sink after updating data" (./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 11 | sort) > ./e2e_test/sink/kafka/debezium2.tmp.result 2> /dev/null @@ -87,6 +106,15 @@ if [ $? -ne 0 ]; then exit 1 fi +# test upsert kafka sink with schema after delete +echo "testing upsert kafka sink with schema after deleting data" +diff ./e2e_test/sink/kafka/upsert_schema3.result \ +<((./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert-schema --from-beginning --property print.key=true --max-messages 12 | sort) 2> /dev/null) +if [ $? -ne 0 ]; then + echo "The output for upsert sink with schema is not as expected." 
+ exit 1 +fi + # test debezium kafka sink after delete echo "testing debezium kafka sink after deleting data" (./.risingwave/bin/kafka/bin/kafka-console-consumer.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --property print.key=true --from-beginning --max-messages 13 | sort) > ./e2e_test/sink/kafka/debezium3.tmp.result 2> /dev/null @@ -103,3 +131,10 @@ sqllogictest -p 4566 -d dev 'e2e_test/sink/kafka/drop_sink.slt' ./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only --delete > /dev/null 2>&1 ./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-upsert --delete > /dev/null 2>&1 ./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-debezium --delete > /dev/null 2>&1 + +# test different encoding +echo "testing protobuf" +cp src/connector/src/test_data/proto_recursive/recursive.pb ./proto-recursive +./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only-protobuf --create > /dev/null 2>&1 +sqllogictest -p 4566 -d dev 'e2e_test/sink/kafka/protobuf.slt' +./.risingwave/bin/kafka/bin/kafka-topics.sh --bootstrap-server 127.0.0.1:29092 --topic test-rw-sink-append-only-protobuf --delete > /dev/null 2>&1 diff --git a/ci/scripts/e2e-pulsar-sink-test.sh b/ci/scripts/e2e-pulsar-sink-test.sh new file mode 100755 index 0000000000000..a2a0edb550f33 --- /dev/null +++ b/ci/scripts/e2e-pulsar-sink-test.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +source ci/scripts/common.sh + +while getopts 'p:' opt; do + case ${opt} in + p ) + profile=$OPTARG + ;; + \? ) + echo "Invalid Option: -$OPTARG" 1>&2 + exit 1 + ;; + : ) + echo "Invalid option: $OPTARG requires an argument" 1>&2 + ;; + esac +done +shift $((OPTIND -1)) + +download_and_prepare_rw "$profile" source + +echo "--- starting risingwave cluster" +cargo make ci-start ci-pulsar-test +sleep 1 + +echo "--- waiting until pulsar is healthy" +HTTP_CODE=404 +MAX_RETRY=20 +while [[ $HTTP_CODE -ne 200 && MAX_RETRY -gt 0 ]] +do + HTTP_CODE=$(curl --connect-timeout 2 -s -o /dev/null -w ''%{http_code}'' http://pulsar:8080/admin/v2/clusters) + ((MAX_RETRY--)) + sleep 5 +done + +# Exits as soon as any line fails. +set -euo pipefail + +echo "--- testing pulsar sink" +sqllogictest -p 4566 -d dev './e2e_test/sink/pulsar_sink.slt' + +sleep 1 + +echo "--- Kill cluster" +cargo make ci-kill \ No newline at end of file diff --git a/ci/scripts/pr-fuzz-test.sh b/ci/scripts/pr-fuzz-test.sh index bbf8471864e9e..66923c4fb8a71 100755 --- a/ci/scripts/pr-fuzz-test.sh +++ b/ci/scripts/pr-fuzz-test.sh @@ -5,35 +5,15 @@ set -euo pipefail source ci/scripts/common.sh -set +e -# Set features, depending on our workflow -# If sqlsmith files are modified, we run tests with sqlsmith enabled. -MATCHES="ci/scripts/cron-fuzz-test.sh\ -\|ci/scripts/pr-fuzz-test.sh\ -\|ci/scripts/run-fuzz-test.sh\ -\|src/tests/sqlsmith" -NOT_MATCHES="\.md" -CHANGED=$(git diff --name-only origin/main | grep -v "$NOT_MATCHES" | grep "$MATCHES") -set -e -# Always run sqlsmith frontend tests -export RUN_SQLSMITH_FRONTEND=1 +# NOTE(kwannoel): Disabled because there's some breakage after #12485, +# see https://github.com/risingwavelabs/risingwave/issues/12577. +# Frontend is relatively stable, e2e fuzz test will cover the same cases also, +# so we can just disable it. 
+export RUN_SQLSMITH_FRONTEND=0 export RUN_SQLSMITH=1 export SQLSMITH_COUNT=100 - -# Run e2e tests if changes to sqlsmith source files detected. -# NOTE(kwannoel): Keep this here in-case we ever want to revert. -#if [[ -n "$CHANGED" ]]; then -# echo "--- Checking whether to run all sqlsmith tests" -# echo "origin/main SHA: $(git rev-parse origin/main)" -# echo "Changes to Sqlsmith source files detected:" -# echo "$CHANGED" -# export RUN_SQLSMITH=1 -# export SQLSMITH_COUNT=100 -# export TEST_NUM=32 -# echo "Enabled Sqlsmith tests." -#else -# export RUN_SQLSMITH=0 -#fi +export TEST_NUM=32 +echo "Enabled Sqlsmith tests." source ci/scripts/run-fuzz-test.sh diff --git a/ci/scripts/release.sh b/ci/scripts/release.sh index 9852d48e0ba50..08e5794f173cd 100755 --- a/ci/scripts/release.sh +++ b/ci/scripts/release.sh @@ -11,7 +11,7 @@ if [ "${BUILDKITE_SOURCE}" != "schedule" ] && [ "${BUILDKITE_SOURCE}" != "webhoo fi echo "--- Install java and maven" -yum install -y java-11-openjdk wget python3 cyrus-sasl-devel +yum install -y java-11-openjdk java-11-openjdk-devel wget python3 cyrus-sasl-devel pip3 install toml-cli wget https://ci-deps-dist.s3.amazonaws.com/apache-maven-3.9.3-bin.tar.gz && tar -zxvf apache-maven-3.9.3-bin.tar.gz export PATH="${REPO_ROOT}/apache-maven-3.9.3/bin:$PATH" @@ -64,6 +64,10 @@ elif [[ -n "${BINARY_NAME+x}" ]]; then aws s3 cp risingwave-${BINARY_NAME}-x86_64-unknown-linux.tar.gz s3://risingwave-nightly-pre-built-binary fi +echo "--- Build connector node" +cd ${REPO_ROOT}/java && mvn -B package -Dmaven.test.skip=true -Dno-build-rust +cd ${REPO_ROOT} && mv ${REPO_ROOT}/java/connector-node/assembly/target/risingwave-connector-1.0.0.tar.gz risingwave-connector-"${BUILDKITE_TAG}".tar.gz + if [[ -n "${BUILDKITE_TAG}" ]]; then echo "--- Install gh cli" yum install -y dnf @@ -87,8 +91,6 @@ if [[ -n "${BUILDKITE_TAG}" ]]; then gh release upload "${BUILDKITE_TAG}" risectl-"${BUILDKITE_TAG}"-x86_64-unknown-linux.tar.gz echo "--- Release build and upload risingwave connector node jar asset" - cd ${REPO_ROOT}/java && mvn -B package -Dmaven.test.skip=true -Djava.binding.release=true - cd connector-node/assembly/target && mv risingwave-connector-1.0.0.tar.gz risingwave-connector-"${BUILDKITE_TAG}".tar.gz gh release upload "${BUILDKITE_TAG}" risingwave-connector-"${BUILDKITE_TAG}".tar.gz fi diff --git a/ci/scripts/run-backfill-tests.sh b/ci/scripts/run-backfill-tests.sh index 6c02442a06255..d0d5eafb3c917 100755 --- a/ci/scripts/run-backfill-tests.sh +++ b/ci/scripts/run-backfill-tests.sh @@ -1,23 +1,31 @@ #!/usr/bin/env bash # Runs backfill tests. -# NOTE(kwannoel): -# The following scenario is adapted in madsim's integration tests as well. -# But this script reproduces it more reliably (I'm not sure why.) -# Hence keeping it in case we ever need to debug backfill again. # USAGE: -# Start a rw cluster then run this script. 
# ```sh -# ./risedev d +# cargo make ci-start ci-backfill # ./ci/scripts/run-backfill-tests.sh # ``` +# Example progress: +# dev=> select * from rw_catalog.rw_ddl_progress; +# ddl_id | ddl_statement | progress | initialized_at +#--------+------------------------------------------------+----------+------------------------------- +# 1002 | CREATE MATERIALIZED VIEW m1 AS SELECT * FROM t | 56.12% | 2023-09-27 06:37:06.636+00:00 +#(1 row) set -euo pipefail PARENT_PATH=$(dirname "${BASH_SOURCE[0]}") +TEST_DIR=$PWD/e2e_test +BACKGROUND_DDL_DIR=$TEST_DIR/background_ddl +COMMON_DIR=$BACKGROUND_DDL_DIR/common + +CLUSTER_PROFILE='ci-1cn-1fe-with-recovery' +export RUST_LOG="risingwave_meta=debug" + run_sql_file() { psql -h localhost -p 4566 -d dev -U root -f "$@" } @@ -30,15 +38,405 @@ flush() { run_sql "FLUSH;" } -run_sql_file "$PARENT_PATH"/sql/backfill/create_base_table.sql +cancel_stream_jobs() { + ID=$(run_sql "select ddl_id from rw_catalog.rw_ddl_progress;" | tail -3 | head -1 | grep -E -o "[0-9]*") + echo "CANCELLING STREAM_JOB: $ID" + run_sql "CANCEL JOBS $ID;" .risingwave/log/compute-node.log 2>&1 & +} + +# Test snapshot and upstream read. +test_snapshot_and_upstream_read() { + echo "--- e2e, ci-backfill, test_snapshot_and_upstream_read" + cargo make ci-start ci-backfill + + run_sql_file "$PARENT_PATH"/sql/backfill/create_base_table.sql + + # Provide snapshot + run_sql_file "$PARENT_PATH"/sql/backfill/insert.sql + + # Provide updates ... + run_sql_file "$PARENT_PATH"/sql/backfill/insert.sql & + + # ... and concurrently create mv. + run_sql_file "$PARENT_PATH"/sql/backfill/create_mv.sql & + + wait + + run_sql_file "$PARENT_PATH"/sql/backfill/select.sql deletes.log 2>&1 & + + ./risedev psql -c "CREATE MATERIALIZED VIEW m1 as select * from tomb;" + echo "--- Kill cluster" + kill_cluster + wait +} + +test_backfill_restart_cn_recovery() { + echo "--- e2e, $CLUSTER_PROFILE, test_background_restart_cn_recovery" + cargo make ci-start $CLUSTER_PROFILE + + # Test before recovery + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_table.slt" + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_bg_mv.slt" + sleep 1 + OLD_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,2}\.[0-9]{1,2}") + + # Restart 1 CN + restart_cn + + # Give some time to recover. + sleep 3 + + # Test after recovery + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_one_job.slt" + + # Recover the mview progress + sleep 5 + + NEW_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,2}\.[0-9]{1,2}") + + if [[ ${OLD_PROGRESS%.*} -le ${NEW_PROGRESS%.*} ]]; then + echo "OK: $OLD_PROGRESS smaller or equal to $NEW_PROGRESS" + else + echo "FAILED: $OLD_PROGRESS larger than $NEW_PROGRESS" + exit 1 + fi + + # Trigger a bootstrap recovery + pkill compute-node + kill_cluster + rename_logs_with_prefix "before-restart" + sleep 10 + cargo make dev $CLUSTER_PROFILE + + # Recover mview progress + sleep 5 + + OLD_PROGRESS=$NEW_PROGRESS + NEW_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,2}\.[0-9]{1,2}") + + if [[ ${OLD_PROGRESS%.*} -le ${NEW_PROGRESS%.*} ]]; then + echo "OK: $OLD_PROGRESS smaller or equal to $NEW_PROGRESS" + else + echo "FAILED: $OLD_PROGRESS larger than $NEW_PROGRESS" + exit 1 + fi + + sleep 60 + + # Test after backfill finished + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt" + + # After cluster restart(s), backfilled mv should still be present. 
+ restart_cluster + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt" + restart_cluster + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt" + + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_mv.slt" + sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_table.slt" + + kill_cluster +} + +main() { + set -euo pipefail + test_snapshot_and_upstream_read + test_backfill_tombstone + test_background_ddl_recovery + test_background_ddl_cancel + test_foreground_ddl_no_recover + test_foreground_ddl_cancel + test_foreground_index_cancel + test_foreground_sink_cancel + test_backfill_restart_cn_recovery +} -echo "Backfill tests complete" +main diff --git a/ci/scripts/run-e2e-test.sh b/ci/scripts/run-e2e-test.sh index e88704ca7a22f..8fb29ec5bd1a2 100755 --- a/ci/scripts/run-e2e-test.sh +++ b/ci/scripts/run-e2e-test.sh @@ -28,18 +28,25 @@ fi cluster_start() { if [[ $mode == "standalone" ]]; then + mkdir -p "$PREFIX_LOG" + cargo make clean-data + cargo make pre-start-dev start_standalone "$PREFIX_LOG"/standalone.log & - cargo make ci-start standalone-minio-etcd-compactor + cargo make dev standalone-minio-etcd else cargo make ci-start "$mode" fi } cluster_stop() { - if [[ $mode == "standalone" ]]; then + if [[ $mode == "standalone" ]] + then stop_standalone + # Don't check standalone logs, they will exceed the limit. + cargo make kill + else + cargo make ci-kill fi - cargo make ci-kill } download_and_prepare_rw "$profile" common @@ -65,11 +72,7 @@ echo "--- e2e, $mode, batch" RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \ cluster_start sqllogictest -p 4566 -d dev './e2e_test/ddl/**/*.slt' --junit "batch-ddl-${profile}" -if [[ $mode != "standalone" ]]; then - sqllogictest -p 4566 -d dev './e2e_test/background_ddl/**/*.slt' --junit "batch-ddl-${profile}" -else - echo "Skipping background_ddl test for $mode" -fi +sqllogictest -p 4566 -d dev './e2e_test/background_ddl/basic.slt' --junit "batch-ddl-${profile}" sqllogictest -p 4566 -d dev './e2e_test/visibility_mode/*.slt' --junit "batch-${profile}" sqllogictest -p 4566 -d dev './e2e_test/database/prepare.slt' sqllogictest -p 4566 -d test './e2e_test/database/test.slt' @@ -184,6 +187,13 @@ if [[ "$mode" == "standalone" ]]; then run_sql() { psql -h localhost -p 4566 -d dev -U root -c "$@" } + compactor_is_online() { + set +e + grep -q "risingwave_cmd_all::standalone: starting compactor-node thread" "${PREFIX_LOG}/standalone.log" + local EXIT_CODE=$? + set -e + return $EXIT_CODE + } echo "--- e2e, standalone, cluster-persistence-test" cluster_start @@ -214,6 +224,43 @@ if [[ "$mode" == "standalone" ]]; then echo "--- Kill cluster" cluster_stop + wait + + # Test that we can optionally include nodes in standalone mode. + echo "--- e2e, standalone, cluster-opts-test" + + echo "test standalone without compactor" + mkdir -p "$PREFIX_LOG" + cargo make clean-data + cargo make pre-start-dev + start_standalone_without_compactor "$PREFIX_LOG"/standalone.log & + cargo make dev standalone-minio-etcd-compactor + wait_standalone + if compactor_is_online + then + echo "ERROR: Compactor should not be online." + exit 1 + fi + cluster_stop + echo "test standalone without compactor [TEST PASSED]" + + wait + + echo "test standalone with compactor" + mkdir -p "$PREFIX_LOG" + cargo make clean-data + cargo make pre-start-dev + start_standalone "$PREFIX_LOG"/standalone.log & + cargo make dev standalone-minio-etcd + wait_standalone + if ! 
compactor_is_online + then + echo "ERROR: Compactor should be online." + exit 1 + fi + cluster_stop + echo "test standalone with compactor [TEST PASSED]" + # Make sure any remaining background task exits. wait fi diff --git a/ci/scripts/run-micro-benchmarks.sh b/ci/scripts/run-micro-benchmarks.sh index 13648d1661ab2..568c90de425ca 100755 --- a/ci/scripts/run-micro-benchmarks.sh +++ b/ci/scripts/run-micro-benchmarks.sh @@ -10,6 +10,12 @@ set -euo pipefail # Make sure the added benchmark has a unique name. BENCHMARKS="stream_hash_agg json_parser bench_block_iter bench_compactor bench_lru_cache bench_merge_iter" +# Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html +get_instance_type() { + TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` \ + && curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type +} + # cargo criterion --bench stream_hash_agg --message-format=json bench() { BENCHMARK_NAME=$1 @@ -34,6 +40,17 @@ bench() { } main() { + # FIXME(kwannoel): This is a workaround + # Microbenchmarks need to be namespaced by instance types, + # the result upload endpoint needs to be parameterized by instance type as well to support this. + echo "--- Getting aws instance type" + local instance_type=$(get_instance_type) + echo "instance_type: $instance_type" + if [[ $instance_type != "m6i.4xlarge" ]]; then + echo "Only m6i.4xlarge is supported, skipping microbenchmark" + exit 0 + fi + # We need cargo criterion to generate machine-readable benchmark results from # microbench. echo "--- Installing cargo criterion" diff --git a/ci/scripts/s3-source-test.sh b/ci/scripts/s3-source-test.sh index 710ba63b6fd60..9fce76f000e31 100755 --- a/ci/scripts/s3-source-test.sh +++ b/ci/scripts/s3-source-test.sh @@ -30,7 +30,7 @@ cargo make ci-start ci-1cn-1fe echo "--- Run test" python3 -m pip install minio psycopg2-binary -python3 e2e_test/s3/$script.py +python3 e2e_test/s3/$script echo "--- Kill cluster" cargo make ci-kill diff --git a/ci/scripts/sql/backfill/insert.sql b/ci/scripts/sql/backfill/insert.sql index 18ed763429231..f25f8b09cb2b8 100644 --- a/ci/scripts/sql/backfill/insert.sql +++ b/ci/scripts/sql/backfill/insert.sql @@ -2,5 +2,5 @@ insert into t1 SELECT generate_series, '{"orders": {"id": 1, "price": "2.30", "customer_id": 2}}'::jsonb -FROM generate_series(1, 200000); +FROM generate_series(1, 50000); FLUSH; \ No newline at end of file diff --git a/ci/scripts/standalone-utils.sh b/ci/scripts/standalone-utils.sh index 4461331c28bfb..438f413ebe4dc 100755 --- a/ci/scripts/standalone-utils.sh +++ b/ci/scripts/standalone-utils.sh @@ -6,7 +6,9 @@ export RW_PREFIX=$PWD/.risingwave export PREFIX_BIN=$RW_PREFIX/bin export PREFIX_LOG=$RW_PREFIX/log -start_standalone() { +# NOTE(kwannoel): Compared to start_standalone below, we omitted the compactor-opts, +# so it should not start. 
+start_standalone_without_compactor() { RUST_BACKTRACE=1 \ "$PREFIX_BIN"/risingwave/standalone \ --meta-opts=" \ @@ -24,7 +26,6 @@ start_standalone() { --listen-addr 127.0.0.1:5688 \ --prometheus-listener-addr 127.0.0.1:1222 \ --advertise-addr 127.0.0.1:5688 \ - --metrics-level info \ --async-stack-trace verbose \ --connector-rpc-endpoint 127.0.0.1:50051 \ --parallelism 4 \ @@ -36,10 +37,49 @@ start_standalone() { --advertise-addr 127.0.0.1:4566 \ --prometheus-listener-addr 127.0.0.1:2222 \ --health-check-listener-addr 127.0.0.1:6786 \ - --metrics-level info \ --meta-addr http://127.0.0.1:5690" >"$1" 2>&1 } +# You can fill up this section by consulting +# .risingwave/log/risedev.log, after calling ./risedev d full. +# It is expected that minio, etcd will be started after this is called. +start_standalone() { + RUST_BACKTRACE=1 \ + "$PREFIX_BIN"/risingwave/standalone \ + --meta-opts=" \ + --listen-addr 127.0.0.1:5690 \ + --advertise-addr 127.0.0.1:5690 \ + --dashboard-host 127.0.0.1:5691 \ + --prometheus-host 127.0.0.1:1250 \ + --connector-rpc-endpoint 127.0.0.1:50051 \ + --backend etcd \ + --etcd-endpoints 127.0.0.1:2388 \ + --state-store hummock+minio://hummockadmin:hummockadmin@127.0.0.1:9301/hummock001 \ + --data-directory hummock_001 \ + --dashboard-ui-path $RW_PREFIX/ui" \ + --compute-opts=" \ + --listen-addr 127.0.0.1:5688 \ + --prometheus-listener-addr 127.0.0.1:1222 \ + --advertise-addr 127.0.0.1:5688 \ + --async-stack-trace verbose \ + --connector-rpc-endpoint 127.0.0.1:50051 \ + --parallelism 4 \ + --total-memory-bytes 8589934592 \ + --role both \ + --meta-address http://127.0.0.1:5690" \ + --frontend-opts=" \ + --listen-addr 127.0.0.1:4566 \ + --advertise-addr 127.0.0.1:4566 \ + --prometheus-listener-addr 127.0.0.1:2222 \ + --health-check-listener-addr 127.0.0.1:6786 \ + --meta-addr http://127.0.0.1:5690" \ + --compactor-opts=" \ + --listen-addr 127.0.0.1:6660 \ + --prometheus-listener-addr 127.0.0.1:1260 \ + --advertise-addr 127.0.0.1:6660 \ + --meta-address http://127.0.0.1:5690" >"$1" 2>&1 +} + stop_standalone() { pkill standalone } diff --git a/ci/workflows/integration-tests.yml b/ci/workflows/integration-tests.yml index 6c4851c0c669c..4bd0ec1a000b1 100644 --- a/ci/workflows/integration-tests.yml +++ b/ci/workflows/integration-tests.yml @@ -29,7 +29,7 @@ steps: - "postgres-cdc" - "mysql-sink" - "postgres-sink" - - "iceberg-sink" + # - "iceberg-sink" - "debezium-mysql" format: - "json" @@ -75,10 +75,10 @@ steps: testcase: "postgres-sink" format: "protobuf" skip: true - - with: - testcase: "iceberg-sink" - format: "protobuf" - skip: true + # - with: + # testcase: "iceberg-sink" + # format: "protobuf" + # skip: true - with: testcase: "debezium-mysql" format: "protobuf" diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index 20a47373e0f1e..d8e78952c141f 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -99,7 +99,7 @@ steps: config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 10 + timeout_in_minutes: 12 retry: *auto-retry - label: "end-to-end source test (release)" @@ -245,25 +245,6 @@ steps: timeout_in_minutes: 5 retry: *auto-retry - - label: "connector node integration test Java {{matrix.java_version}}" - command: "ci/scripts/connector-node-integration-test.sh -p ci-release -v {{matrix.java_version}}" - depends_on: - - "build" - - "build-other" - plugins: - - docker-compose#v4.9.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - 
./ci/plugins/upload-failure-logs - matrix: - setup: - java_version: - - "11" - - "17" - timeout_in_minutes: 10 - retry: *auto-retry - - label: "end-to-end iceberg sink test (release)" command: "ci/scripts/e2e-iceberg-sink-test.sh -p ci-release" depends_on: @@ -289,7 +270,7 @@ steps: config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 5 + timeout_in_minutes: 10 retry: *auto-retry - label: "e2e java-binding test (release)" @@ -309,7 +290,7 @@ steps: retry: *auto-retry - label: "S3 source check on AWS (json parser)" - command: "ci/scripts/s3-source-test.sh -p ci-release -s run" + command: "ci/scripts/s3-source-test.sh -p ci-release -s run.py" depends_on: build plugins: - seek-oss/aws-sm#v2.3.1: @@ -326,7 +307,7 @@ steps: retry: *auto-retry - label: "S3 source check on AWS (json parser)" - command: "ci/scripts/s3-source-test.sh -p ci-release -s json_file" + command: "ci/scripts/s3-source-test.sh -p ci-release -s json_file.py" depends_on: build plugins: - seek-oss/aws-sm#v2.3.1: @@ -343,7 +324,41 @@ steps: retry: *auto-retry - label: "S3 source check on AWS (csv parser)" - command: "ci/scripts/s3-source-test.sh -p ci-release -s run_csv" + command: "ci/scripts/s3-source-test.sh -p ci-release -s run_csv.py" + depends_on: build + plugins: + - seek-oss/aws-sm#v2.3.1: + env: + S3_SOURCE_TEST_CONF: ci_s3_source_test_aws + - docker-compose#v4.9.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + environment: + - S3_SOURCE_TEST_CONF + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 25 + retry: *auto-retry + + - label: "S3_v2 source check on AWS (json parser)" + command: "ci/scripts/s3-source-test.sh -p ci-release -s 'fs_source_v2.py json'" + depends_on: build + plugins: + - seek-oss/aws-sm#v2.3.1: + env: + S3_SOURCE_TEST_CONF: ci_s3_source_test_aws + - docker-compose#v4.9.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + environment: + - S3_SOURCE_TEST_CONF + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 25 + retry: *auto-retry + + - label: "S3_v2 source check on AWS (csv parser)" + command: "ci/scripts/s3-source-test.sh -p ci-release -s 'fs_source_v2.py csv_without_header'" depends_on: build plugins: - seek-oss/aws-sm#v2.3.1: @@ -463,7 +478,7 @@ steps: config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 10 + timeout_in_minutes: 20 retry: *auto-retry - label: "e2e standalone binary test" @@ -478,7 +493,7 @@ steps: config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 21 + timeout_in_minutes: 25 retry: *auto-retry - label: "end-to-end test for opendal (parallel)" @@ -495,46 +510,6 @@ steps: timeout_in_minutes: 14 retry: *auto-retry - - label: "end-to-end test (parallel, in-memory)" - command: "ci/scripts/e2e-test-parallel-in-memory.sh -p ci-release" - depends_on: "build" - plugins: - - docker-compose#v4.9.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 12 - retry: *auto-retry - - - label: "end-to-end iceberg sink test" - command: "ci/scripts/e2e-iceberg-sink-test.sh -p ci-release" - depends_on: - - "build" - - "build-other" - plugins: - - docker-compose#v4.9.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 5 - retry: *auto-retry - - - label: 
"end-to-end iceberg sink v2 test" - command: "ci/scripts/e2e-iceberg-sink-v2-test.sh -p ci-release" - depends_on: - - "build" - - "build-other" - plugins: - - docker-compose#v4.9.0: - run: sink-test-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 10 - retry: *auto-retry - - label: "end-to-end clickhouse sink test" command: "ci/scripts/e2e-clickhouse-sink-test.sh -p ci-release" depends_on: @@ -549,14 +524,14 @@ steps: timeout_in_minutes: 10 retry: *auto-retry - - label: "e2e java-binding test" - command: "ci/scripts/java-binding-test.sh -p ci-release" + - label: "end-to-end pulsar sink test" + command: "ci/scripts/e2e-pulsar-sink-test.sh -p ci-release" depends_on: - "build" - "build-other" plugins: - docker-compose#v4.9.0: - run: rw-build-env + run: sink-test-env config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs diff --git a/ci/workflows/pull-request.yml b/ci/workflows/pull-request.yml index c3b0bd2728556..985bd0be4b822 100644 --- a/ci/workflows/pull-request.yml +++ b/ci/workflows/pull-request.yml @@ -19,6 +19,7 @@ steps: - label: "build" command: "ci/scripts/build.sh -p ci-dev" key: "build" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-build") plugins: - docker-compose#v4.9.0: run: rw-build-env @@ -30,6 +31,7 @@ steps: - label: "build other components" command: "ci/scripts/build-other.sh" key: "build-other" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-build-other") plugins: - seek-oss/aws-sm#v2.3.1: env: @@ -40,12 +42,13 @@ steps: mount-buildkite-agent: true environment: - GITHUB_TOKEN - timeout_in_minutes: 12 + timeout_in_minutes: 14 retry: *auto-retry - label: "build (deterministic simulation)" command: "ci/scripts/build-simulation.sh" key: "build-simulation" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-build-simulation") plugins: - docker-compose#v4.9.0: run: rw-build-env @@ -57,6 +60,7 @@ steps: - label: "docslt" command: "ci/scripts/docslt.sh" key: "docslt" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-docslt") plugins: - docker-compose#v4.9.0: run: rw-build-env @@ -67,6 +71,7 @@ steps: - label: "end-to-end test" command: "ci/scripts/e2e-test.sh -p ci-dev -m ci-3streaming-2serving-3fe" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-e2e-test") depends_on: - "build" - "build-other" @@ -82,6 +87,7 @@ steps: - label: "end-to-end test (parallel)" command: "ci/scripts/e2e-test-parallel.sh -p ci-dev" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-e2e-parallel-tests") depends_on: - "build" - "docslt" @@ -124,6 +130,7 @@ steps: - label: "end-to-end source test" command: "ci/scripts/e2e-source-test.sh -p ci-dev" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-e2e-source-tests") depends_on: - "build" - "build-other" @@ -138,6 +145,7 @@ steps: - label: "end-to-end sink test" command: "ci/scripts/e2e-sink-test.sh -p ci-dev" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-e2e-sink-tests") depends_on: - "build" - "build-other" @@ -201,6 +209,21 @@ steps: timeout_in_minutes: 10 retry: *auto-retry + - label: "end-to-end pulsar sink test" + if: 
build.pull_request.labels includes "ci/run-e2e-pulsar-sink-tests" + command: "ci/scripts/e2e-pulsar-sink-test.sh -p ci-dev" + depends_on: + - "build" + - "build-other" + plugins: + - docker-compose#v4.9.0: + run: sink-test-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 10 + retry: *auto-retry + - label: "end-to-end clickhouse sink test" if: build.pull_request.labels includes "ci/run-e2e-clickhouse-sink-tests" command: "ci/scripts/e2e-clickhouse-sink-test.sh -p ci-dev" @@ -233,6 +256,7 @@ steps: - label: "regress test" command: "ci/scripts/regress-test.sh -p ci-dev" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-regress-test") depends_on: "build" plugins: - docker-compose#v4.9.0: @@ -248,6 +272,7 @@ steps: # This ensures our `main-cron` workflow will be stable. - label: "unit test" command: "ci/scripts/pr-unit-test.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-unit-test") plugins: - ./ci/plugins/swapfile - seek-oss/aws-sm#v2.3.1: @@ -258,11 +283,12 @@ steps: config: ci/docker-compose.yml environment: - CODECOV_TOKEN - timeout_in_minutes: 18 + timeout_in_minutes: 20 retry: *auto-retry - label: "check" command: "ci/scripts/check.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-check") plugins: - gencer/cache#v2.4.10: id: cache @@ -284,6 +310,7 @@ steps: - label: "unit test (deterministic simulation)" command: "ci/scripts/deterministic-unit-test.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-unit-test-deterministic-simulation") plugins: - docker-compose#v4.9.0: run: rw-build-env @@ -295,6 +322,7 @@ steps: - label: "integration test (deterministic simulation)" command: "TEST_NUM=5 ci/scripts/deterministic-it-test.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-integration-test-deterministic-simulation") depends_on: "build-simulation" plugins: - docker-compose#v4.9.0: @@ -306,6 +334,7 @@ steps: - label: "end-to-end test (deterministic simulation)" command: "TEST_NUM=16 ci/scripts/deterministic-e2e-test.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-e2e-test-deterministic-simulation") depends_on: "build-simulation" plugins: - seek-oss/aws-sm#v2.3.1: @@ -324,6 +353,7 @@ steps: - label: "recovery test (deterministic simulation)" command: "TEST_NUM=8 KILL_RATE=0.5 ci/scripts/deterministic-recovery-test.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-recovery-test-deterministic-simulation") depends_on: "build-simulation" plugins: # - seek-oss/aws-sm#v2.3.1: @@ -343,6 +373,7 @@ steps: - label: "misc check" command: "ci/scripts/misc-check.sh" + if: (!build.pull_request.labels includes "ci/skip-ci" || build.pull_request.labels includes "ci/run-misc-check") plugins: - docker-compose#v4.9.0: run: rw-build-env @@ -455,11 +486,11 @@ steps: - "build" plugins: - docker-compose#v4.9.0: - run: ci-flamegraph-env + run: rw-build-env config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 10 + timeout_in_minutes: 30 - label: "e2e standalone binary test" command: "ci/scripts/e2e-test.sh -p ci-dev -m standalone" @@ -474,5 +505,59 @@ steps: config: ci/docker-compose.yml 
mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 25 + timeout_in_minutes: 30 + retry: *auto-retry + + # FIXME(kwannoel): Let the github PR labeller label it, if sqlsmith source files has changes. + - label: "fuzz test" + command: "ci/scripts/pr-fuzz-test.sh -p ci-dev" + if: build.pull_request.labels includes "ci/run-sqlsmith-fuzzing-tests" + depends_on: + - "build" + - "build-simulation" + plugins: + - ./ci/plugins/swapfile + - docker-compose#v4.9.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 15 + retry: *auto-retry + + - label: "enable ci/skip-ci only in draft PRs" + if: build.pull_request.labels includes "ci/skip-ci" && !build.pull_request.draft + commands: + - echo "ci/skip-ci is only usable for draft Pull Requests" + - exit 1 + + - label: "micro benchmark" + command: "ci/scripts/run-micro-benchmarks.sh" + key: "run-micro-benchmarks" + if: build.pull_request.labels includes "ci/run-micro-benchmarks" + plugins: + - docker-compose#v4.9.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + timeout_in_minutes: 60 retry: *auto-retry + + - label: "upload micro-benchmark" + if: build.pull_request.labels includes "ci/run-upload-micro-benchmark" + command: + - "BUILDKITE_BUILD_NUMBER=$BUILDKITE_BUILD_NUMBER ci/scripts/upload-micro-bench-results.sh" + depends_on: "run-micro-benchmarks" + plugins: + - seek-oss/aws-sm#v2.3.1: + env: + BUILDKITE_TOKEN: buildkite_token + GITHUB_TOKEN: github-token + - docker-compose#v4.9.0: + run: rw-build-env + config: ci/docker-compose.yml + mount-buildkite-agent: true + environment: + - BUILDKITE_TOKEN + - GITHUB_TOKEN + timeout_in_minutes: 5 diff --git a/ci/workflows/sqlsmith-tests.yml b/ci/workflows/sqlsmith-tests.yml deleted file mode 100644 index 201b3dd8bd20c..0000000000000 --- a/ci/workflows/sqlsmith-tests.yml +++ /dev/null @@ -1,43 +0,0 @@ -auto-retry: &auto-retry - automatic: - # Agent terminated because the AWS EC2 spot instance killed by AWS. 
- - signal_reason: agent_stop - limit: 3 - -steps: - - label: "check ci image rebuild" - plugins: - - chronotc/monorepo-diff#v2.3.0: - diff: "git diff --name-only origin/main" - watch: - - path: "ci/build-ci-image.sh" - config: - command: "ci/build-ci-image.sh" - label: "ci build images" - retry: *auto-retry - - wait - - - label: "build" - command: "ci/scripts/build.sh -p ci-dev" - key: "build" - plugins: - - docker-compose#v4.9.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - timeout_in_minutes: 15 - retry: *auto-retry - - - label: "fuzz test" - command: "ci/scripts/pr-fuzz-test.sh -p ci-dev" - depends_on: - - "build" - plugins: - - ./ci/plugins/swapfile - - docker-compose#v4.9.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 25 - retry: *auto-retry diff --git a/dashboard/components/BackPressureTable.tsx b/dashboard/components/BackPressureTable.tsx index 4b5e987a1b924..84ba13c3b849a 100644 --- a/dashboard/components/BackPressureTable.tsx +++ b/dashboard/components/BackPressureTable.tsx @@ -38,9 +38,9 @@ interface BackPressuresMetrics { } export default function BackPressureTable({ - selectedActorIds, + selectedFragmentIds, }: { - selectedActorIds: Set + selectedFragmentIds: Set }) { const [backPressuresMetrics, setBackPressuresMetrics] = useState() @@ -53,7 +53,7 @@ export default function BackPressureTable({ let metrics: BackPressuresMetrics = await getActorBackPressures() metrics.outputBufferBlockingDuration = sortBy( metrics.outputBufferBlockingDuration, - (m) => m.metric.actor_id + (m) => (m.metric.fragment_id, m.metric.downstream_fragment_id) ) setBackPressuresMetrics(metrics) await new Promise((resolve) => setTimeout(resolve, 5000)) // refresh every 5 secs @@ -74,25 +74,27 @@ export default function BackPressureTable({ return () => {} }, [toast]) - const isSelected = (actorId: string) => selectedActorIds.has(actorId) + const isSelected = (fragmentId: string) => selectedFragmentIds.has(fragmentId) const retVal = ( Back Pressures (Last 30 minutes) - - - + + + + {backPressuresMetrics && backPressuresMetrics.outputBufferBlockingDuration - .filter((m) => isSelected(m.metric.actor_id)) + .filter((m) => isSelected(m.metric.fragment_id)) .map((m) => ( - - - + + diff --git a/dashboard/components/FragmentGraph.tsx b/dashboard/components/FragmentGraph.tsx index 9c62680d25fc8..aa85501dc66ca 100644 --- a/dashboard/components/FragmentGraph.tsx +++ b/dashboard/components/FragmentGraph.tsx @@ -117,7 +117,7 @@ export default function FragmentGraph({ extraInfo: string } >() - const includedActorIds = new Set() + const includedFragmentIds = new Set() for (const [fragmentId, fragmentRoot] of deps) { const layoutRoot = treeLayoutFlip(fragmentRoot, { dx: nodeMarginX, @@ -137,7 +137,7 @@ export default function FragmentGraph({ height, extraInfo: `Actor ${fragmentRoot.data.actor_ids?.join(", ")}` || "", }) - fragmentRoot.data.actor_ids?.forEach((id) => includedActorIds.add(id)) + includedFragmentIds.add(fragmentId) } const fragmentLayout = layout( fragmentDependencyDag.map(({ width: _1, height: _2, id, ...data }) => { @@ -169,7 +169,7 @@ export default function FragmentGraph({ svgWidth, svgHeight, links, - includedActorIds, + includedFragmentIds, } }, [planNodeDependencies, fragmentDependency]) @@ -189,7 +189,7 @@ export default function FragmentGraph({ links, fragmentLayout: fragmentDependencyDag, layoutResult: planNodeDependencyDag, - includedActorIds, + 
includedFragmentIds, } = planNodeDependencyDagCallback() useEffect(() => { @@ -434,7 +434,7 @@ export default function FragmentGraph({ - + ) } diff --git a/dashboard/components/metrics.tsx b/dashboard/components/metrics.tsx index efa32175555eb..a933dbc7fd807 100644 --- a/dashboard/components/metrics.tsx +++ b/dashboard/components/metrics.tsx @@ -21,6 +21,11 @@ export interface MetricsSample { } export interface Metrics { + // Tags of this timeseries. Example: {"downstream_fragment_id":"15001","fragment_id":"15002"} metric: { [key: string]: string } + + // Example: [{"timestamp":1695041872.0,"value":0.3797035002929275}, + // {"timestamp":1695041887.0,"value":0.5914327683152408}, + // {"timestamp":1695041902.0,"value":0.8272212493499999}, ... ] sample: MetricsSample[] } diff --git a/dashboard/package-lock.json b/dashboard/package-lock.json index e9cb795252f6c..d21a177da60a4 100644 --- a/dashboard/package-lock.json +++ b/dashboard/package-lock.json @@ -25,7 +25,7 @@ "fabric": "^5.2.1", "framer-motion": "^6.5.1", "lodash": "^4.17.21", - "next": "^13.4.12", + "next": "^13.5.4", "react": "^18.2.0", "react-dom": "^18.2.0", "react-flow-renderer": "10.3.16", @@ -85,11 +85,12 @@ } }, "node_modules/@babel/code-frame": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.18.6.tgz", - "integrity": "sha512-TDCmlK5eOvH+eH7cdAFlNXeVJqWIQ7gW9tY1GJIpUtFb6CmjVyq2VM3u71bOyR8CRihcCgMUYoDNyLXao3+70Q==", + "version": "7.22.13", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.22.13.tgz", + "integrity": "sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w==", "dependencies": { - "@babel/highlight": "^7.18.6" + "@babel/highlight": "^7.22.13", + "chalk": "^2.4.2" }, "engines": { "node": ">=6.9.0" @@ -135,12 +136,13 @@ } }, "node_modules/@babel/generator": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.19.0.tgz", - "integrity": "sha512-S1ahxf1gZ2dpoiFgA+ohK9DIpz50bJ0CWs7Zlzb54Z4sG8qmdIrGrVqmy1sAtTVRb+9CU6U8VqT9L0Zj7hxHVg==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.23.0.tgz", + "integrity": "sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g==", "dependencies": { - "@babel/types": "^7.19.0", + "@babel/types": "^7.23.0", "@jridgewell/gen-mapping": "^0.3.2", + "@jridgewell/trace-mapping": "^0.3.17", "jsesc": "^2.5.1" }, "engines": { @@ -190,31 +192,31 @@ } }, "node_modules/@babel/helper-environment-visitor": { - "version": "7.18.9", - "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.9.tgz", - "integrity": "sha512-3r/aACDJ3fhQ/EVgFy0hpj8oHyHpQc+LPtJoY9SzTThAsStm4Ptegq92vqKoE3vD706ZVFWITnMnxucw+S9Ipg==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz", + "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA==", "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-function-name": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.19.0.tgz", - "integrity": "sha512-WAwHBINyrpqywkUH0nTnNgI5ina5TFn85HKS0pbPDfxFfhyR/aNQEn4hGi1P1JyT//I0t4OgXUlofzWILRvS5w==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz", + 
"integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==", "dependencies": { - "@babel/template": "^7.18.10", - "@babel/types": "^7.19.0" + "@babel/template": "^7.22.15", + "@babel/types": "^7.23.0" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-hoist-variables": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.18.6.tgz", - "integrity": "sha512-UlJQPkFqFULIcyW5sbzgbkxn2FKRgwWiRexcuaR8RNJRy8+LLveqPjwZV/bwrLZCN0eUHD/x8D0heK1ozuoo6Q==", + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz", + "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==", "dependencies": { - "@babel/types": "^7.18.6" + "@babel/types": "^7.22.5" }, "engines": { "node": ">=6.9.0" @@ -271,28 +273,28 @@ } }, "node_modules/@babel/helper-split-export-declaration": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.18.6.tgz", - "integrity": "sha512-bde1etTx6ZyTmobl9LLMMQsaizFVZrquTEHOqKeQESMKo4PlObf+8+JA25ZsIpZhT/WEd39+vOdLXAFG/nELpA==", + "version": "7.22.6", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz", + "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==", "dependencies": { - "@babel/types": "^7.18.6" + "@babel/types": "^7.22.5" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-string-parser": { - "version": "7.18.10", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.18.10.tgz", - "integrity": "sha512-XtIfWmeNY3i4t7t4D2t02q50HvqHybPqW2ki1kosnvWCwuCMeo81Jf0gwr85jy/neUdg5XDdeFE/80DXiO+njw==", + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz", + "integrity": "sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw==", "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-validator-identifier": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.18.6.tgz", - "integrity": "sha512-MmetCkz9ej86nJQV+sFCxoGGrUbU3q02kgLciwkrt9QqEB7cP39oKEY0PakknEO0Gu20SskMRi+AYZ3b1TpN9g==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==", "engines": { "node": ">=6.9.0" } @@ -321,12 +323,12 @@ } }, "node_modules/@babel/highlight": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.18.6.tgz", - "integrity": "sha512-u7stbOuYjaPezCuLj29hNW1v64M2Md2qupEKP1fHc7WdOA3DgLh37suiSrZYY7haUB7iBeQZ9P1uiRF359do3g==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.22.20.tgz", + "integrity": "sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg==", "dependencies": { - "@babel/helper-validator-identifier": "^7.18.6", - "chalk": "^2.0.0", + "@babel/helper-validator-identifier": "^7.22.20", + "chalk": "^2.4.2", "js-tokens": "^4.0.0" }, "engines": { @@ 
-334,9 +336,9 @@ } }, "node_modules/@babel/parser": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.19.0.tgz", - "integrity": "sha512-74bEXKX2h+8rrfQUfsBfuZZHzsEs6Eql4pqy/T4Nn6Y9wNPggQOqD6z6pn5Bl8ZfysKouFZT/UXEH94ummEeQw==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.0.tgz", + "integrity": "sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw==", "bin": { "parser": "bin/babel-parser.js" }, @@ -383,31 +385,31 @@ } }, "node_modules/@babel/template": { - "version": "7.18.10", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.18.10.tgz", - "integrity": "sha512-TI+rCtooWHr3QJ27kJxfjutghu44DLnasDMwpDqCXVTal9RLp3RSYNh4NdBrRP2cQAoG9A8juOQl6P6oZG4JxA==", + "version": "7.22.15", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz", + "integrity": "sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==", "dependencies": { - "@babel/code-frame": "^7.18.6", - "@babel/parser": "^7.18.10", - "@babel/types": "^7.18.10" + "@babel/code-frame": "^7.22.13", + "@babel/parser": "^7.22.15", + "@babel/types": "^7.22.15" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/traverse": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.19.0.tgz", - "integrity": "sha512-4pKpFRDh+utd2mbRC8JLnlsMUii3PMHjpL6a0SZ4NMZy7YFP9aXORxEhdMVOc9CpWtDF09IkciQLEhK7Ml7gRA==", - "dependencies": { - "@babel/code-frame": "^7.18.6", - "@babel/generator": "^7.19.0", - "@babel/helper-environment-visitor": "^7.18.9", - "@babel/helper-function-name": "^7.19.0", - "@babel/helper-hoist-variables": "^7.18.6", - "@babel/helper-split-export-declaration": "^7.18.6", - "@babel/parser": "^7.19.0", - "@babel/types": "^7.19.0", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", + "dependencies": { + "@babel/code-frame": "^7.22.13", + "@babel/generator": "^7.23.0", + "@babel/helper-environment-visitor": "^7.22.20", + "@babel/helper-function-name": "^7.23.0", + "@babel/helper-hoist-variables": "^7.22.5", + "@babel/helper-split-export-declaration": "^7.22.6", + "@babel/parser": "^7.23.0", + "@babel/types": "^7.23.0", "debug": "^4.1.0", "globals": "^11.1.0" }, @@ -416,12 +418,12 @@ } }, "node_modules/@babel/types": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.19.0.tgz", - "integrity": "sha512-YuGopBq3ke25BVSiS6fgF49Ul9gH1x70Bcr6bqRLjWCkcX8Hre1/5+z+IiWOIerRMSSEfGZVB9z9kyq7wVs9YA==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz", + "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==", "dependencies": { - "@babel/helper-string-parser": "^7.18.10", - "@babel/helper-validator-identifier": "^7.18.6", + "@babel/helper-string-parser": "^7.22.5", + "@babel/helper-validator-identifier": "^7.22.20", "to-fast-properties": "^2.0.0" }, "engines": { @@ -1882,12 +1884,12 @@ "integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw==" }, "node_modules/@jridgewell/trace-mapping": { - "version": "0.3.15", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.15.tgz", - "integrity": 
"sha512-oWZNOULl+UbhsgB51uuZzglikfIKSUBO/M9W2OfEjn7cmqoAiCgmv9lyACTUacZwBz0ITnJ2NqjU8Tx0DHL88g==", + "version": "0.3.19", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.19.tgz", + "integrity": "sha512-kf37QtfW+Hwx/buWGMPcR60iF9ziHa6r/CZJIHbmcm4+0qrXiVdxegAH0F6yddEVQ7zdkjcGCgCzUu+BcbhQxw==", "dependencies": { - "@jridgewell/resolve-uri": "^3.0.3", - "@jridgewell/sourcemap-codec": "^1.4.10" + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" } }, "node_modules/@loadable/component": { @@ -2034,9 +2036,9 @@ } }, "node_modules/@next/env": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/env/-/env-13.4.12.tgz", - "integrity": "sha512-RmHanbV21saP/6OEPBJ7yJMuys68cIf8OBBWd7+uj40LdpmswVAwe1uzeuFyUsd6SfeITWT3XnQfn6wULeKwDQ==" + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/env/-/env-13.5.4.tgz", + "integrity": "sha512-LGegJkMvRNw90WWphGJ3RMHMVplYcOfRWf2Be3td3sUa+1AaxmsYyANsA+znrGCBjXJNi4XAQlSoEfUxs/4kIQ==" }, "node_modules/@next/eslint-plugin-next": { "version": "13.4.12", @@ -2068,9 +2070,9 @@ } }, "node_modules/@next/swc-darwin-arm64": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.12.tgz", - "integrity": "sha512-deUrbCXTMZ6ZhbOoloqecnUeNpUOupi8SE2tx4jPfNS9uyUR9zK4iXBvH65opVcA/9F5I/p8vDXSYbUlbmBjZg==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.5.4.tgz", + "integrity": "sha512-Df8SHuXgF1p+aonBMcDPEsaahNo2TCwuie7VXED4FVyECvdXfRT9unapm54NssV9tF3OQFKBFOdlje4T43VO0w==", "cpu": [ "arm64" ], @@ -2083,9 +2085,9 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.12.tgz", - "integrity": "sha512-WRvH7RxgRHlC1yb5oG0ZLx8F7uci9AivM5/HGGv9ZyG2Als8Ij64GC3d+mQ5sJhWjusyU6T6V1WKTUoTmOB0zQ==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.5.4.tgz", + "integrity": "sha512-siPuUwO45PnNRMeZnSa8n/Lye5ZX93IJom9wQRB5DEOdFrw0JjOMu1GINB8jAEdwa7Vdyn1oJ2xGNaQpdQQ9Pw==", "cpu": [ "x64" ], @@ -2098,9 +2100,9 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.12.tgz", - "integrity": "sha512-YEKracAWuxp54tKiAvvq73PUs9lok57cc8meYRibTWe/VdPB2vLgkTVWFcw31YDuRXdEhdX0fWS6Q+ESBhnEig==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.5.4.tgz", + "integrity": "sha512-l/k/fvRP/zmB2jkFMfefmFkyZbDkYW0mRM/LB+tH5u9pB98WsHXC0WvDHlGCYp3CH/jlkJPL7gN8nkTQVrQ/2w==", "cpu": [ "arm64" ], @@ -2113,9 +2115,9 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.12.tgz", - "integrity": "sha512-LhJR7/RAjdHJ2Isl2pgc/JaoxNk0KtBgkVpiDJPVExVWA1c6gzY57+3zWuxuyWzTG+fhLZo2Y80pLXgIJv7g3g==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.5.4.tgz", + "integrity": "sha512-YYGb7SlLkI+XqfQa8VPErljb7k9nUnhhRrVaOdfJNCaQnHBcvbT7cx/UjDQLdleJcfyg1Hkn5YSSIeVfjgmkTg==", "cpu": [ "arm64" ], @@ -2128,9 +2130,9 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.12.tgz", - 
"integrity": "sha512-1DWLL/B9nBNiQRng+1aqs3OaZcxC16Nf+mOnpcrZZSdyKHek3WQh6j/fkbukObgNGwmCoVevLUa/p3UFTTqgqg==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.5.4.tgz", + "integrity": "sha512-uE61vyUSClnCH18YHjA8tE1prr/PBFlBFhxBZis4XBRJoR+txAky5d7gGNUIbQ8sZZ7LVkSVgm/5Fc7mwXmRAg==", "cpu": [ "x64" ], @@ -2143,9 +2145,9 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.12.tgz", - "integrity": "sha512-kEAJmgYFhp0VL+eRWmUkVxLVunn7oL9Mdue/FS8yzRBVj7Z0AnIrHpTIeIUl1bbdQq1VaoOztnKicAjfkLTRCQ==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.5.4.tgz", + "integrity": "sha512-qVEKFYML/GvJSy9CfYqAdUexA6M5AklYcQCW+8JECmkQHGoPxCf04iMh7CPR7wkHyWWK+XLt4Ja7hhsPJtSnhg==", "cpu": [ "x64" ], @@ -2158,9 +2160,9 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.12.tgz", - "integrity": "sha512-GMLuL/loR6yIIRTnPRY6UGbLL9MBdw2anxkOnANxvLvsml4F0HNIgvnU3Ej4BjbqMTNjD4hcPFdlEow4XHPdZA==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.5.4.tgz", + "integrity": "sha512-mDSQfqxAlfpeZOLPxLymZkX0hYF3juN57W6vFHTvwKlnHfmh12Pt7hPIRLYIShk8uYRsKPtMTth/EzpwRI+u8w==", "cpu": [ "arm64" ], @@ -2173,9 +2175,9 @@ } }, "node_modules/@next/swc-win32-ia32-msvc": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.12.tgz", - "integrity": "sha512-PhgNqN2Vnkm7XaMdRmmX0ZSwZXQAtamBVSa9A/V1dfKQCV1rjIZeiy/dbBnVYGdj63ANfsOR/30XpxP71W0eww==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.5.4.tgz", + "integrity": "sha512-aoqAT2XIekIWoriwzOmGFAvTtVY5O7JjV21giozBTP5c6uZhpvTWRbmHXbmsjZqY4HnEZQRXWkSAppsIBweKqw==", "cpu": [ "ia32" ], @@ -2188,9 +2190,9 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.4.12.tgz", - "integrity": "sha512-Z+56e/Ljt0bUs+T+jPjhFyxYBcdY2RIq9ELFU+qAMQMteHo7ymbV7CKmlcX59RI9C4YzN8PgMgLyAoi916b5HA==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.5.4.tgz", + "integrity": "sha512-cyRvlAxwlddlqeB9xtPSfNSCRy8BOa4wtMo0IuI9P7Y0XT2qpDrpFKRyZ7kUngZis59mPVla5k8X1oOJ8RxDYg==", "cpu": [ "x64" ], @@ -2333,9 +2335,9 @@ "dev": true }, "node_modules/@swc/helpers": { - "version": "0.5.1", - "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.1.tgz", - "integrity": "sha512-sJ902EfIzn1Fa+qYmjdQqh8tPsoxyBz+8yBKC2HKUxyezKJFwPGOn7pv4WY6QuQW//ySQi5lJjA/ZT9sNWWNTg==", + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.2.tgz", + "integrity": "sha512-E4KcWTpoLHqwPHLxidpOqQbcrZVgi0rsmmZXUle1jXmJfuIf/UWpczUJ7MZZ5tlxytgJXyp0w4PGkkeLiuIdZw==", "dependencies": { "tslib": "^2.4.0" } @@ -8139,9 +8141,15 @@ "optional": true }, "node_modules/nanoid": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz", - "integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==", + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz", + 
"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], "bin": { "nanoid": "bin/nanoid.cjs" }, @@ -8171,39 +8179,37 @@ } }, "node_modules/next": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/next/-/next-13.4.12.tgz", - "integrity": "sha512-eHfnru9x6NRmTMcjQp6Nz0J4XH9OubmzOa7CkWL+AUrUxpibub3vWwttjduu9No16dug1kq04hiUUpo7J3m3Xw==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/next/-/next-13.5.4.tgz", + "integrity": "sha512-+93un5S779gho8y9ASQhb/bTkQF17FNQOtXLKAj3lsNgltEcF0C5PMLLncDmH+8X1EnJH1kbqAERa29nRXqhjA==", "dependencies": { - "@next/env": "13.4.12", - "@swc/helpers": "0.5.1", + "@next/env": "13.5.4", + "@swc/helpers": "0.5.2", "busboy": "1.6.0", "caniuse-lite": "^1.0.30001406", - "postcss": "8.4.14", + "postcss": "8.4.31", "styled-jsx": "5.1.1", - "watchpack": "2.4.0", - "zod": "3.21.4" + "watchpack": "2.4.0" }, "bin": { "next": "dist/bin/next" }, "engines": { - "node": ">=16.8.0" + "node": ">=16.14.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "13.4.12", - "@next/swc-darwin-x64": "13.4.12", - "@next/swc-linux-arm64-gnu": "13.4.12", - "@next/swc-linux-arm64-musl": "13.4.12", - "@next/swc-linux-x64-gnu": "13.4.12", - "@next/swc-linux-x64-musl": "13.4.12", - "@next/swc-win32-arm64-msvc": "13.4.12", - "@next/swc-win32-ia32-msvc": "13.4.12", - "@next/swc-win32-x64-msvc": "13.4.12" + "@next/swc-darwin-arm64": "13.5.4", + "@next/swc-darwin-x64": "13.5.4", + "@next/swc-linux-arm64-gnu": "13.5.4", + "@next/swc-linux-arm64-musl": "13.5.4", + "@next/swc-linux-x64-gnu": "13.5.4", + "@next/swc-linux-x64-musl": "13.5.4", + "@next/swc-win32-arm64-msvc": "13.5.4", + "@next/swc-win32-ia32-msvc": "13.5.4", + "@next/swc-win32-x64-msvc": "13.5.4" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", - "fibers": ">= 3.1.0", "react": "^18.2.0", "react-dom": "^18.2.0", "sass": "^1.3.0" @@ -8212,9 +8218,6 @@ "@opentelemetry/api": { "optional": true }, - "fibers": { - "optional": true - }, "sass": { "optional": true } @@ -8672,9 +8675,9 @@ } }, "node_modules/postcss": { - "version": "8.4.14", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.14.tgz", - "integrity": "sha512-E398TUmfAYFPBSdzgeieK2Y1+1cpdxJx8yXbK/m57nRhKSmk1GB2tO4lbLBtlkfPQTDKfe4Xqv1ASWPpayPEig==", + "version": "8.4.31", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", "funding": [ { "type": "opencollective", @@ -8683,10 +8686,14 @@ { "type": "tidelift", "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" } ], "dependencies": { - "nanoid": "^3.3.4", + "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" }, @@ -10894,14 +10901,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/zod": { - "version": "3.21.4", - "resolved": "https://registry.npmjs.org/zod/-/zod-3.21.4.tgz", - "integrity": "sha512-m46AKbrzKVzOzs/DZgVnG5H55N1sv1M8qZU3A8RIKbs3mrACDNeIOeilDymVb2HdmP8uwshOCF4uJ8uM9rCqJw==", - "funding": { - "url": "https://github.com/sponsors/colinhacks" - } - }, "node_modules/zustand": { "version": "3.7.2", "resolved": "https://registry.npmjs.org/zustand/-/zustand-3.7.2.tgz", @@ -10943,11 +10942,12 @@ } }, "@babel/code-frame": { - "version": "7.18.6", - "resolved": 
"https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.18.6.tgz", - "integrity": "sha512-TDCmlK5eOvH+eH7cdAFlNXeVJqWIQ7gW9tY1GJIpUtFb6CmjVyq2VM3u71bOyR8CRihcCgMUYoDNyLXao3+70Q==", + "version": "7.22.13", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.22.13.tgz", + "integrity": "sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w==", "requires": { - "@babel/highlight": "^7.18.6" + "@babel/highlight": "^7.22.13", + "chalk": "^2.4.2" } }, "@babel/compat-data": { @@ -10980,12 +10980,13 @@ } }, "@babel/generator": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.19.0.tgz", - "integrity": "sha512-S1ahxf1gZ2dpoiFgA+ohK9DIpz50bJ0CWs7Zlzb54Z4sG8qmdIrGrVqmy1sAtTVRb+9CU6U8VqT9L0Zj7hxHVg==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.23.0.tgz", + "integrity": "sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g==", "requires": { - "@babel/types": "^7.19.0", + "@babel/types": "^7.23.0", "@jridgewell/gen-mapping": "^0.3.2", + "@jridgewell/trace-mapping": "^0.3.17", "jsesc": "^2.5.1" }, "dependencies": { @@ -11022,25 +11023,25 @@ } }, "@babel/helper-environment-visitor": { - "version": "7.18.9", - "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.9.tgz", - "integrity": "sha512-3r/aACDJ3fhQ/EVgFy0hpj8oHyHpQc+LPtJoY9SzTThAsStm4Ptegq92vqKoE3vD706ZVFWITnMnxucw+S9Ipg==" + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz", + "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA==" }, "@babel/helper-function-name": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.19.0.tgz", - "integrity": "sha512-WAwHBINyrpqywkUH0nTnNgI5ina5TFn85HKS0pbPDfxFfhyR/aNQEn4hGi1P1JyT//I0t4OgXUlofzWILRvS5w==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz", + "integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==", "requires": { - "@babel/template": "^7.18.10", - "@babel/types": "^7.19.0" + "@babel/template": "^7.22.15", + "@babel/types": "^7.23.0" } }, "@babel/helper-hoist-variables": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.18.6.tgz", - "integrity": "sha512-UlJQPkFqFULIcyW5sbzgbkxn2FKRgwWiRexcuaR8RNJRy8+LLveqPjwZV/bwrLZCN0eUHD/x8D0heK1ozuoo6Q==", + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz", + "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==", "requires": { - "@babel/types": "^7.18.6" + "@babel/types": "^7.22.5" } }, "@babel/helper-module-imports": { @@ -11082,22 +11083,22 @@ } }, "@babel/helper-split-export-declaration": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.18.6.tgz", - "integrity": "sha512-bde1etTx6ZyTmobl9LLMMQsaizFVZrquTEHOqKeQESMKo4PlObf+8+JA25ZsIpZhT/WEd39+vOdLXAFG/nELpA==", + "version": "7.22.6", + "resolved": 
"https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz", + "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==", "requires": { - "@babel/types": "^7.18.6" + "@babel/types": "^7.22.5" } }, "@babel/helper-string-parser": { - "version": "7.18.10", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.18.10.tgz", - "integrity": "sha512-XtIfWmeNY3i4t7t4D2t02q50HvqHybPqW2ki1kosnvWCwuCMeo81Jf0gwr85jy/neUdg5XDdeFE/80DXiO+njw==" + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz", + "integrity": "sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw==" }, "@babel/helper-validator-identifier": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.18.6.tgz", - "integrity": "sha512-MmetCkz9ej86nJQV+sFCxoGGrUbU3q02kgLciwkrt9QqEB7cP39oKEY0PakknEO0Gu20SskMRi+AYZ3b1TpN9g==" + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==" }, "@babel/helper-validator-option": { "version": "7.18.6", @@ -11117,19 +11118,19 @@ } }, "@babel/highlight": { - "version": "7.18.6", - "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.18.6.tgz", - "integrity": "sha512-u7stbOuYjaPezCuLj29hNW1v64M2Md2qupEKP1fHc7WdOA3DgLh37suiSrZYY7haUB7iBeQZ9P1uiRF359do3g==", + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.22.20.tgz", + "integrity": "sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg==", "requires": { - "@babel/helper-validator-identifier": "^7.18.6", - "chalk": "^2.0.0", + "@babel/helper-validator-identifier": "^7.22.20", + "chalk": "^2.4.2", "js-tokens": "^4.0.0" } }, "@babel/parser": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.19.0.tgz", - "integrity": "sha512-74bEXKX2h+8rrfQUfsBfuZZHzsEs6Eql4pqy/T4Nn6Y9wNPggQOqD6z6pn5Bl8ZfysKouFZT/UXEH94ummEeQw==" + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.0.tgz", + "integrity": "sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw==" }, "@babel/plugin-syntax-jsx": { "version": "7.18.6", @@ -11158,39 +11159,39 @@ } }, "@babel/template": { - "version": "7.18.10", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.18.10.tgz", - "integrity": "sha512-TI+rCtooWHr3QJ27kJxfjutghu44DLnasDMwpDqCXVTal9RLp3RSYNh4NdBrRP2cQAoG9A8juOQl6P6oZG4JxA==", + "version": "7.22.15", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz", + "integrity": "sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==", "requires": { - "@babel/code-frame": "^7.18.6", - "@babel/parser": "^7.18.10", - "@babel/types": "^7.18.10" + "@babel/code-frame": "^7.22.13", + "@babel/parser": "^7.22.15", + "@babel/types": "^7.22.15" } }, "@babel/traverse": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.19.0.tgz", - "integrity": 
"sha512-4pKpFRDh+utd2mbRC8JLnlsMUii3PMHjpL6a0SZ4NMZy7YFP9aXORxEhdMVOc9CpWtDF09IkciQLEhK7Ml7gRA==", - "requires": { - "@babel/code-frame": "^7.18.6", - "@babel/generator": "^7.19.0", - "@babel/helper-environment-visitor": "^7.18.9", - "@babel/helper-function-name": "^7.19.0", - "@babel/helper-hoist-variables": "^7.18.6", - "@babel/helper-split-export-declaration": "^7.18.6", - "@babel/parser": "^7.19.0", - "@babel/types": "^7.19.0", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", + "requires": { + "@babel/code-frame": "^7.22.13", + "@babel/generator": "^7.23.0", + "@babel/helper-environment-visitor": "^7.22.20", + "@babel/helper-function-name": "^7.23.0", + "@babel/helper-hoist-variables": "^7.22.5", + "@babel/helper-split-export-declaration": "^7.22.6", + "@babel/parser": "^7.23.0", + "@babel/types": "^7.23.0", "debug": "^4.1.0", "globals": "^11.1.0" } }, "@babel/types": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.19.0.tgz", - "integrity": "sha512-YuGopBq3ke25BVSiS6fgF49Ul9gH1x70Bcr6bqRLjWCkcX8Hre1/5+z+IiWOIerRMSSEfGZVB9z9kyq7wVs9YA==", + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz", + "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==", "requires": { - "@babel/helper-string-parser": "^7.18.10", - "@babel/helper-validator-identifier": "^7.18.6", + "@babel/helper-string-parser": "^7.22.5", + "@babel/helper-validator-identifier": "^7.22.20", "to-fast-properties": "^2.0.0" } }, @@ -12289,12 +12290,12 @@ "integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw==" }, "@jridgewell/trace-mapping": { - "version": "0.3.15", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.15.tgz", - "integrity": "sha512-oWZNOULl+UbhsgB51uuZzglikfIKSUBO/M9W2OfEjn7cmqoAiCgmv9lyACTUacZwBz0ITnJ2NqjU8Tx0DHL88g==", + "version": "0.3.19", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.19.tgz", + "integrity": "sha512-kf37QtfW+Hwx/buWGMPcR60iF9ziHa6r/CZJIHbmcm4+0qrXiVdxegAH0F6yddEVQ7zdkjcGCgCzUu+BcbhQxw==", "requires": { - "@jridgewell/resolve-uri": "^3.0.3", - "@jridgewell/sourcemap-codec": "^1.4.10" + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" } }, "@loadable/component": { @@ -12418,9 +12419,9 @@ } }, "@next/env": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/env/-/env-13.4.12.tgz", - "integrity": "sha512-RmHanbV21saP/6OEPBJ7yJMuys68cIf8OBBWd7+uj40LdpmswVAwe1uzeuFyUsd6SfeITWT3XnQfn6wULeKwDQ==" + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/env/-/env-13.5.4.tgz", + "integrity": "sha512-LGegJkMvRNw90WWphGJ3RMHMVplYcOfRWf2Be3td3sUa+1AaxmsYyANsA+znrGCBjXJNi4XAQlSoEfUxs/4kIQ==" }, "@next/eslint-plugin-next": { "version": "13.4.12", @@ -12448,57 +12449,57 @@ } }, "@next/swc-darwin-arm64": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.12.tgz", - "integrity": "sha512-deUrbCXTMZ6ZhbOoloqecnUeNpUOupi8SE2tx4jPfNS9uyUR9zK4iXBvH65opVcA/9F5I/p8vDXSYbUlbmBjZg==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.5.4.tgz", + "integrity": 
"sha512-Df8SHuXgF1p+aonBMcDPEsaahNo2TCwuie7VXED4FVyECvdXfRT9unapm54NssV9tF3OQFKBFOdlje4T43VO0w==", "optional": true }, "@next/swc-darwin-x64": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.12.tgz", - "integrity": "sha512-WRvH7RxgRHlC1yb5oG0ZLx8F7uci9AivM5/HGGv9ZyG2Als8Ij64GC3d+mQ5sJhWjusyU6T6V1WKTUoTmOB0zQ==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.5.4.tgz", + "integrity": "sha512-siPuUwO45PnNRMeZnSa8n/Lye5ZX93IJom9wQRB5DEOdFrw0JjOMu1GINB8jAEdwa7Vdyn1oJ2xGNaQpdQQ9Pw==", "optional": true }, "@next/swc-linux-arm64-gnu": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.12.tgz", - "integrity": "sha512-YEKracAWuxp54tKiAvvq73PUs9lok57cc8meYRibTWe/VdPB2vLgkTVWFcw31YDuRXdEhdX0fWS6Q+ESBhnEig==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.5.4.tgz", + "integrity": "sha512-l/k/fvRP/zmB2jkFMfefmFkyZbDkYW0mRM/LB+tH5u9pB98WsHXC0WvDHlGCYp3CH/jlkJPL7gN8nkTQVrQ/2w==", "optional": true }, "@next/swc-linux-arm64-musl": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.12.tgz", - "integrity": "sha512-LhJR7/RAjdHJ2Isl2pgc/JaoxNk0KtBgkVpiDJPVExVWA1c6gzY57+3zWuxuyWzTG+fhLZo2Y80pLXgIJv7g3g==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.5.4.tgz", + "integrity": "sha512-YYGb7SlLkI+XqfQa8VPErljb7k9nUnhhRrVaOdfJNCaQnHBcvbT7cx/UjDQLdleJcfyg1Hkn5YSSIeVfjgmkTg==", "optional": true }, "@next/swc-linux-x64-gnu": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.12.tgz", - "integrity": "sha512-1DWLL/B9nBNiQRng+1aqs3OaZcxC16Nf+mOnpcrZZSdyKHek3WQh6j/fkbukObgNGwmCoVevLUa/p3UFTTqgqg==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.5.4.tgz", + "integrity": "sha512-uE61vyUSClnCH18YHjA8tE1prr/PBFlBFhxBZis4XBRJoR+txAky5d7gGNUIbQ8sZZ7LVkSVgm/5Fc7mwXmRAg==", "optional": true }, "@next/swc-linux-x64-musl": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.12.tgz", - "integrity": "sha512-kEAJmgYFhp0VL+eRWmUkVxLVunn7oL9Mdue/FS8yzRBVj7Z0AnIrHpTIeIUl1bbdQq1VaoOztnKicAjfkLTRCQ==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.5.4.tgz", + "integrity": "sha512-qVEKFYML/GvJSy9CfYqAdUexA6M5AklYcQCW+8JECmkQHGoPxCf04iMh7CPR7wkHyWWK+XLt4Ja7hhsPJtSnhg==", "optional": true }, "@next/swc-win32-arm64-msvc": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.12.tgz", - "integrity": "sha512-GMLuL/loR6yIIRTnPRY6UGbLL9MBdw2anxkOnANxvLvsml4F0HNIgvnU3Ej4BjbqMTNjD4hcPFdlEow4XHPdZA==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.5.4.tgz", + "integrity": "sha512-mDSQfqxAlfpeZOLPxLymZkX0hYF3juN57W6vFHTvwKlnHfmh12Pt7hPIRLYIShk8uYRsKPtMTth/EzpwRI+u8w==", "optional": true }, "@next/swc-win32-ia32-msvc": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.12.tgz", - "integrity": 
"sha512-PhgNqN2Vnkm7XaMdRmmX0ZSwZXQAtamBVSa9A/V1dfKQCV1rjIZeiy/dbBnVYGdj63ANfsOR/30XpxP71W0eww==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.5.4.tgz", + "integrity": "sha512-aoqAT2XIekIWoriwzOmGFAvTtVY5O7JjV21giozBTP5c6uZhpvTWRbmHXbmsjZqY4HnEZQRXWkSAppsIBweKqw==", "optional": true }, "@next/swc-win32-x64-msvc": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.4.12.tgz", - "integrity": "sha512-Z+56e/Ljt0bUs+T+jPjhFyxYBcdY2RIq9ELFU+qAMQMteHo7ymbV7CKmlcX59RI9C4YzN8PgMgLyAoi916b5HA==", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-13.5.4.tgz", + "integrity": "sha512-cyRvlAxwlddlqeB9xtPSfNSCRy8BOa4wtMo0IuI9P7Y0XT2qpDrpFKRyZ7kUngZis59mPVla5k8X1oOJ8RxDYg==", "optional": true }, "@nodelib/fs.scandir": { @@ -12613,9 +12614,9 @@ "dev": true }, "@swc/helpers": { - "version": "0.5.1", - "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.1.tgz", - "integrity": "sha512-sJ902EfIzn1Fa+qYmjdQqh8tPsoxyBz+8yBKC2HKUxyezKJFwPGOn7pv4WY6QuQW//ySQi5lJjA/ZT9sNWWNTg==", + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.2.tgz", + "integrity": "sha512-E4KcWTpoLHqwPHLxidpOqQbcrZVgi0rsmmZXUle1jXmJfuIf/UWpczUJ7MZZ5tlxytgJXyp0w4PGkkeLiuIdZw==", "requires": { "tslib": "^2.4.0" } @@ -16990,9 +16991,9 @@ "optional": true }, "nanoid": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz", - "integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==" + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz", + "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==" }, "natural-compare": { "version": "1.4.0", @@ -17013,27 +17014,26 @@ "dev": true }, "next": { - "version": "13.4.12", - "resolved": "https://registry.npmjs.org/next/-/next-13.4.12.tgz", - "integrity": "sha512-eHfnru9x6NRmTMcjQp6Nz0J4XH9OubmzOa7CkWL+AUrUxpibub3vWwttjduu9No16dug1kq04hiUUpo7J3m3Xw==", - "requires": { - "@next/env": "13.4.12", - "@next/swc-darwin-arm64": "13.4.12", - "@next/swc-darwin-x64": "13.4.12", - "@next/swc-linux-arm64-gnu": "13.4.12", - "@next/swc-linux-arm64-musl": "13.4.12", - "@next/swc-linux-x64-gnu": "13.4.12", - "@next/swc-linux-x64-musl": "13.4.12", - "@next/swc-win32-arm64-msvc": "13.4.12", - "@next/swc-win32-ia32-msvc": "13.4.12", - "@next/swc-win32-x64-msvc": "13.4.12", - "@swc/helpers": "0.5.1", + "version": "13.5.4", + "resolved": "https://registry.npmjs.org/next/-/next-13.5.4.tgz", + "integrity": "sha512-+93un5S779gho8y9ASQhb/bTkQF17FNQOtXLKAj3lsNgltEcF0C5PMLLncDmH+8X1EnJH1kbqAERa29nRXqhjA==", + "requires": { + "@next/env": "13.5.4", + "@next/swc-darwin-arm64": "13.5.4", + "@next/swc-darwin-x64": "13.5.4", + "@next/swc-linux-arm64-gnu": "13.5.4", + "@next/swc-linux-arm64-musl": "13.5.4", + "@next/swc-linux-x64-gnu": "13.5.4", + "@next/swc-linux-x64-musl": "13.5.4", + "@next/swc-win32-arm64-msvc": "13.5.4", + "@next/swc-win32-ia32-msvc": "13.5.4", + "@next/swc-win32-x64-msvc": "13.5.4", + "@swc/helpers": "0.5.2", "busboy": "1.6.0", "caniuse-lite": "^1.0.30001406", - "postcss": "8.4.14", + "postcss": "8.4.31", "styled-jsx": "5.1.1", - "watchpack": "2.4.0", - "zod": "3.21.4" + "watchpack": "2.4.0" } }, "node-fetch": { @@ -17368,11 +17368,11 @@ } }, "postcss": { - "version": "8.4.14", - "resolved": 
"https://registry.npmjs.org/postcss/-/postcss-8.4.14.tgz", - "integrity": "sha512-E398TUmfAYFPBSdzgeieK2Y1+1cpdxJx8yXbK/m57nRhKSmk1GB2tO4lbLBtlkfPQTDKfe4Xqv1ASWPpayPEig==", + "version": "8.4.31", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", "requires": { - "nanoid": "^3.3.4", + "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" } @@ -18942,11 +18942,6 @@ "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", "dev": true }, - "zod": { - "version": "3.21.4", - "resolved": "https://registry.npmjs.org/zod/-/zod-3.21.4.tgz", - "integrity": "sha512-m46AKbrzKVzOzs/DZgVnG5H55N1sv1M8qZU3A8RIKbs3mrACDNeIOeilDymVb2HdmP8uwshOCF4uJ8uM9rCqJw==" - }, "zustand": { "version": "3.7.2", "resolved": "https://registry.npmjs.org/zustand/-/zustand-3.7.2.tgz", diff --git a/dashboard/package.json b/dashboard/package.json index a0642de4380df..67d2ff0ef1715 100644 --- a/dashboard/package.json +++ b/dashboard/package.json @@ -31,7 +31,7 @@ "fabric": "^5.2.1", "framer-motion": "^6.5.1", "lodash": "^4.17.21", - "next": "^13.4.12", + "next": "^13.5.4", "react": "^18.2.0", "react-dom": "^18.2.0", "react-flow-renderer": "10.3.16", diff --git a/dashboard/pages/await_tree.tsx b/dashboard/pages/await_tree.tsx index 8908e121deabd..3db6b6677274a 100644 --- a/dashboard/pages/await_tree.tsx +++ b/dashboard/pages/await_tree.tsx @@ -36,22 +36,32 @@ import { getClusterInfoComputeNode } from "./api/cluster" import useFetch from "./api/fetch" const SIDEBAR_WIDTH = 200 +const ALL_COMPUTE_NODES = "" export default function AwaitTreeDump() { const { response: computeNodes } = useFetch(getClusterInfoComputeNode) - const [computeNodeId, setComputeNodeId] = useState() - const [dump, setDump] = useState("") + const [computeNodeId, setComputeNodeId] = useState() + const [dump, setDump] = useState("") useEffect(() => { - if (computeNodes && !computeNodeId && computeNodes.length > 0) { - setComputeNodeId(computeNodes[0].id) + if (computeNodes && !computeNodeId) { + setComputeNodeId(ALL_COMPUTE_NODES) } }, [computeNodes, computeNodeId]) const dumpTree = async () => { - const title = `Await-Tree Dump of Compute Node ${computeNodeId}:` - setDump(undefined) + if (computeNodeId === undefined) { + return + } + + let title + if (computeNodeId === ALL_COMPUTE_NODES) { + title = "Await-Tree Dump of All Compute Nodes:" + } else { + title = `Await-Tree Dump of Compute Node ${computeNodeId}:` + } + setDump("Loading...") let result @@ -92,10 +102,13 @@ export default function AwaitTreeDump() { Compute Nodes
-              <Th>Actor ID</Th>
-              <Th>Instance</Th>
-              <Th>Block Rate</Th>
+              <Th>Fragment IDs → Downstream</Th>
+              <Th>Block Rate</Th>
-                  <Td>{m.metric.actor_id}</Td>
-                  <Td>{m.metric.instance}</Td>
+                  <Td>{`Fragment ${m.metric.fragment_id} -> ${m.metric.downstream_fragment_id}`}</Td>
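The dashboard hunks above re-key the back-pressure table from actor IDs to fragment IDs: each Prometheus series is now tagged with `fragment_id` and `downstream_fragment_id` (see the comment added in dashboard/components/metrics.tsx), and FragmentGraph passes a set of fragment IDs instead of actor IDs. One detail worth noting is that the iteratee `(m) => (m.metric.fragment_id, m.metric.downstream_fragment_id)` handed to lodash's `sortBy` uses the JavaScript comma operator, so it effectively orders rows by `downstream_fragment_id` alone. The sketch below is illustrative only and not part of this diff: the helper names `edgeKey` and `filterBySelectedFragments`, and the composite-key sort, are assumptions about one way to select and order the series by both fragment IDs.

// Shapes mirror the `Metrics` interface in dashboard/components/metrics.tsx
interface MetricsSample {
  timestamp: number
  value: number
}

interface Metrics {
  // Labels of the timeseries, e.g. {"downstream_fragment_id":"15001","fragment_id":"15002"}
  metric: { [key: string]: string }
  sample: MetricsSample[]
}

// Composite key for one "fragment -> downstream fragment" edge (assumed helper)
function edgeKey(m: Metrics): string {
  return `${m.metric.fragment_id} -> ${m.metric.downstream_fragment_id}`
}

// Keep only the series whose upstream fragment is selected in the graph,
// ordered by the composite edge key (assumed helper)
function filterBySelectedFragments(
  series: Metrics[],
  selectedFragmentIds: Set<string>
): Metrics[] {
  return series
    .filter((m) => selectedFragmentIds.has(m.metric.fragment_id))
    .sort((a, b) => edgeKey(a).localeCompare(edgeKey(b)))
}

// Usage with a single hypothetical series
const demo: Metrics[] = [
  {
    metric: { fragment_id: "15002", downstream_fragment_id: "15001" },
    sample: [{ timestamp: 1695041872.0, value: 0.3797 }],
  },
]
console.log(filterBySelectedFragments(demo, new Set(["15002"])).map(edgeKey))
// -> [ "15002 -> 15001" ]

Sorting on a composite string key keeps rows grouped by upstream fragment even when several upstream fragments share the same downstream fragment.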
, NotificationVersion) { let catalog_guard = self.catalog_manager.get_catalog_core_guard().await; - let mut tables = catalog_guard - .database - .list_tables() - .into_iter() - .filter(|t| { - t.stream_job_status == PbStreamJobStatus::Unspecified as i32 - || t.stream_job_status == PbStreamJobStatus::Created as i32 - }) - .collect_vec(); - tables.extend(catalog_guard.database.list_creating_tables()); + let tables = catalog_guard.database.list_tables(); let notification_version = self.env.notification_manager().current_version().await; (tables, notification_version) } diff --git a/src/meta/src/rpc/service/scale_service.rs b/src/meta/service/src/scale_service.rs similarity index 100% rename from src/meta/src/rpc/service/scale_service.rs rename to src/meta/service/src/scale_service.rs diff --git a/src/meta/src/rpc/service/serving_service.rs b/src/meta/service/src/serving_service.rs similarity index 100% rename from src/meta/src/rpc/service/serving_service.rs rename to src/meta/service/src/serving_service.rs diff --git a/src/meta/src/rpc/service/sink_coordination_service.rs b/src/meta/service/src/sink_coordination_service.rs similarity index 92% rename from src/meta/src/rpc/service/sink_coordination_service.rs rename to src/meta/service/src/sink_coordination_service.rs index f7d56af9c063f..72c4cb2ff9af4 100644 --- a/src/meta/src/rpc/service/sink_coordination_service.rs +++ b/src/meta/service/src/sink_coordination_service.rs @@ -20,12 +20,12 @@ use tonic::{Request, Response, Status, Streaming}; use crate::manager::sink_coordination::SinkCoordinatorManager; #[derive(Clone)] -pub(crate) struct SinkCoordinationServiceImpl { +pub struct SinkCoordinationServiceImpl { sink_manager: SinkCoordinatorManager, } impl SinkCoordinationServiceImpl { - pub(crate) fn new(sink_manager: SinkCoordinatorManager) -> Self { + pub fn new(sink_manager: SinkCoordinatorManager) -> Self { Self { sink_manager } } } diff --git a/src/meta/src/rpc/service/stream_service.rs b/src/meta/service/src/stream_service.rs similarity index 99% rename from src/meta/src/rpc/service/stream_service.rs rename to src/meta/service/src/stream_service.rs index b2ed1ec916b08..ef232d9b04ffd 100644 --- a/src/meta/src/rpc/service/stream_service.rs +++ b/src/meta/service/src/stream_service.rs @@ -207,6 +207,7 @@ impl StreamManagerService for StreamServiceImpl { state_table_ids: fragment.state_table_ids.clone(), upstream_fragment_ids: fragment.upstream_fragment_ids.clone(), fragment_type_mask: fragment.fragment_type_mask, + parallelism: fragment.actors.len() as _, } }) }) diff --git a/src/meta/src/rpc/service/system_params_service.rs b/src/meta/service/src/system_params_service.rs similarity index 64% rename from src/meta/src/rpc/service/system_params_service.rs rename to src/meta/service/src/system_params_service.rs index 114c9aa917a68..8d557b401a2ed 100644 --- a/src/meta/src/rpc/service/system_params_service.rs +++ b/src/meta/service/src/system_params_service.rs @@ -19,16 +19,22 @@ use risingwave_pb::meta::{ }; use tonic::{Request, Response, Status}; +use crate::controller::system_param::SystemParamsControllerRef; use crate::manager::SystemParamsManagerRef; pub struct SystemParamsServiceImpl { system_params_manager: SystemParamsManagerRef, + system_params_controller: Option, } impl SystemParamsServiceImpl { - pub fn new(system_params_manager: SystemParamsManagerRef) -> Self { + pub fn new( + system_params_manager: SystemParamsManagerRef, + system_params_controller: Option, + ) -> Self { Self { system_params_manager, + system_params_controller, } } 
} @@ -39,8 +45,15 @@ impl SystemParamsService for SystemParamsServiceImpl { &self, _request: Request, ) -> Result, Status> { - let params = Some(self.system_params_manager.get_pb_params().await); - Ok(Response::new(GetSystemParamsResponse { params })) + let params = if let Some(ctl) = &self.system_params_controller { + ctl.get_pb_params().await + } else { + self.system_params_manager.get_pb_params().await + }; + + Ok(Response::new(GetSystemParamsResponse { + params: Some(params), + })) } async fn set_system_param( @@ -48,10 +61,14 @@ impl SystemParamsService for SystemParamsServiceImpl { request: Request, ) -> Result, Status> { let req = request.into_inner(); - let params = self - .system_params_manager - .set_param(&req.param, req.value) - .await?; + let params = if let Some(ctl) = &self.system_params_controller { + ctl.set_param(&req.param, req.value).await? + } else { + self.system_params_manager + .set_param(&req.param, req.value) + .await? + }; + Ok(Response::new(SetSystemParamResponse { params: Some(params), })) diff --git a/src/meta/src/rpc/service/telemetry_service.rs b/src/meta/service/src/telemetry_service.rs similarity index 67% rename from src/meta/src/rpc/service/telemetry_service.rs rename to src/meta/service/src/telemetry_service.rs index b1a9cdec3ef34..7c413406f13e5 100644 --- a/src/meta/src/rpc/service/telemetry_service.rs +++ b/src/meta/service/src/telemetry_service.rs @@ -14,25 +14,38 @@ use risingwave_pb::meta::telemetry_info_service_server::TelemetryInfoService; use risingwave_pb::meta::{GetTelemetryInfoRequest, TelemetryInfoResponse}; +use sea_orm::EntityTrait; use tonic::{Request, Response, Status}; +use crate::controller::SqlMetaStore; use crate::model::ClusterId; +use crate::model_v2::prelude::Cluster; use crate::storage::MetaStoreRef; +use crate::MetaResult; pub struct TelemetryInfoServiceImpl { meta_store: MetaStoreRef, + sql_meta_store: Option, } impl TelemetryInfoServiceImpl { - pub fn new(meta_store: MetaStoreRef) -> Self { - Self { meta_store } + pub fn new(meta_store: MetaStoreRef, sql_meta_store: Option) -> Self { + Self { + meta_store, + sql_meta_store, + } } - async fn get_tracking_id(&self) -> Option { - ClusterId::from_meta_store(&self.meta_store) + async fn get_tracking_id(&self) -> MetaResult> { + if let Some(store) = &self.sql_meta_store { + let cluster = Cluster::find().one(&store.conn).await?; + return Ok(cluster.map(|c| c.cluster_id.to_string().into())); + } + + Ok(ClusterId::from_meta_store(&self.meta_store) .await .ok() - .flatten() + .flatten()) } } @@ -42,7 +55,7 @@ impl TelemetryInfoService for TelemetryInfoServiceImpl { &self, _request: Request, ) -> Result, Status> { - match self.get_tracking_id().await { + match self.get_tracking_id().await? 
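Reviewer note on the system-params hunk above: the service now threads an optional SQL-backed controller next to the existing KV-backed manager and prefers the controller whenever it is present. A minimal sketch of that dispatch shape, assuming tokio is available; `KvParamsManager`, `SqlParamsController`, and the single field in `PbSystemParams` are hypothetical stand-ins, not the real types.

```rust
use std::sync::Arc;

// Hypothetical stand-ins for the KV-backed manager and the SQL-backed controller.
struct KvParamsManager;
struct SqlParamsController;

#[derive(Debug, Clone)]
struct PbSystemParams {
    barrier_interval_ms: u32,
}

impl KvParamsManager {
    async fn get_pb_params(&self) -> PbSystemParams {
        PbSystemParams { barrier_interval_ms: 1000 }
    }
}

impl SqlParamsController {
    async fn get_pb_params(&self) -> PbSystemParams {
        PbSystemParams { barrier_interval_ms: 500 }
    }
}

struct SystemParamsService {
    manager: Arc<KvParamsManager>,
    // `None` while the cluster still runs on the KV meta store only.
    controller: Option<Arc<SqlParamsController>>,
}

impl SystemParamsService {
    // Prefer the SQL-backed controller when it exists, otherwise fall back to the manager.
    async fn get_params(&self) -> PbSystemParams {
        if let Some(ctl) = &self.controller {
            ctl.get_pb_params().await
        } else {
            self.manager.get_pb_params().await
        }
    }
}

#[tokio::main]
async fn main() {
    let svc = SystemParamsService {
        manager: Arc::new(KvParamsManager),
        controller: None,
    };
    println!("{:?}", svc.get_params().await);
}
```

The same "controller if present, manager otherwise" branch appears in both `get_system_params` and `set_system_param` in the diff, so the migration can be rolled out without removing the old path yet.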
{ Some(tracking_id) => Ok(Response::new(TelemetryInfoResponse { tracking_id: Some(tracking_id.into()), })), diff --git a/src/meta/src/rpc/service/user_service.rs b/src/meta/service/src/user_service.rs similarity index 99% rename from src/meta/src/rpc/service/user_service.rs rename to src/meta/service/src/user_service.rs index e1b7cc27092d5..8c982521b112a 100644 --- a/src/meta/src/rpc/service/user_service.rs +++ b/src/meta/service/src/user_service.rs @@ -151,7 +151,7 @@ impl UserService for UserServiceImpl { let update_fields = req .update_fields .iter() - .map(|i| UpdateField::from_i32(*i).unwrap()) + .map(|i| UpdateField::try_from(*i).unwrap()) .collect_vec(); let user = req.get_user()?.clone(); let version = self diff --git a/src/meta/src/backup_restore/backup_manager.rs b/src/meta/src/backup_restore/backup_manager.rs index c280572c796d4..819ea02e36346 100644 --- a/src/meta/src/backup_restore/backup_manager.rs +++ b/src/meta/src/backup_restore/backup_manager.rs @@ -68,9 +68,11 @@ pub struct BackupManager { hummock_manager: HummockManagerRef, backup_store: ArcSwap<(BoxedMetaSnapshotStorage, StoreConfig)>, /// Tracks the running backup job. Concurrent jobs is not supported. - running_backup_job: tokio::sync::Mutex>, + running_job_handle: tokio::sync::Mutex>, metrics: BackupManagerMetrics, meta_metrics: Arc, + /// (job id, status, message) + latest_job_info: ArcSwap<(MetaBackupJobId, BackupJobStatus, String)>, } impl BackupManager { @@ -147,9 +149,10 @@ impl BackupManager { env, hummock_manager, backup_store: ArcSwap::from_pointee(backup_store), - running_backup_job: tokio::sync::Mutex::new(None), + running_job_handle: tokio::sync::Mutex::new(None), metrics: BackupManagerMetrics::default(), meta_metrics, + latest_job_info: ArcSwap::from_pointee((0, BackupJobStatus::NotFound, "".into())), } } @@ -181,7 +184,7 @@ impl BackupManager { /// Starts a backup job in background. It's non-blocking. /// Returns job id. pub async fn start_backup_job(self: &Arc) -> MetaResult { - let mut guard = self.running_backup_job.lock().await; + let mut guard = self.running_job_handle.lock().await; if let Some(job) = (*guard).as_ref() { bail!(format!( "concurrent backup job is not supported: existent job {}", @@ -213,6 +216,8 @@ impl BackupManager { .id_gen_manager() .generate::<{ IdCategory::Backup }>() .await?; + self.latest_job_info + .store(Arc::new((job_id, BackupJobStatus::Running, "".into()))); let hummock_version_safe_point = self.hummock_manager.register_safe_point().await; // Ideally `BackupWorker` and its r/w IO can be made external to meta node. 
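Side note on the `user_service` hunk: `UpdateField::from_i32(*i)` is replaced by the standard `TryFrom<i32>` conversion that generated protobuf enums expose. A self-contained illustration of the same pattern with a hand-written enum; the variants and error type here are made up for the sketch and are not the generated code.

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum UpdateField {
    Unspecified = 0,
    Super = 1,
    Login = 2,
}

impl TryFrom<i32> for UpdateField {
    type Error = i32;

    fn try_from(value: i32) -> Result<Self, Self::Error> {
        match value {
            0 => Ok(Self::Unspecified),
            1 => Ok(Self::Super),
            2 => Ok(Self::Login),
            other => Err(other), // unknown wire value
        }
    }
}

fn main() {
    // Mirrors `req.update_fields.iter().map(|i| UpdateField::try_from(*i).unwrap())`.
    let raw = vec![1i32, 2];
    let fields: Vec<UpdateField> = raw
        .iter()
        .map(|i| UpdateField::try_from(*i).unwrap())
        .collect();
    assert_eq!(fields, vec![UpdateField::Super, UpdateField::Login]);
}
```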
// The justification of keeping `BackupWorker` in meta node are: @@ -227,27 +232,12 @@ impl BackupManager { Ok(job_id) } - pub async fn get_backup_job_status( - &self, - job_id: MetaBackupJobId, - ) -> MetaResult { - if let Some(running_job) = self.running_backup_job.lock().await.as_ref() { - if running_job.job_id == job_id { - return Ok(BackupJobStatus::Running); - } - } - if self - .backup_store - .load() - .0 - .manifest() - .snapshot_metadata - .iter() - .any(|m| m.id == job_id) - { - return Ok(BackupJobStatus::Succeeded); + pub fn get_backup_job_status(&self, job_id: MetaBackupJobId) -> (BackupJobStatus, String) { + let last = self.latest_job_info.load(); + if last.0 == job_id { + return (last.1, last.2.clone()); } - Ok(BackupJobStatus::NotFound) + (BackupJobStatus::NotFound, "".into()) } async fn finish_backup_job(&self, job_id: MetaBackupJobId, job_result: BackupJobResult) { @@ -269,16 +259,24 @@ impl BackupManager { id: self.backup_store.load().0.manifest().manifest_id, }), ); + self.latest_job_info.store(Arc::new(( + job_id, + BackupJobStatus::Succeeded, + "".into(), + ))); } BackupJobResult::Failed(e) => { self.metrics.job_latency_failure.observe(job_latency); - tracing::warn!("failed backup job {}: {}", job_id, e); + let message = format!("failed backup job {}: {}", job_id, e); + tracing::warn!(message); + self.latest_job_info + .store(Arc::new((job_id, BackupJobStatus::Failed, message))); } } } async fn take_job_handle_by_job_id(&self, job_id: u64) -> Option { - let mut guard = self.running_backup_job.lock().await; + let mut guard = self.running_job_handle.lock().await; match (*guard).as_ref() { None => { return None; diff --git a/src/meta/src/backup_restore/meta_snapshot_builder.rs b/src/meta/src/backup_restore/meta_snapshot_builder.rs index e54c9f443f125..ef98c1158fd2a 100644 --- a/src/meta/src/backup_restore/meta_snapshot_builder.rs +++ b/src/meta/src/backup_restore/meta_snapshot_builder.rs @@ -191,7 +191,6 @@ mod tests { let v_ = v.clone(); async move { v_ } }; - hummock_version.insert(&meta_store).await.unwrap(); let err = builder .build(1, get_ckpt_builder(&hummock_version)) .await diff --git a/src/meta/src/backup_restore/restore.rs b/src/meta/src/backup_restore/restore.rs index 36e493686956d..ab4696e62f9bd 100644 --- a/src/meta/src/backup_restore/restore.rs +++ b/src/meta/src/backup_restore/restore.rs @@ -152,7 +152,6 @@ async fn restore_default_cf( async fn restore_metadata(meta_store: S, snapshot: MetaSnapshot) -> BackupResult<()> { restore_default_cf(&meta_store, &snapshot).await?; - restore_metadata_model(&meta_store, &[snapshot.metadata.hummock_version]).await?; restore_metadata_model(&meta_store, &[snapshot.metadata.version_stats]).await?; restore_metadata_model( &meta_store, @@ -290,7 +289,7 @@ mod tests { use itertools::Itertools; use risingwave_backup::meta_snapshot::{ClusterMetadata, MetaSnapshot}; use risingwave_common::config::{MetaBackend, SystemConfig}; - use risingwave_pb::hummock::HummockVersion; + use risingwave_pb::hummock::{HummockVersion, HummockVersionStats}; use risingwave_pb::meta::SystemParams; use crate::backup_restore::restore::restore_impl; @@ -331,8 +330,8 @@ mod tests { let backup_store = get_backup_store(opts.clone()).await.unwrap(); let nonempty_meta_store = get_meta_store(opts.clone()).await.unwrap(); dispatch_meta_store!(nonempty_meta_store.clone(), store, { - let hummock_version = HummockVersion::default(); - hummock_version.insert(&store).await.unwrap(); + let stats = HummockVersionStats::default(); + 
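On the backup-manager hunks above: `get_backup_job_status` no longer scans the manifest, it reads a `(job_id, status, message)` triple that writers replace wholesale and readers load lock-free. A minimal sketch of that shape using the `arc-swap` crate (the same API the diff uses: `from_pointee`, `load`, `store`); the job and status types are simplified stand-ins.

```rust
use std::sync::Arc;

use arc_swap::ArcSwap;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BackupJobStatus {
    NotFound,
    Running,
    Succeeded,
    Failed,
}

struct BackupManager {
    // (job id, status, message) of the most recent job.
    latest_job_info: ArcSwap<(u64, BackupJobStatus, String)>,
}

impl BackupManager {
    fn new() -> Self {
        Self {
            latest_job_info: ArcSwap::from_pointee((0, BackupJobStatus::NotFound, "".into())),
        }
    }

    fn start_job(&self, job_id: u64) {
        self.latest_job_info
            .store(Arc::new((job_id, BackupJobStatus::Running, "".into())));
    }

    fn finish_job(&self, job_id: u64, status: BackupJobStatus, message: String) {
        self.latest_job_info.store(Arc::new((job_id, status, message)));
    }

    // Constant-time lookup; anything other than the latest job reads as NotFound.
    fn get_status(&self, job_id: u64) -> (BackupJobStatus, String) {
        let last = self.latest_job_info.load();
        if last.0 == job_id {
            (last.1, last.2.clone())
        } else {
            (BackupJobStatus::NotFound, "".into())
        }
    }
}

fn main() {
    let mgr = BackupManager::new();
    mgr.start_job(42);
    assert_eq!(mgr.get_status(42).0, BackupJobStatus::Running);
    mgr.finish_job(42, BackupJobStatus::Succeeded, String::new());
    assert_eq!(mgr.get_status(42).0, BackupJobStatus::Succeeded);
}
```

One consequence of this design, visible in the diff as well: only the latest job's status survives, so older job ids simply report `NotFound`, and the failure message is carried alongside the status instead of only being logged.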
stats.insert(&store).await.unwrap(); }); let empty_meta_store = get_meta_store(opts.clone()).await.unwrap(); let system_param = get_system_params(); @@ -377,13 +376,6 @@ mod tests { .unwrap(); dispatch_meta_store!(empty_meta_store, store, { - let restored_hummock_version = HummockVersion::list(&store) - .await - .unwrap() - .into_iter() - .next() - .unwrap(); - assert_eq!(restored_hummock_version.id, 123); let restored_system_param = SystemParams::get(&store).await.unwrap().unwrap(); assert_eq!(restored_system_param, system_param); }); @@ -547,7 +539,6 @@ mod tests { .unwrap(); dispatch_meta_store!(empty_meta_store, store, { - assert!(HummockVersion::list(&store).await.unwrap().is_empty()); assert!(SystemParams::get(&store).await.unwrap().is_none()); }); } diff --git a/src/meta/src/barrier/command.rs b/src/meta/src/barrier/command.rs index 8d8076e56a233..bbe60c010b94b 100644 --- a/src/meta/src/barrier/command.rs +++ b/src/meta/src/barrier/command.rs @@ -36,7 +36,8 @@ use uuid::Uuid; use super::info::BarrierActorInfo; use super::trace::TracedEpoch; use crate::barrier::CommandChanges; -use crate::manager::{FragmentManagerRef, WorkerId}; +use crate::hummock::HummockManagerRef; +use crate::manager::{CatalogManagerRef, FragmentManagerRef, WorkerId}; use crate::model::{ActorId, DispatcherId, FragmentId, TableFragments}; use crate::stream::{build_actor_connector_splits, SourceManagerRef, SplitAssignment}; use crate::MetaResult; @@ -216,7 +217,9 @@ impl Command { /// [`CommandContext`] is used for generating barrier and doing post stuffs according to the given /// [`Command`]. pub struct CommandContext { - fragment_manager: FragmentManagerRef, + pub fragment_manager: FragmentManagerRef, + catalog_manager: CatalogManagerRef, + hummock_manager: HummockManagerRef, client_pool: StreamClientPoolRef, @@ -247,6 +250,8 @@ impl CommandContext { #[allow(clippy::too_many_arguments)] pub(super) fn new( fragment_manager: FragmentManagerRef, + catalog_manager: CatalogManagerRef, + hummock_manager: HummockManagerRef, client_pool: StreamClientPoolRef, info: BarrierActorInfo, prev_epoch: TracedEpoch, @@ -259,6 +264,8 @@ impl CommandContext { ) -> Self { Self { fragment_manager, + catalog_manager, + hummock_manager, client_pool, info: Arc::new(info), prev_epoch, @@ -663,7 +670,51 @@ impl CommandContext { Command::CancelStreamingJob(table_fragments) => { let node_actors = table_fragments.worker_actor_ids(); self.clean_up(node_actors).await?; - // Drop fragment info in meta store. + + // NOTE(kwannoel): At this point, meta has already registered the table ids. + // We should unregister them. + // This is required for background ddl, for foreground ddl this is a no-op. + // Foreground ddl is handled entirely by stream manager, so it will unregister + // the table ids on failure. + // On the other hand background ddl could be handled by barrier manager. + // It won't clean the tables on failure, + // since the failure could be recoverable. + // As such it needs to be handled here. + let table_id = table_fragments.table_id().table_id; + let mut table_ids = table_fragments.internal_table_ids(); + table_ids.push(table_id); + if let Err(e) = self.hummock_manager.unregister_table_ids(&table_ids).await { + tracing::warn!("Failed to unregister compaction group for {:#?}. They will be cleaned up on node restart. {:#?}", &table_ids, e); + } + + // NOTE(kwannoel): At this point, catalog manager has persisted the tables already. + // We need to cleanup the table state. So we can do it here. 
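On the `CancelStreamingJob` hunk that starts above and continues below: cancellation now also unregisters the job's state-table ids from the compaction group and cancels the persisted create-table procedure, logging and continuing on failure rather than aborting. A hedged sketch of that warn-and-continue sequence with stubbed managers; `HummockStub`, `CatalogStub`, and the `String` errors are placeholders, and tokio is assumed for the runtime.

```rust
// A sketch of best-effort cleanup: each step logs and continues on failure,
// assuming leftovers are reconciled on the next recovery. All names are stand-ins.
struct HummockStub;
struct CatalogStub;

impl HummockStub {
    async fn unregister_table_ids(&self, _ids: &[u32]) -> Result<(), String> {
        Ok(())
    }
}

impl CatalogStub {
    async fn cancel_create_table_procedure(
        &self,
        _table: u32,
        _internal: Vec<u32>,
    ) -> Result<(), String> {
        Ok(())
    }

    async fn assert_tables_deleted(&self, _ids: Vec<u32>) {}
}

async fn cancel_streaming_job_cleanup(
    hummock: &HummockStub,
    catalog: &CatalogStub,
    table_id: u32,
    internal_table_ids: Vec<u32>,
) {
    let mut all_ids = internal_table_ids.clone();
    all_ids.push(table_id);

    // 1. Unregister state tables from the compaction group; leftovers are cleaned on restart.
    if let Err(e) = hummock.unregister_table_ids(&all_ids).await {
        eprintln!("unregister_table_ids failed, will clean up on restart: {e}");
    }

    // 2. Cancel the persisted create-table procedure; if that fails, assert nothing dirty remains.
    if let Err(e) = catalog
        .cancel_create_table_procedure(table_id, internal_table_ids)
        .await
    {
        eprintln!("cancel_create_table_procedure failed: {e}");
        catalog.assert_tables_deleted(all_ids).await;
    }

    // 3. Dropping the table fragments themselves happens after this (omitted here).
}

#[tokio::main]
async fn main() {
    cancel_streaming_job_cleanup(&HummockStub, &CatalogStub, 7, vec![8, 9]).await;
}
```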
+ // The logic is the same as above, for hummock_manager.unregister_table_ids. + if let Err(e) = self + .catalog_manager + .cancel_create_table_procedure( + table_fragments.table_id().table_id, + table_fragments.internal_table_ids(), + ) + .await + { + let table_id = table_fragments.table_id().table_id; + tracing::warn!( + table_id, + reason=?e, + "cancel_create_table_procedure failed for CancelStreamingJob", + ); + // If failed, check that table is not in meta store. + // If any table is, just panic, let meta do bootstrap recovery. + // Otherwise our persisted state is dirty. + let mut table_ids = table_fragments.internal_table_ids(); + table_ids.push(table_id); + self.catalog_manager.assert_tables_deleted(table_ids).await; + } + + // We need to drop table fragments here, + // since this is not done in stream manager (foreground ddl) + // OR barrier manager (background ddl) self.fragment_manager .drop_table_fragments_vec(&HashSet::from_iter(std::iter::once( table_fragments.table_id(), @@ -791,24 +842,4 @@ impl CommandContext { Ok(()) } - - /// Do some stuffs before the barrier is `finish`ed. Only used for `CreateStreamingJob`. - pub async fn pre_finish(&self) -> MetaResult<()> { - #[allow(clippy::single_match)] - match &self.command { - Command::CreateStreamingJob { - table_fragments, .. - } => { - // Update the state of the table fragments from `Creating` to `Created`, so that the - // fragments can be scaled. - self.fragment_manager - .mark_table_fragments_created(table_fragments.table_id()) - .await?; - } - - _ => {} - } - - Ok(()) - } } diff --git a/src/meta/src/barrier/mod.rs b/src/meta/src/barrier/mod.rs index cd3ee0360009f..ed6ad289a5a68 100644 --- a/src/meta/src/barrier/mod.rs +++ b/src/meta/src/barrier/mod.rs @@ -50,7 +50,7 @@ use self::info::BarrierActorInfo; use self::notifier::Notifier; use self::progress::TrackingCommand; use crate::barrier::notifier::BarrierInfo; -use crate::barrier::progress::CreateMviewProgressTracker; +use crate::barrier::progress::{CreateMviewProgressTracker, TrackingJob}; use crate::barrier::BarrierEpochState::{Completed, InFlight}; use crate::hummock::HummockManagerRef; use crate::manager::sink_coordination::SinkCoordinatorManager; @@ -58,7 +58,7 @@ use crate::manager::{ CatalogManagerRef, ClusterManagerRef, FragmentManagerRef, LocalNotification, MetaSrvEnv, WorkerId, }; -use crate::model::{ActorId, BarrierManagerState}; +use crate::model::{ActorId, BarrierManagerState, TableFragments}; use crate::rpc::metrics::MetaMetrics; use crate::stream::SourceManagerRef; use crate::{MetaError, MetaResult}; @@ -75,6 +75,35 @@ pub use self::command::{Command, Reschedule}; pub use self::schedule::BarrierScheduler; pub use self::trace::TracedEpoch; +#[derive(Debug, Default, Clone, PartialEq, Eq)] +pub(crate) struct TableMap { + inner: HashMap, +} + +impl TableMap { + pub fn remove(&mut self, table_id: &TableId) -> Option { + self.inner.remove(table_id) + } +} + +impl From> for TableMap { + fn from(inner: HashMap) -> Self { + Self { inner } + } +} + +impl From> for HashMap { + fn from(table_map: TableMap) -> Self { + table_map.inner + } +} + +pub(crate) type TableActorMap = TableMap>; +pub(crate) type TableUpstreamMvCountMap = TableMap>; +pub(crate) type TableDefinitionMap = TableMap; +pub(crate) type TableNotifierMap = TableMap; +pub(crate) type TableFragmentMap = TableMap; + /// Status of barrier manager. enum BarrierManagerStatus { /// Barrier manager is starting. 
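On the new `TableMap<T>` introduced above: it is a thin newtype over `HashMap<TableId, T>` so that the recovery state (actors, upstream counts, definitions, notifiers, fragments) travels as distinct, self-describing aliases rather than five look-alike hash maps. A minimal reproduction of the wrapper and its conversions; `TableId` is reduced to a plain wrapper around `u32` for the sketch.

```rust
use std::collections::HashMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TableId(u32);

#[derive(Debug, Default, Clone, PartialEq, Eq)]
struct TableMap<T> {
    inner: HashMap<TableId, T>,
}

impl<T> TableMap<T> {
    fn remove(&mut self, table_id: &TableId) -> Option<T> {
        self.inner.remove(table_id)
    }
}

impl<T> From<HashMap<TableId, T>> for TableMap<T> {
    fn from(inner: HashMap<TableId, T>) -> Self {
        Self { inner }
    }
}

impl<T> From<TableMap<T>> for HashMap<TableId, T> {
    fn from(map: TableMap<T>) -> Self {
        map.inner
    }
}

// Aliases make function signatures self-describing, mirroring the ones in the diff.
type TableDefinitionMap = TableMap<String>;
type TableActorMap = TableMap<Vec<u32>>;

fn main() {
    let mut defs: TableDefinitionMap =
        HashMap::from([(TableId(1), "CREATE MATERIALIZED VIEW ...".to_string())]).into();
    assert!(defs.remove(&TableId(1)).is_some());
    let _actors: TableActorMap = TableMap::default();
}
```

The `From` conversions in both directions keep call sites cheap: builders keep working with plain `HashMap`s and only convert at the boundary of `CreateMviewProgressTracker::recover`.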
@@ -139,7 +168,7 @@ pub struct GlobalBarrierManager { cluster_manager: ClusterManagerRef, - pub(crate) catalog_manager: CatalogManagerRef, + pub catalog_manager: CatalogManagerRef, fragment_manager: FragmentManagerRef, @@ -151,7 +180,7 @@ pub struct GlobalBarrierManager { metrics: Arc, - pub(crate) env: MetaSrvEnv, + pub env: MetaSrvEnv, tracker: Mutex, } @@ -177,7 +206,7 @@ struct CheckpointControl { metrics: Arc, /// Get notified when we finished Create MV and collect a barrier(checkpoint = true) - finished_commands: Vec, + finished_commands: Vec, } impl CheckpointControl { @@ -194,8 +223,8 @@ impl CheckpointControl { } /// Stash a command to finish later. - fn stash_command_to_finish(&mut self, finished_command: TrackingCommand) { - self.finished_commands.push(finished_command); + fn stash_command_to_finish(&mut self, finished_job: TrackingJob) { + self.finished_commands.push(finished_job); } /// Finish stashed commands. If the current barrier is not a `checkpoint`, we will not finish @@ -205,31 +234,32 @@ impl CheckpointControl { async fn finish_commands(&mut self, checkpoint: bool) -> MetaResult { for command in self .finished_commands - .extract_if(|c| checkpoint || c.context.kind.is_barrier()) + .extract_if(|c| checkpoint || c.is_barrier()) { // The command is ready to finish. We can now call `pre_finish`. - command.context.pre_finish().await?; - command - .notifiers - .into_iter() - .for_each(Notifier::notify_finished); + command.pre_finish().await?; + command.notify_finished(); } Ok(!self.finished_commands.is_empty()) } - fn cancel_command(&mut self, cancelled_command: TrackingCommand) { - if let Some(index) = self.command_ctx_queue.iter().position(|x| { - x.command_ctx.prev_epoch.value() == cancelled_command.context.prev_epoch.value() - }) { - self.command_ctx_queue.remove(index); - self.remove_changes(cancelled_command.context.command.changes()); + fn cancel_command(&mut self, cancelled_job: TrackingJob) { + if let TrackingJob::New(cancelled_command) = cancelled_job { + if let Some(index) = self.command_ctx_queue.iter().position(|x| { + x.command_ctx.prev_epoch.value() == cancelled_command.context.prev_epoch.value() + }) { + self.command_ctx_queue.remove(index); + self.remove_changes(cancelled_command.context.command.changes()); + } + } else { + // Recovered jobs do not need to be cancelled since only `RUNNING` actors will get recovered. 
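On `finish_commands` above: the stash now holds `TrackingJob`s and finishes the ones eligible at this barrier (everything at a checkpoint; only barrier-kind jobs otherwise). The diff relies on the unstable `Vec::extract_if`; below is a stable-Rust sketch of the same drain-and-finish loop with deliberately simplified job variants (`is_barrier` as a plain bool, synchronous `finish`).

```rust
#[derive(Debug)]
enum TrackingJob {
    New { is_barrier: bool, name: String },
    Recovered { name: String },
}

impl TrackingJob {
    // Recovered jobs are always finishable; new jobs only if their barrier kind allows it.
    fn is_barrier(&self) -> bool {
        match self {
            TrackingJob::Recovered { .. } => true,
            TrackingJob::New { is_barrier, .. } => *is_barrier,
        }
    }

    fn finish(self) {
        match self {
            TrackingJob::New { name, .. } | TrackingJob::Recovered { name } => {
                println!("finished {name}");
            }
        }
    }
}

// Drain the jobs that may finish now, keep the rest stashed for the next checkpoint.
fn finish_commands(stash: &mut Vec<TrackingJob>, checkpoint: bool) -> bool {
    let mut kept = Vec::new();
    for job in stash.drain(..) {
        if checkpoint || job.is_barrier() {
            job.finish();
        } else {
            kept.push(job);
        }
    }
    *stash = kept;
    // Mirrors the original return value: whether anything is still pending.
    !stash.is_empty()
}

fn main() {
    let mut stash = vec![
        TrackingJob::New { is_barrier: false, name: "mv1".into() },
        TrackingJob::Recovered { name: "mv2".into() },
    ];
    // Not a checkpoint: only the barrier-eligible (recovered) job finishes.
    assert!(finish_commands(&mut stash, false));
    assert!(!finish_commands(&mut stash, true));
}
```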
} } fn cancel_stashed_command(&mut self, id: TableId) { self.finished_commands - .retain(|x| x.context.table_to_create() != Some(id)); + .retain(|x| x.table_to_create() != Some(id)); } /// Before resolving the actors to be sent or collected, we should first record the newly @@ -596,7 +626,7 @@ impl GlobalBarrierManager { let paused = self.take_pause_on_bootstrap().await.unwrap_or(false); let paused_reason = paused.then_some(PausedReason::Manual); - self.recovery(prev_epoch, paused_reason) + self.recovery(prev_epoch, paused_reason, true) .instrument(span) .await }; @@ -688,13 +718,13 @@ impl GlobalBarrierManager { }; // Tracing related stuff - prev_epoch.span().in_scope(|| { - tracing::info!(target: "rw_tracing", epoch = curr_epoch.value().0, "new barrier enqueued"); - }); + tracing::info!(target: "rw_tracing", parent: prev_epoch.span(), epoch = curr_epoch.value().0, "new barrier enqueued"); span.record("epoch", curr_epoch.value().0); let command_ctx = Arc::new(CommandContext::new( self.fragment_manager.clone(), + self.catalog_manager.clone(), + self.hummock_manager.clone(), self.env.stream_client_pool_ref(), info, prev_epoch.clone(), @@ -798,9 +828,9 @@ impl GlobalBarrierManager { actor_ids_to_send, actor_ids_to_collect, }; - tracing::trace!( + tracing::debug!( target: "events::meta::barrier::inject_barrier", - "inject barrier request: {:?}", request + ?request, "inject barrier request" ); // This RPC returns only if this worker node has injected this barrier. @@ -840,9 +870,9 @@ impl GlobalBarrierManager { prev_epoch, tracing_context, }; - tracing::trace!( + tracing::debug!( target: "events::meta::barrier::barrier_complete", - "barrier complete request: {:?}", request + ?request, "barrier complete" ); // This RPC returns only if this worker node has collected this barrier. @@ -912,6 +942,7 @@ impl GlobalBarrierManager { let fail_nodes = complete_nodes .drain(index..) .chain(checkpoint_control.barrier_failed().into_iter()); + tracing::warn!("Failed to commit epoch {}: {:?}", prev_epoch, err); self.failure_recovery(err, fail_nodes, state, checkpoint_control) .await; } @@ -939,11 +970,7 @@ impl GlobalBarrierManager { } if self.enable_recovery { - // If failed, enter recovery mode. self.set_status(BarrierManagerStatus::Recovering).await; - let mut tracker = self.tracker.lock().await; - *tracker = CreateMviewProgressTracker::new(); - let latest_snapshot = self.hummock_manager.latest_snapshot(); let prev_epoch = TracedEpoch::new(latest_snapshot.committed_epoch.into()); // we can only recovery from the committed epoch let span = tracing::info_span!( @@ -952,7 +979,12 @@ impl GlobalBarrierManager { prev_epoch = prev_epoch.value().0 ); - *state = self.recovery(prev_epoch, None).instrument(span).await; + // No need to clean dirty tables for barrier recovery, + // The foreground stream job should cleanup their own tables. + *state = self + .recovery(prev_epoch, None, false) + .instrument(span) + .await; self.set_status(BarrierManagerStatus::Running).await; } else { panic!("failed to execute barrier: {:?}", err); diff --git a/src/meta/src/barrier/notifier.rs b/src/meta/src/barrier/notifier.rs index 88acd9cd3dd7a..b28c5b01d53d9 100644 --- a/src/meta/src/barrier/notifier.rs +++ b/src/meta/src/barrier/notifier.rs @@ -30,7 +30,7 @@ pub struct BarrierInfo { /// Used for notifying the status of a scheduled command/barrier. #[derive(Debug, Default)] -pub(super) struct Notifier { +pub(crate) struct Notifier { /// Get notified when scheduled barrier is injected to compute nodes. 
pub injected: Option>, diff --git a/src/meta/src/barrier/progress.rs b/src/meta/src/barrier/progress.rs index d484e471f4a31..22cd6f8d9e200 100644 --- a/src/meta/src/barrier/progress.rs +++ b/src/meta/src/barrier/progress.rs @@ -25,17 +25,21 @@ use risingwave_pb::stream_service::barrier_complete_response::CreateMviewProgres use super::command::CommandContext; use super::notifier::Notifier; -use crate::barrier::Command; -use crate::model::ActorId; +use crate::barrier::{ + Command, TableActorMap, TableDefinitionMap, TableFragmentMap, TableNotifierMap, + TableUpstreamMvCountMap, +}; +use crate::manager::{FragmentManager, FragmentManagerRef}; +use crate::model::{ActorId, TableFragments}; +use crate::MetaResult; -type CreateMviewEpoch = Epoch; type ConsumedRows = u64; #[derive(Clone, Copy, Debug)] -enum ChainState { +pub enum ChainState { Init, ConsumingUpstream(Epoch, ConsumedRows), - Done, + Done(ConsumedRows), } /// Progress of all actors containing chain nodes while creating mview. @@ -45,10 +49,9 @@ struct Progress { done_count: usize, - /// Creating mv id. - creating_mv_id: TableId, - - /// Upstream mv count. Keep track of how many times each upstream MV appears. + /// Upstream mv count. + /// Keep track of how many times each upstream MV + /// appears in this stream job. upstream_mv_count: HashMap, /// Upstream mvs total key count. @@ -65,7 +68,6 @@ impl Progress { /// Create a [`Progress`] for some creating mview, with all `actors` containing the chain nodes. fn new( actors: impl IntoIterator, - creating_mv_id: TableId, upstream_mv_count: HashMap, upstream_total_key_count: u64, definition: String, @@ -79,7 +81,6 @@ impl Progress { Self { states, done_count: 0, - creating_mv_id, upstream_mv_count, upstream_total_key_count, consumed_rows: 0, @@ -93,18 +94,17 @@ impl Progress { match self.states.remove(&actor).unwrap() { ChainState::Init => {} ChainState::ConsumingUpstream(_, old_consumed_rows) => { - if !matches!(new_state, ChainState::Done) { - self.consumed_rows -= old_consumed_rows; - } + self.consumed_rows -= old_consumed_rows; } - ChainState::Done => panic!("should not report done multiple times"), + ChainState::Done(_) => panic!("should not report done multiple times"), }; match &new_state { ChainState::Init => {} ChainState::ConsumingUpstream(_, new_consumed_rows) => { self.consumed_rows += new_consumed_rows; } - ChainState::Done => { + ChainState::Done(new_consumed_rows) => { + self.consumed_rows += new_consumed_rows; self.done_count += 1; } }; @@ -140,6 +140,80 @@ impl Progress { } } +/// There are 2 kinds of `TrackingJobs`: +/// 1. `New`. This refers to the "New" type of tracking job. +/// It is instantiated and managed by the stream manager. +/// On recovery, the stream manager will stop managing the job. +/// 2. `Recovered`. This refers to the "Recovered" type of tracking job. +/// On recovery, the barrier manager will recover and start managing the job. 
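On the `ChainState::Done(ConsumedRows)` change above: carrying the final row count in `Done` keeps `consumed_rows` correct when an actor's last report flips it to done, instead of discarding that report. A compact model of the bookkeeping; the struct is trimmed down (no epoch in `ConsumingUpstream`, no per-MV weighting), and `calculate_progress` is an illustrative placeholder rather than the real formula.

```rust
use std::collections::HashMap;

type ActorId = u32;
type ConsumedRows = u64;

#[derive(Clone, Copy, Debug)]
enum ChainState {
    Init,
    ConsumingUpstream(ConsumedRows),
    Done(ConsumedRows),
}

struct Progress {
    states: HashMap<ActorId, ChainState>,
    done_count: usize,
    consumed_rows: u64,
    upstream_total_key_count: u64,
}

impl Progress {
    fn update(&mut self, actor: ActorId, new_state: ChainState) {
        // Subtract the actor's previous contribution...
        match self.states.remove(&actor).unwrap() {
            ChainState::Init => {}
            ChainState::ConsumingUpstream(old) => self.consumed_rows -= old,
            ChainState::Done(_) => panic!("should not report done multiple times"),
        }
        // ...then add the new one, counting `Done` rows as well.
        match new_state {
            ChainState::Init => {}
            ChainState::ConsumingUpstream(new) => self.consumed_rows += new,
            ChainState::Done(new) => {
                self.consumed_rows += new;
                self.done_count += 1;
            }
        }
        self.states.insert(actor, new_state);
    }

    // Illustrative only: the real calculation also re-weights upstream MV key counts.
    fn calculate_progress(&self) -> f64 {
        if self.upstream_total_key_count == 0 {
            return 1.0;
        }
        (self.consumed_rows as f64 / self.upstream_total_key_count as f64).min(1.0)
    }
}

fn main() {
    let mut p = Progress {
        states: HashMap::from([(1, ChainState::Init)]),
        done_count: 0,
        consumed_rows: 0,
        upstream_total_key_count: 100,
    };
    p.update(1, ChainState::ConsumingUpstream(40));
    p.update(1, ChainState::Done(100));
    assert_eq!(p.consumed_rows, 100);
    assert_eq!(p.done_count, 1);
    assert!(p.calculate_progress() >= 1.0);
}
```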
+pub enum TrackingJob { + New(TrackingCommand), + Recovered(RecoveredTrackingJob), +} + +impl TrackingJob { + fn fragment_manager(&self) -> &FragmentManager { + match self { + TrackingJob::New(command) => command.context.fragment_manager.as_ref(), + TrackingJob::Recovered(recovered) => recovered.fragment_manager.as_ref(), + } + } + + pub(crate) fn is_barrier(&self) -> bool { + match self { + TrackingJob::Recovered(_) => true, + TrackingJob::New(command) => command.context.kind.is_barrier(), + } + } + + pub(crate) async fn pre_finish(&self) -> MetaResult<()> { + let table_fragments = match &self { + TrackingJob::New(command) => match &command.context.command { + Command::CreateStreamingJob { + table_fragments, .. + } => Some(table_fragments), + _ => None, + }, + TrackingJob::Recovered(recovered) => Some(&recovered.fragments), + }; + // Update the state of the table fragments from `Creating` to `Created`, so that the + // fragments can be scaled. + if let Some(table_fragments) = table_fragments { + self.fragment_manager() + .mark_table_fragments_created(table_fragments.table_id()) + .await?; + } + Ok(()) + } + + pub(crate) fn notify_finished(self) { + match self { + TrackingJob::New(command) => { + command + .notifiers + .into_iter() + .for_each(Notifier::notify_finished); + } + TrackingJob::Recovered(recovered) => { + recovered.finished.notify_finished(); + } + } + } + + pub(crate) fn table_to_create(&self) -> Option { + match self { + TrackingJob::New(command) => command.context.table_to_create(), + TrackingJob::Recovered(recovered) => Some(recovered.fragments.table_id()), + } + } +} + +pub struct RecoveredTrackingJob { + pub fragments: TableFragments, + pub finished: Notifier, + pub fragment_manager: FragmentManagerRef, +} + /// The command tracking by the [`CreateMviewProgressTracker`]. pub(super) struct TrackingCommand { /// The context of the command. @@ -151,15 +225,80 @@ pub(super) struct TrackingCommand { /// Track the progress of all creating mviews. When creation is done, `notify_finished` will be /// called on registered notifiers. +/// +/// Tracking is done as follows: +/// 1. We identify a `StreamJob` by its `TableId` of its `Materialized` table. +/// 2. For each stream job, there are several actors which run its tasks. +/// 3. With `progress_map` we can use the ID of the `StreamJob` to view its progress. +/// 4. With `actor_map` we can use an actor's `ActorId` to find the ID of the `StreamJob`. pub(super) struct CreateMviewProgressTracker { - /// Progress of the create-mview DDL indicated by the epoch. - progress_map: HashMap, + /// Progress of the create-mview DDL indicated by the TableId. + progress_map: HashMap, /// Find the epoch of the create-mview DDL by the actor containing the chain node. - actor_map: HashMap, + actor_map: HashMap, } impl CreateMviewProgressTracker { + /// This step recovers state from the meta side: + /// 1. `Tables`. + /// 2. `TableFragments`. + /// + /// Other state are persisted by the `BackfillExecutor`, such as: + /// 1. `CreateMviewProgress`. + /// 2. `Backfill` position. 
+ pub fn recover( + table_map: TableActorMap, + mut upstream_mv_counts: TableUpstreamMvCountMap, + mut definitions: TableDefinitionMap, + version_stats: HummockVersionStats, + mut finished_notifiers: TableNotifierMap, + mut table_fragment_map: TableFragmentMap, + fragment_manager: FragmentManagerRef, + ) -> Self { + let mut actor_map = HashMap::new(); + let mut progress_map = HashMap::new(); + let table_map: HashMap<_, Vec> = table_map.into(); + for (creating_table_id, actors) in table_map { + // 1. Recover `ChainState` in the tracker. + let mut states = HashMap::new(); + for actor in actors { + actor_map.insert(actor, creating_table_id); + states.insert(actor, ChainState::ConsumingUpstream(Epoch(0), 0)); + } + let upstream_mv_count = upstream_mv_counts.remove(&creating_table_id).unwrap(); + let upstream_total_key_count = upstream_mv_count + .iter() + .map(|(upstream_mv, count)| { + *count as u64 + * version_stats + .table_stats + .get(&upstream_mv.table_id) + .map_or(0, |stat| stat.total_key_count as u64) + }) + .sum(); + let definition = definitions.remove(&creating_table_id).unwrap(); + let progress = Progress { + states, + done_count: 0, // Fill only after first barrier pass + upstream_mv_count, + upstream_total_key_count, + consumed_rows: 0, // Fill only after first barrier pass + definition, + }; + let tracking_job = TrackingJob::Recovered(RecoveredTrackingJob { + fragments: table_fragment_map.remove(&creating_table_id).unwrap(), + finished: finished_notifiers.remove(&creating_table_id).unwrap(), + fragment_manager: fragment_manager.clone(), + }); + progress_map.insert(creating_table_id, (progress, tracking_job)); + } + Self { + progress_map, + actor_map, + } + } + pub fn new() -> Self { Self { progress_map: Default::default(), @@ -169,9 +308,9 @@ impl CreateMviewProgressTracker { pub fn gen_ddl_progress(&self) -> Vec { self.progress_map - .values() - .map(|(x, _)| DdlProgress { - id: x.creating_mv_id.table_id as u64, + .iter() + .map(|(table_id, (x, _))| DdlProgress { + id: table_id.table_id as u64, statement: x.definition.clone(), progress: format!("{:.2}%", x.calculate_progress() * 100.0), }) @@ -184,7 +323,7 @@ impl CreateMviewProgressTracker { pub fn find_cancelled_command( &mut self, actors_to_cancel: HashSet, - ) -> Option { + ) -> Option { let epochs = actors_to_cancel .into_iter() .map(|actor_id| self.actor_map.get(&actor_id)) @@ -206,16 +345,11 @@ impl CreateMviewProgressTracker { &mut self, command: TrackingCommand, version_stats: &HummockVersionStats, - ) -> Option { + ) -> Option { let actors = command.context.actors_to_track(); if actors.is_empty() { // The command can be finished immediately. 
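The recovery path above re-derives each job's `upstream_total_key_count` by weighting every upstream MV's total key count (from the Hummock version stats) by how many times the job references it. The weighted sum in isolation, with simplified stand-ins for the stats protos:

```rust
use std::collections::HashMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct TableId(u32);

// Simplified stand-in for `HummockVersionStats.table_stats`.
struct VersionStats {
    table_stats: HashMap<u32, u64>, // table id -> total_key_count
}

fn upstream_total_key_count(
    upstream_mv_count: &HashMap<TableId, usize>,
    version_stats: &VersionStats,
) -> u64 {
    upstream_mv_count
        .iter()
        .map(|(upstream_mv, count)| {
            *count as u64
                * version_stats
                    .table_stats
                    .get(&upstream_mv.0)
                    .copied()
                    .unwrap_or(0)
        })
        .sum()
}

fn main() {
    // A job that scans MV 1 twice and MV 2 once.
    let counts = HashMap::from([(TableId(1), 2usize), (TableId(2), 1usize)]);
    let stats = VersionStats {
        table_stats: HashMap::from([(1, 1_000), (2, 500)]),
    };
    // 2 * 1000 + 1 * 500
    assert_eq!(upstream_total_key_count(&counts, &stats), 2_500);
}
```

This is also why the diff asserts `*count != 0` when re-computing the total during progress updates: a zero weight would silently hide an upstream MV from the denominator.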
- return Some(command); - } - - let ddl_epoch = command.context.curr_epoch.value(); - for &actor in &actors { - self.actor_map.insert(actor, ddl_epoch); + return Some(TrackingJob::New(command)); } let (creating_mv_id, upstream_mv_count, upstream_total_key_count, definition) = @@ -259,14 +393,19 @@ impl CreateMviewProgressTracker { unreachable!("Must be CreateStreamingJob."); }; + for &actor in &actors { + self.actor_map.insert(actor, creating_mv_id); + } + let progress = Progress::new( actors, - creating_mv_id, upstream_mv_count, upstream_total_key_count, definition, ); - let old = self.progress_map.insert(ddl_epoch, (progress, command)); + let old = self + .progress_map + .insert(creating_mv_id, (progress, TrackingJob::New(command))); assert!(old.is_none()); None } @@ -278,22 +417,29 @@ impl CreateMviewProgressTracker { &mut self, progress: &CreateMviewProgress, version_stats: &HummockVersionStats, - ) -> Option { + ) -> Option { let actor = progress.chain_actor_id; - let Some(epoch) = self.actor_map.get(&actor).copied() else { - panic!( - "no tracked progress for actor {}, is it already finished?", + let Some(table_id) = self.actor_map.get(&actor).copied() else { + // On restart, backfill will ALWAYS notify CreateMviewProgressTracker, + // even if backfill is finished on recovery. + // This is because we don't know if only this actor is finished, + // OR the entire stream job is finished. + // For the first case, we must notify meta. + // For the second case, we can still notify meta, but ignore it here. + tracing::info!( + "no tracked progress for actor {}, the stream job could already be finished", actor ); + return None; }; let new_state = if progress.done { - ChainState::Done + ChainState::Done(progress.consumed_rows) } else { ChainState::ConsumingUpstream(progress.consumed_epoch.into(), progress.consumed_rows) }; - match self.progress_map.entry(epoch) { + match self.progress_map.entry(table_id) { Entry::Occupied(mut o) => { let progress = &mut o.get_mut().0; @@ -301,6 +447,7 @@ impl CreateMviewProgressTracker { .upstream_mv_count .iter() .map(|(upstream_mv, count)| { + assert_ne!(*count, 0); *count as u64 * version_stats .table_stats @@ -312,7 +459,10 @@ impl CreateMviewProgressTracker { progress.update(actor, new_state, upstream_total_key_count); if progress.is_done() { - tracing::debug!("all actors done for creating mview with epoch {}!", epoch); + tracing::debug!( + "all actors done for creating mview with table_id {}!", + table_id + ); // Clean-up the mapping from actors to DDL epoch. for actor in o.get().0.actors() { diff --git a/src/meta/src/barrier/recovery.rs b/src/meta/src/barrier/recovery.rs index bce901cd6f459..21197a8df98d4 100644 --- a/src/meta/src/barrier/recovery.rs +++ b/src/meta/src/barrier/recovery.rs @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::{BTreeSet, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::sync::Arc; use std::time::{Duration, Instant}; +use anyhow::anyhow; use futures::future::try_join_all; use itertools::Itertools; +use risingwave_common::catalog::TableId; use risingwave_pb::common::ActorInfo; use risingwave_pb::meta::PausedReason; use risingwave_pb::stream_plan::barrier::{BarrierKind, Mutation}; @@ -25,6 +27,7 @@ use risingwave_pb::stream_plan::AddMutation; use risingwave_pb::stream_service::{ BroadcastActorInfoTableRequest, BuildActorsRequest, ForceStopActorsRequest, UpdateActorsRequest, }; +use tokio::sync::oneshot; use tokio_retry::strategy::{jitter, ExponentialBackoff}; use tracing::{debug, warn, Instrument}; use uuid::Uuid; @@ -32,6 +35,8 @@ use uuid::Uuid; use super::TracedEpoch; use crate::barrier::command::CommandContext; use crate::barrier::info::BarrierActorInfo; +use crate::barrier::notifier::Notifier; +use crate::barrier::progress::CreateMviewProgressTracker; use crate::barrier::{CheckpointControl, Command, GlobalBarrierManager}; use crate::manager::WorkerId; use crate::model::{BarrierManagerState, MigrationPlan}; @@ -60,22 +65,30 @@ impl GlobalBarrierManager { .await } + /// Please look at `CatalogManager::clean_dirty_tables` for more details. + /// This should only be called for bootstrap recovery. + async fn clean_dirty_tables(&self) -> MetaResult<()> { + let fragment_manager = self.fragment_manager.clone(); + self.catalog_manager + .clean_dirty_tables(fragment_manager) + .await?; + Ok(()) + } + /// Clean up all dirty streaming jobs. async fn clean_dirty_fragments(&self) -> MetaResult<()> { let stream_job_ids = self.catalog_manager.list_stream_job_ids().await?; let to_drop_table_fragments = self .fragment_manager - .list_dirty_table_fragments(|tf| { - !stream_job_ids.contains(&tf.table_id().table_id) || !tf.is_created() - }) + .list_dirty_table_fragments(|tf| !stream_job_ids.contains(&tf.table_id().table_id)) .await; - let to_drop_streaming_ids = to_drop_table_fragments .iter() .map(|t| t.table_id()) .collect(); debug!("clean dirty table fragments: {:?}", to_drop_streaming_ids); + self.fragment_manager .drop_table_fragments_vec(&to_drop_streaming_ids) .await?; @@ -86,7 +99,7 @@ impl GlobalBarrierManager { &to_drop_table_fragments ) .await.inspect_err(|e| - tracing::warn!( + warn!( "Failed to unregister compaction group for {:#?}. They will be cleaned up on node restart. 
{:#?}", to_drop_table_fragments, e) @@ -100,6 +113,101 @@ impl GlobalBarrierManager { Ok(()) } + async fn recover_background_mv_progress(&self) -> MetaResult<()> { + let creating_tables = self.catalog_manager.list_creating_background_mvs().await; + let creating_table_ids = creating_tables + .iter() + .map(|t| TableId { table_id: t.id }) + .collect_vec(); + + let mut senders = HashMap::new(); + let mut receivers = Vec::new(); + for table_id in creating_table_ids.iter().copied() { + let (finished_tx, finished_rx) = oneshot::channel(); + senders.insert( + table_id, + Notifier { + finished: Some(finished_tx), + ..Default::default() + }, + ); + + let fragments = self + .fragment_manager + .select_table_fragments_by_table_id(&table_id) + .await?; + let internal_table_ids = fragments.internal_table_ids(); + let internal_tables = self.catalog_manager.get_tables(&internal_table_ids).await; + let table = self.catalog_manager.get_tables(&[table_id.table_id]).await; + assert_eq!(table.len(), 1, "should only have 1 materialized table"); + let table = table.into_iter().next().unwrap(); + receivers.push((table, internal_tables, finished_rx)); + } + + let table_map = self + .fragment_manager + .get_table_id_actor_mapping(&creating_table_ids) + .await; + let table_fragment_map = self + .fragment_manager + .get_table_id_table_fragment_map(&creating_table_ids) + .await?; + let upstream_mv_counts = self + .fragment_manager + .get_upstream_relation_counts(&creating_table_ids) + .await; + let definitions: HashMap<_, _> = creating_tables + .into_iter() + .map(|t| (TableId { table_id: t.id }, t.definition)) + .collect(); + let version_stats = self.hummock_manager.get_version_stats().await; + // If failed, enter recovery mode. + { + let mut tracker = self.tracker.lock().await; + *tracker = CreateMviewProgressTracker::recover( + table_map.into(), + upstream_mv_counts.into(), + definitions.into(), + version_stats, + senders.into(), + table_fragment_map.into(), + self.fragment_manager.clone(), + ); + } + for (table, internal_tables, finished) in receivers { + let catalog_manager = self.catalog_manager.clone(); + tokio::spawn(async move { + let res: MetaResult<()> = try { + tracing::debug!("recovering stream job {}", table.id); + finished + .await + .map_err(|e| anyhow!("failed to finish command: {}", e))?; + + tracing::debug!("finished stream job {}", table.id); + // Once notified that job is finished we need to notify frontend. + // and mark catalog as created and commit to meta. + // both of these are done by catalog manager. + catalog_manager + .finish_create_table_procedure(internal_tables, table.clone()) + .await?; + tracing::debug!("notified frontend for stream job {}", table.id); + }; + if let Err(e) = res.as_ref() { + tracing::error!( + "stream job {} interrupted, will retry after recovery: {e:?}", + table.id + ); + // NOTE(kwannoel): We should not cleanup stream jobs, + // we don't know if it's just due to CN killed, + // or the job has actually failed. + // Users have to manually cancel the stream jobs, + // if they want to clean it. + } + }); + } + Ok(()) + } + /// Recovery the whole cluster from the latest epoch. /// /// If `paused_reason` is `Some`, all data sources (including connectors and DMLs) will be @@ -107,10 +215,11 @@ impl GlobalBarrierManager { /// the cluster or `risectl` command. Used for debugging purpose. /// /// Returns the new state of the barrier manager after recovery. 
- pub(crate) async fn recovery( + pub async fn recovery( &self, prev_epoch: TracedEpoch, paused_reason: Option, + bootstrap_recovery: bool, ) -> BarrierManagerState { // Mark blocked and abort buffered schedules, they might be dirty already. self.scheduled_barriers @@ -118,12 +227,25 @@ impl GlobalBarrierManager { .await; tracing::info!("recovery start!"); + if bootstrap_recovery { + self.clean_dirty_tables() + .await + .expect("clean dirty tables should not fail"); + } self.clean_dirty_fragments() .await .expect("clean dirty fragments"); + self.sink_manager.reset().await; let retry_strategy = Self::get_retry_strategy(); + // Mview progress needs to be recovered. + tracing::info!("recovering mview progress"); + self.recover_background_mv_progress() + .await + .expect("recover mview progress should not fail"); + tracing::info!("recovered mview progress"); + // We take retry into consideration because this is the latency user sees for a cluster to // get recovered. let recovery_timer = self.metrics.recovery_latency.start_timer(); @@ -172,6 +294,8 @@ impl GlobalBarrierManager { // Inject the `Initial` barrier to initialize all executors. let command_ctx = Arc::new(CommandContext::new( self.fragment_manager.clone(), + self.catalog_manager.clone(), + self.hummock_manager.clone(), self.env.stream_client_pool_ref(), info, prev_epoch.clone(), @@ -204,7 +328,7 @@ impl GlobalBarrierManager { warn!(err = ?err, "post_collect failed"); Err(err) } else { - Ok((new_epoch, response)) + Ok((new_epoch.clone(), response)) } } Err(err) => { diff --git a/src/meta/src/barrier/schedule.rs b/src/meta/src/barrier/schedule.rs index 7c9fefd15606b..c4718d97d40f6 100644 --- a/src/meta/src/barrier/schedule.rs +++ b/src/meta/src/barrier/schedule.rs @@ -393,7 +393,7 @@ impl ScheduledBarriers { } /// Make the `checkpoint` of the next barrier must be true - pub(crate) fn force_checkpoint_in_next_barrier(&self) { + pub fn force_checkpoint_in_next_barrier(&self) { self.inner.force_checkpoint.store(true, Ordering::Relaxed) } diff --git a/src/meta/src/controller/catalog.rs b/src/meta/src/controller/catalog.rs new file mode 100644 index 0000000000000..cb37307384aa2 --- /dev/null +++ b/src/meta/src/controller/catalog.rs @@ -0,0 +1,887 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::iter; + +use itertools::Itertools; +use risingwave_common::bail; +use risingwave_common::catalog::{DEFAULT_SCHEMA_NAME, SYSTEM_SCHEMAS}; +use risingwave_pb::catalog::{ + PbConnection, PbDatabase, PbFunction, PbIndex, PbSchema, PbSink, PbSource, PbTable, PbView, +}; +use risingwave_pb::meta::relation::PbRelationInfo; +use risingwave_pb::meta::subscribe_response::{ + Info as NotificationInfo, Operation as NotificationOperation, +}; +use risingwave_pb::meta::{PbRelation, PbRelationGroup}; +use sea_orm::{ + ActiveModelTrait, ActiveValue, ColumnTrait, DatabaseConnection, DatabaseTransaction, + EntityTrait, QueryFilter, QuerySelect, TransactionTrait, +}; +use tokio::sync::RwLock; + +use crate::controller::rename::{alter_relation_rename, alter_relation_rename_refs}; +use crate::controller::utils::{ + check_connection_name_duplicate, check_function_signature_duplicate, + check_relation_name_duplicate, check_schema_name_duplicate, ensure_object_id, + ensure_object_not_refer, ensure_schema_empty, ensure_user_id, get_referring_objects, + get_referring_objects_cascade, PartialObject, +}; +use crate::controller::ObjectModel; +use crate::manager::{MetaSrvEnv, NotificationVersion}; +use crate::model_v2::object::ObjectType; +use crate::model_v2::prelude::*; +use crate::model_v2::{ + connection, database, function, index, object, object_dependency, schema, sink, source, table, + view, ConnectionId, DatabaseId, FunctionId, ObjectId, PrivateLinkService, SchemaId, SourceId, + TableId, UserId, +}; +use crate::rpc::ddl_controller::DropMode; +use crate::{MetaError, MetaResult}; + +/// `CatalogController` is the controller for catalog related operations, including database, schema, table, view, etc. +pub struct CatalogController { + env: MetaSrvEnv, + inner: RwLock, +} + +#[derive(Clone, Default)] +pub struct ReleaseContext { + streaming_jobs: Vec, + source_ids: Vec, + connections: Vec, +} + +impl CatalogController { + pub fn new(env: MetaSrvEnv) -> MetaResult { + let meta_store = env + .sql_meta_store() + .expect("sql meta store is not initialized"); + Ok(Self { + env, + inner: RwLock::new(CatalogControllerInner { + db: meta_store.conn, + }), + }) + } +} + +struct CatalogControllerInner { + db: DatabaseConnection, +} + +impl CatalogController { + async fn notify_frontend( + &self, + operation: NotificationOperation, + info: NotificationInfo, + ) -> NotificationVersion { + self.env + .notification_manager() + .notify_frontend(operation, info) + .await + } + + async fn notify_frontend_relation_info( + &self, + operation: NotificationOperation, + relation_info: PbRelationInfo, + ) -> NotificationVersion { + self.env + .notification_manager() + .notify_frontend_relation_info(operation, relation_info) + .await + } +} + +impl CatalogController { + pub fn snapshot(&self) -> MetaResult<()> { + todo!("snapshot") + } + + async fn create_object( + txn: &DatabaseTransaction, + obj_type: ObjectType, + owner_id: UserId, + database_id: Option, + schema_id: Option, + ) -> MetaResult { + let active_db = object::ActiveModel { + oid: Default::default(), + obj_type: ActiveValue::Set(obj_type), + owner_id: ActiveValue::Set(owner_id), + schema_id: ActiveValue::Set(schema_id), + database_id: ActiveValue::Set(database_id), + initialized_at: Default::default(), + created_at: Default::default(), + }; + Ok(active_db.insert(txn).await?) 
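On `CatalogController::create_object` above: every catalog entity is backed by a generic row in the `object` table (allocating the id, type, owner, and parents) plus a typed row that reuses that id, both written in one transaction. A plain-Rust sketch of that two-step shape with in-memory stand-ins; the real code goes through sea-orm `ActiveModel`s and a database transaction instead of the toy `Txn` below.

```rust
// A toy "transaction" that allocates an object row first, then the typed row with the same id.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ObjectType {
    Database,
    Schema,
}

#[derive(Debug)]
struct ObjectRow {
    oid: u32,
    obj_type: ObjectType,
    owner_id: u32,
    database_id: Option<u32>,
}

#[derive(Debug)]
struct SchemaRow {
    schema_id: u32, // same value as the object oid
    name: String,
}

#[derive(Default)]
struct Txn {
    next_oid: u32,
    objects: Vec<ObjectRow>,
    schemas: Vec<SchemaRow>,
}

impl Txn {
    fn create_object(&mut self, obj_type: ObjectType, owner_id: u32, database_id: Option<u32>) -> u32 {
        self.next_oid += 1;
        let oid = self.next_oid;
        self.objects.push(ObjectRow { oid, obj_type, owner_id, database_id });
        oid
    }

    fn create_schema(&mut self, owner_id: u32, database_id: u32, name: &str) -> u32 {
        // Step 1: generic object row. Step 2: typed row keyed by the same oid.
        let oid = self.create_object(ObjectType::Schema, owner_id, Some(database_id));
        self.schemas.push(SchemaRow { schema_id: oid, name: name.into() });
        oid
    }
}

fn main() {
    let mut txn = Txn::default();
    let db_oid = txn.create_object(ObjectType::Database, 1, None);
    let schema_oid = txn.create_schema(1, db_oid, "public");
    assert_eq!(txn.objects.len(), 2);
    assert_eq!(txn.schemas[0].schema_id, schema_oid);
}
```

Keeping ids, ownership, and parent pointers in one `object` table is what lets the drop paths later in this file cascade deletes and collect referring objects with a single query.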
+ } + + pub async fn create_database(&self, db: PbDatabase) -> MetaResult { + let inner = self.inner.write().await; + let owner_id = db.owner; + let txn = inner.db.begin().await?; + ensure_user_id(owner_id, &txn).await?; + + let db_obj = Self::create_object(&txn, ObjectType::Database, owner_id, None, None).await?; + let mut db: database::ActiveModel = db.into(); + db.database_id = ActiveValue::Set(db_obj.oid); + let db = db.insert(&txn).await?; + + let mut schemas = vec![]; + for schema_name in iter::once(DEFAULT_SCHEMA_NAME).chain(SYSTEM_SCHEMAS) { + let schema_obj = + Self::create_object(&txn, ObjectType::Schema, owner_id, Some(db_obj.oid), None) + .await?; + let schema = schema::ActiveModel { + schema_id: ActiveValue::Set(schema_obj.oid), + name: ActiveValue::Set(schema_name.into()), + }; + let schema = schema.insert(&txn).await?; + schemas.push(ObjectModel(schema, schema_obj).into()); + } + txn.commit().await?; + + let mut version = self + .notify_frontend( + NotificationOperation::Add, + NotificationInfo::Database(ObjectModel(db, db_obj).into()), + ) + .await; + for schema in schemas { + version = self + .notify_frontend(NotificationOperation::Add, NotificationInfo::Schema(schema)) + .await; + } + + Ok(version) + } + + pub async fn drop_database( + &self, + database_id: DatabaseId, + ) -> MetaResult<(ReleaseContext, NotificationVersion)> { + let inner = self.inner.write().await; + let txn = inner.db.begin().await?; + ensure_object_id(ObjectType::Database, database_id, &txn).await?; + + let streaming_jobs: Vec = Object::find() + .select_only() + .column(object::Column::Oid) + .filter( + object::Column::DatabaseId + .eq(Some(database_id)) + .and(object::Column::ObjType.is_in([ObjectType::Table, ObjectType::Sink])), + ) + .into_tuple() + .all(&txn) + .await?; + + let source_ids: Vec = Object::find() + .select_only() + .column(object::Column::Oid) + .filter( + object::Column::DatabaseId + .eq(Some(database_id)) + .and(object::Column::ObjType.eq(ObjectType::Source)), + ) + .into_tuple() + .all(&txn) + .await?; + + let connections = Connection::find() + .inner_join(Object) + .filter(object::Column::DatabaseId.eq(Some(database_id))) + .all(&txn) + .await? + .into_iter() + .map(|conn| conn.info) + .collect_vec(); + + // The schema and objects in the database will be delete cascade. 
+ let res = Object::delete_by_id(database_id).exec(&txn).await?; + if res.rows_affected == 0 { + return Err(MetaError::catalog_id_not_found("database", database_id)); + } + + txn.commit().await?; + + let version = self + .notify_frontend( + NotificationOperation::Delete, + NotificationInfo::Database(PbDatabase { + id: database_id, + ..Default::default() + }), + ) + .await; + Ok(( + ReleaseContext { + streaming_jobs, + source_ids, + connections, + }, + version, + )) + } + + pub async fn create_schema(&self, schema: PbSchema) -> MetaResult { + let inner = self.inner.write().await; + let owner_id = schema.owner; + let txn = inner.db.begin().await?; + ensure_user_id(owner_id, &txn).await?; + ensure_object_id(ObjectType::Database, schema.database_id, &txn).await?; + check_schema_name_duplicate(&schema.name, schema.database_id, &txn).await?; + + let schema_obj = Self::create_object( + &txn, + ObjectType::Schema, + owner_id, + Some(schema.database_id), + None, + ) + .await?; + let mut schema: schema::ActiveModel = schema.into(); + schema.schema_id = ActiveValue::Set(schema_obj.oid); + let schema = schema.insert(&txn).await?; + txn.commit().await?; + + let version = self + .notify_frontend( + NotificationOperation::Add, + NotificationInfo::Schema(ObjectModel(schema, schema_obj).into()), + ) + .await; + Ok(version) + } + + pub async fn drop_schema( + &self, + schema_id: SchemaId, + drop_mode: DropMode, + ) -> MetaResult { + let inner = self.inner.write().await; + let schema_obj = Object::find_by_id(schema_id) + .one(&inner.db) + .await? + .ok_or_else(|| MetaError::catalog_id_not_found("schema", schema_id))?; + if drop_mode == DropMode::Restrict { + ensure_schema_empty(schema_id, &inner.db).await?; + } + + let res = Object::delete(object::ActiveModel { + oid: ActiveValue::Set(schema_id), + ..Default::default() + }) + .exec(&inner.db) + .await?; + if res.rows_affected == 0 { + return Err(MetaError::catalog_id_not_found("schema", schema_id)); + } + + // todo: update user privileges accordingly. + let version = self + .notify_frontend( + NotificationOperation::Delete, + NotificationInfo::Schema(PbSchema { + id: schema_id, + database_id: schema_obj.database_id.unwrap(), + ..Default::default() + }), + ) + .await; + Ok(version) + } + + pub async fn create_function( + &self, + mut pb_function: PbFunction, + ) -> MetaResult { + let inner = self.inner.write().await; + let owner_id = pb_function.owner; + let txn = inner.db.begin().await?; + ensure_user_id(owner_id, &txn).await?; + ensure_object_id(ObjectType::Database, pb_function.database_id, &txn).await?; + ensure_object_id(ObjectType::Schema, pb_function.schema_id, &txn).await?; + check_function_signature_duplicate(&pb_function, &txn).await?; + + let function_obj = Self::create_object( + &txn, + ObjectType::Function, + owner_id, + Some(pb_function.database_id), + Some(pb_function.schema_id), + ) + .await?; + pb_function.id = function_obj.oid; + let function: function::ActiveModel = pb_function.clone().into(); + function.insert(&txn).await?; + txn.commit().await?; + + let version = self + .notify_frontend( + NotificationOperation::Add, + NotificationInfo::Function(pb_function), + ) + .await; + Ok(version) + } + + pub async fn drop_function(&self, function_id: FunctionId) -> MetaResult { + let inner = self.inner.write().await; + let function_obj = Object::find_by_id(function_id) + .one(&inner.db) + .await? 
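The drop paths above all share one pattern: delete the `object` row by id, treat `rows_affected == 0` as a not-found error, and notify the frontend with a "tombstone" message carrying only the ids needed to evict caches. A minimal version of that flow over an in-memory store; the error and notification types are simplified stand-ins for `MetaError` and the protobuf messages.

```rust
use std::collections::HashMap;

#[derive(Debug)]
enum MetaError {
    CatalogIdNotFound(&'static str, u32),
}

#[derive(Debug)]
struct FunctionTombstone {
    id: u32,
    schema_id: u32,
    database_id: u32,
}

struct Catalog {
    // oid -> (schema_id, database_id)
    objects: HashMap<u32, (u32, u32)>,
}

impl Catalog {
    fn drop_function(&mut self, function_id: u32) -> Result<FunctionTombstone, MetaError> {
        let (schema_id, database_id) = *self
            .objects
            .get(&function_id)
            .ok_or(MetaError::CatalogIdNotFound("function", function_id))?;

        // Delete by id; zero affected rows would mean someone else already dropped it.
        let rows_affected = self.objects.remove(&function_id).map_or(0, |_| 1);
        if rows_affected == 0 {
            return Err(MetaError::CatalogIdNotFound("function", function_id));
        }

        // The notification only needs the ids; other fields stay at their defaults.
        Ok(FunctionTombstone { id: function_id, schema_id, database_id })
    }
}

fn main() {
    let mut catalog = Catalog {
        objects: HashMap::from([(7, (2, 1))]),
    };
    let tombstone = catalog.drop_function(7).unwrap();
    println!("notify frontend: Delete {:?}", tombstone);
    assert!(matches!(
        catalog.drop_function(7),
        Err(MetaError::CatalogIdNotFound("function", 7))
    ));
}
```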
+ .ok_or_else(|| MetaError::catalog_id_not_found("function", function_id))?; + ensure_object_not_refer(ObjectType::Function, function_id, &inner.db).await?; + + let res = Object::delete_by_id(function_id).exec(&inner.db).await?; + if res.rows_affected == 0 { + return Err(MetaError::catalog_id_not_found("function", function_id)); + } + + let version = self + .notify_frontend( + NotificationOperation::Delete, + NotificationInfo::Function(PbFunction { + id: function_id, + schema_id: function_obj.schema_id.unwrap(), + database_id: function_obj.database_id.unwrap(), + ..Default::default() + }), + ) + .await; + Ok(version) + } + + pub async fn create_connection( + &self, + mut pb_connection: PbConnection, + ) -> MetaResult { + let inner = self.inner.write().await; + let owner_id = pb_connection.owner; + let txn = inner.db.begin().await?; + ensure_user_id(owner_id, &txn).await?; + ensure_object_id(ObjectType::Database, pb_connection.database_id, &txn).await?; + ensure_object_id(ObjectType::Schema, pb_connection.schema_id, &txn).await?; + check_connection_name_duplicate(&pb_connection, &txn).await?; + + let conn_obj = Self::create_object( + &txn, + ObjectType::Connection, + owner_id, + Some(pb_connection.database_id), + Some(pb_connection.schema_id), + ) + .await?; + pb_connection.id = conn_obj.oid; + let connection: connection::ActiveModel = pb_connection.clone().into(); + connection.insert(&txn).await?; + + txn.commit().await?; + + let version = self + .notify_frontend( + NotificationOperation::Add, + NotificationInfo::Connection(pb_connection), + ) + .await; + Ok(version) + } + + pub async fn get_connection_by_id( + &self, + connection_id: ConnectionId, + ) -> MetaResult { + let inner = self.inner.read().await; + let (conn, obj) = Connection::find_by_id(connection_id) + .find_also_related(Object) + .one(&inner.db) + .await? + .ok_or_else(|| MetaError::catalog_id_not_found("connection", connection_id))?; + + Ok(ObjectModel(conn, obj.unwrap()).into()) + } + + pub async fn drop_connection( + &self, + connection_id: ConnectionId, + ) -> MetaResult { + let inner = self.inner.write().await; + let connection_obj = Object::find_by_id(connection_id) + .one(&inner.db) + .await? 
+ .ok_or_else(|| MetaError::catalog_id_not_found("connection", connection_id))?; + ensure_object_not_refer(ObjectType::Connection, connection_id, &inner.db).await?; + + let res = Object::delete_by_id(connection_id).exec(&inner.db).await?; + if res.rows_affected == 0 { + return Err(MetaError::catalog_id_not_found("connection", connection_id)); + } + + let version = self + .notify_frontend( + NotificationOperation::Delete, + NotificationInfo::Connection(PbConnection { + id: connection_id, + schema_id: connection_obj.schema_id.unwrap(), + database_id: connection_obj.database_id.unwrap(), + ..Default::default() + }), + ) + .await; + Ok(version) + } + + pub async fn create_view(&self, mut pb_view: PbView) -> MetaResult { + let inner = self.inner.write().await; + let owner_id = pb_view.owner; + let txn = inner.db.begin().await?; + ensure_user_id(owner_id, &txn).await?; + ensure_object_id(ObjectType::Database, pb_view.database_id, &txn).await?; + ensure_object_id(ObjectType::Schema, pb_view.schema_id, &txn).await?; + check_relation_name_duplicate(&pb_view.name, pb_view.database_id, pb_view.schema_id, &txn) + .await?; + + let view_obj = Self::create_object( + &txn, + ObjectType::View, + owner_id, + Some(pb_view.database_id), + Some(pb_view.schema_id), + ) + .await?; + pb_view.id = view_obj.oid; + let view: view::ActiveModel = pb_view.clone().into(); + view.insert(&txn).await?; + + // todo: change `dependent_relations` to `dependent_objects`, which should includes connection and function as well. + // todo: shall we need to check existence of them Or let database handle it by FOREIGN KEY constraint. + for obj_id in &pb_view.dependent_relations { + object_dependency::ActiveModel { + oid: ActiveValue::Set(*obj_id), + used_by: ActiveValue::Set(view_obj.oid), + ..Default::default() + } + .insert(&txn) + .await?; + } + + txn.commit().await?; + + let version = self + .notify_frontend_relation_info( + NotificationOperation::Add, + PbRelationInfo::View(pb_view), + ) + .await; + Ok(version) + } + + pub async fn drop_relation( + &self, + object_type: ObjectType, + object_id: ObjectId, + drop_mode: DropMode, + ) -> MetaResult<(ReleaseContext, NotificationVersion)> { + let inner = self.inner.write().await; + let txn = inner.db.begin().await?; + let obj: PartialObject = Object::find_by_id(object_id) + .into_partial_model() + .one(&txn) + .await? + .ok_or_else(|| MetaError::catalog_id_not_found(object_type.as_str(), object_id))?; + assert_eq!(obj.obj_type, object_type); + + let mut to_drop_objects = match drop_mode { + DropMode::Cascade => get_referring_objects_cascade(object_id, &txn).await?, + DropMode::Restrict => { + ensure_object_not_refer(object_type, object_id, &txn).await?; + vec![] + } + }; + assert!( + to_drop_objects.iter().all(|obj| matches!( + obj.obj_type, + ObjectType::Table | ObjectType::Index | ObjectType::Sink | ObjectType::View + )), + "only these objects will depends on others" + ); + to_drop_objects.push(obj); + + let to_drop_table_ids = to_drop_objects + .iter() + .filter(|obj| obj.obj_type == ObjectType::Table) + .map(|obj| obj.oid); + let mut to_drop_streaming_jobs = to_drop_objects + .iter() + .filter(|obj| obj.obj_type == ObjectType::Table || obj.obj_type == ObjectType::Sink) + .map(|obj| obj.oid) + .collect_vec(); + // todo: record index dependency info in the object dependency table. + let to_drop_index_ids = to_drop_objects + .iter() + .filter(|obj| obj.obj_type == ObjectType::Index) + .map(|obj| obj.oid) + .collect_vec(); + + // Add associated sources. 
+ let mut to_drop_source_ids: Vec = Table::find() + .select_only() + .column(table::Column::OptionalAssociatedSourceId) + .filter( + table::Column::TableId + .is_in(to_drop_table_ids) + .and(table::Column::OptionalAssociatedSourceId.is_not_null()), + ) + .into_tuple() + .all(&txn) + .await?; + let to_drop_source_objs: Vec = Object::find() + .filter(object::Column::Oid.is_in(to_drop_source_ids.clone())) + .into_partial_model() + .all(&txn) + .await?; + to_drop_objects.extend(to_drop_source_objs.clone()); + if object_type == ObjectType::Source { + to_drop_source_ids.push(object_id); + } + + // add internal tables. + let index_table_ids: Vec = Index::find() + .select_only() + .column(index::Column::IndexTableId) + .filter(index::Column::IndexId.is_in(to_drop_index_ids)) + .into_tuple() + .all(&txn) + .await?; + to_drop_streaming_jobs.extend(index_table_ids); + let to_drop_internal_table_objs: Vec = Object::find() + .filter(object::Column::Oid.is_in(to_drop_streaming_jobs.clone())) + .into_partial_model() + .all(&txn) + .await?; + to_drop_objects.extend(to_drop_internal_table_objs); + + // delete all in to_drop_objects. + let res = Object::delete_many() + .filter(object::Column::Oid.is_in(to_drop_objects.iter().map(|obj| obj.oid))) + .exec(&txn) + .await?; + if res.rows_affected == 0 { + return Err(MetaError::catalog_id_not_found( + object_type.as_str(), + object_id, + )); + } + + // notify about them. + let relations = to_drop_objects + .into_iter() + .map(|obj| match obj.obj_type { + ObjectType::Table => PbRelation { + relation_info: Some(PbRelationInfo::Table(PbTable { + id: obj.oid, + schema_id: obj.schema_id.unwrap(), + database_id: obj.database_id.unwrap(), + ..Default::default() + })), + }, + ObjectType::Source => PbRelation { + relation_info: Some(PbRelationInfo::Source(PbSource { + id: obj.oid, + schema_id: obj.schema_id.unwrap(), + database_id: obj.database_id.unwrap(), + ..Default::default() + })), + }, + ObjectType::Sink => PbRelation { + relation_info: Some(PbRelationInfo::Sink(PbSink { + id: obj.oid, + schema_id: obj.schema_id.unwrap(), + database_id: obj.database_id.unwrap(), + ..Default::default() + })), + }, + ObjectType::View => PbRelation { + relation_info: Some(PbRelationInfo::View(PbView { + id: obj.oid, + schema_id: obj.schema_id.unwrap(), + database_id: obj.database_id.unwrap(), + ..Default::default() + })), + }, + ObjectType::Index => PbRelation { + relation_info: Some(PbRelationInfo::Index(PbIndex { + id: obj.oid, + schema_id: obj.schema_id.unwrap(), + database_id: obj.database_id.unwrap(), + ..Default::default() + })), + }, + _ => unreachable!("only relations will be dropped."), + }) + .collect_vec(); + let version = self + .notify_frontend( + NotificationOperation::Delete, + NotificationInfo::RelationGroup(PbRelationGroup { relations }), + ) + .await; + + Ok(( + ReleaseContext { + streaming_jobs: to_drop_streaming_jobs, + source_ids: to_drop_source_ids, + connections: vec![], + }, + version, + )) + } + + pub async fn alter_relation_name( + &self, + object_type: ObjectType, + object_id: ObjectId, + object_name: &str, + ) -> MetaResult { + let inner = self.inner.write().await; + let txn = inner.db.begin().await?; + let obj: PartialObject = Object::find_by_id(object_id) + .into_partial_model() + .one(&txn) + .await? 
+ .ok_or_else(|| MetaError::catalog_id_not_found(object_type.as_str(), object_id))?; + assert_eq!(obj.obj_type, object_type); + check_relation_name_duplicate( + object_name, + obj.database_id.unwrap(), + obj.schema_id.unwrap(), + &txn, + ) + .await?; + + let mut to_update_relations = vec![]; + // rename relation. + macro_rules! rename_relation { + ($entity:ident, $table:ident, $identity:ident, $object_id:expr) => {{ + let (mut relation, obj) = $entity::find_by_id($object_id) + .find_also_related(Object) + .one(&txn) + .await? + .unwrap(); + let old_name = relation.name.clone(); + relation.name = object_name.into(); + relation.definition = alter_relation_rename(&relation.definition, object_name); + let active_model = $table::ActiveModel { + $identity: ActiveValue::Set(relation.$identity), + name: ActiveValue::Set(object_name.into()), + definition: ActiveValue::Set(relation.definition.clone()), + ..Default::default() + }; + active_model.update(&txn).await?; + to_update_relations.push(PbRelation { + relation_info: Some(PbRelationInfo::$entity( + ObjectModel(relation, obj.unwrap()).into(), + )), + }); + old_name + }}; + } + + let old_name = match object_type { + ObjectType::Table => rename_relation!(Table, table, table_id, object_id), + ObjectType::Source => rename_relation!(Source, source, source_id, object_id), + ObjectType::Sink => rename_relation!(Sink, sink, sink_id, object_id), + ObjectType::View => rename_relation!(View, view, view_id, object_id), + ObjectType::Index => { + let (mut index, obj) = Index::find_by_id(object_id) + .find_also_related(Object) + .one(&txn) + .await? + .unwrap(); + index.name = object_name.into(); + let index_table_id = index.index_table_id; + + // the name of index and its associated table is the same. + let active_model = index::ActiveModel { + index_id: ActiveValue::Set(index.index_id), + name: ActiveValue::Set(object_name.into()), + ..Default::default() + }; + active_model.update(&txn).await?; + to_update_relations.push(PbRelation { + relation_info: Some(PbRelationInfo::Index( + ObjectModel(index, obj.unwrap()).into(), + )), + }); + rename_relation!(Table, table, table_id, index_table_id) + } + _ => unreachable!("only relation name can be altered."), + }; + + // rename referring relation name. + macro_rules! rename_relation_ref { + ($entity:ident, $table:ident, $identity:ident, $object_id:expr) => {{ + let (mut relation, obj) = $entity::find_by_id($object_id) + .find_also_related(Object) + .one(&txn) + .await? 
+ .unwrap(); + relation.definition = + alter_relation_rename_refs(&relation.definition, &old_name, object_name); + let active_model = $table::ActiveModel { + $identity: ActiveValue::Set(relation.$identity), + definition: ActiveValue::Set(relation.definition.clone()), + ..Default::default() + }; + active_model.update(&txn).await?; + to_update_relations.push(PbRelation { + relation_info: Some(PbRelationInfo::$entity( + ObjectModel(relation, obj.unwrap()).into(), + )), + }); + }}; + } + let objs = get_referring_objects(object_id, &txn).await?; + for obj in objs { + match obj.obj_type { + ObjectType::Table => rename_relation_ref!(Table, table, table_id, obj.oid), + ObjectType::Sink => rename_relation_ref!(Sink, sink, sink_id, obj.oid), + ObjectType::View => rename_relation_ref!(View, view, view_id, obj.oid), + ObjectType::Index => { + let index_table_id: Option = Index::find_by_id(obj.oid) + .select_only() + .column(index::Column::IndexTableId) + .into_tuple() + .one(&txn) + .await?; + rename_relation_ref!(Table, table, table_id, index_table_id.unwrap()); + } + _ => bail!("only table, sink, view and index depend on other objects."), + } + } + txn.commit().await?; + + let version = self + .notify_frontend( + NotificationOperation::Update, + NotificationInfo::RelationGroup(PbRelationGroup { + relations: to_update_relations, + }), + ) + .await; + + Ok(version) + } +} + +#[cfg(test)] +#[cfg(not(madsim))] +mod tests { + use risingwave_common::catalog::DEFAULT_SUPER_USER_ID; + + use super::*; + + const TEST_DATABASE_ID: DatabaseId = 1; + const TEST_SCHEMA_ID: SchemaId = 2; + const TEST_OWNER_ID: UserId = 1; + + #[tokio::test] + async fn test_create_database() -> MetaResult<()> { + let mgr = CatalogController::new(MetaSrvEnv::for_test().await)?; + let db = PbDatabase { + name: "test".to_string(), + owner: DEFAULT_SUPER_USER_ID, + ..Default::default() + }; + mgr.create_database(db).await?; + + let db = Database::find() + .filter(database::Column::Name.eq("test")) + .one(&mgr.inner.read().await.db) + .await? 
+ .unwrap(); + mgr.drop_database(db.database_id).await?; + Ok(()) + } + + #[tokio::test] + async fn test_create_view() -> MetaResult<()> { + let mgr = CatalogController::new(MetaSrvEnv::for_test().await)?; + let pb_view = PbView { + schema_id: TEST_SCHEMA_ID, + database_id: TEST_DATABASE_ID, + name: "view".to_string(), + owner: TEST_OWNER_ID, + sql: "CREATE VIEW view AS SELECT 1".to_string(), + ..Default::default() + }; + mgr.create_view(pb_view.clone()).await?; + assert!(mgr.create_view(pb_view).await.is_err()); + + let view = View::find().one(&mgr.inner.read().await.db).await?.unwrap(); + mgr.drop_relation(ObjectType::View, view.view_id, DropMode::Cascade) + .await?; + + Ok(()) + } + + #[tokio::test] + async fn test_create_function() -> MetaResult<()> { + let mgr = CatalogController::new(MetaSrvEnv::for_test().await)?; + let return_type = risingwave_pb::data::DataType { + type_name: risingwave_pb::data::data_type::TypeName::Int32 as _, + ..Default::default() + }; + let pb_function = PbFunction { + schema_id: TEST_SCHEMA_ID, + database_id: TEST_DATABASE_ID, + name: "test_function".to_string(), + owner: TEST_OWNER_ID, + arg_types: vec![], + return_type: Some(return_type.clone()), + language: "python".to_string(), + kind: Some(risingwave_pb::catalog::function::Kind::Scalar( + Default::default(), + )), + ..Default::default() + }; + mgr.create_function(pb_function.clone()).await?; + assert!(mgr.create_function(pb_function).await.is_err()); + + let function = Function::find() + .inner_join(Object) + .filter( + object::Column::DatabaseId + .eq(TEST_DATABASE_ID) + .and(object::Column::SchemaId.eq(TEST_SCHEMA_ID)) + .add(function::Column::Name.eq("test_function")), + ) + .one(&mgr.inner.read().await.db) + .await? + .unwrap(); + assert_eq!(function.return_type.0, return_type); + assert_eq!(function.language, "python"); + + mgr.drop_function(function.function_id).await?; + assert!(Object::find_by_id(function.function_id) + .one(&mgr.inner.read().await.db) + .await? + .is_none()); + + Ok(()) + } +} diff --git a/src/meta/src/controller/cluster.rs b/src/meta/src/controller/cluster.rs new file mode 100644 index 0000000000000..ca29380a49fca --- /dev/null +++ b/src/meta/src/controller/cluster.rs @@ -0,0 +1,988 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
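// Worker lifecycle implemented by this controller: `add_worker` inserts a `worker` row (plus a
// `worker_property` row for compute nodes) in state `Starting`, `activate_worker` flips the
// state to `Running` and notifies subscribers, `heartbeat` only refreshes an in-memory TTL,
// the heartbeat checker periodically purges expired workers, and `delete_worker` removes the
// rows and recycles the transactional id.
//
// A minimal registration sketch, following `test_cluster_controller` at the end of this file
// (host, port and parallelism values here are illustrative only):
//
//     let ctl = ClusterController::new(MetaSrvEnv::for_test().await, Duration::from_secs(1)).await?;
//     let worker_id = ctl
//         .add_worker(
//             PbWorkerType::ComputeNode,
//             HostAddress { host: "localhost".into(), port: 5688 },
//             AddNodeProperty {
//                 worker_node_parallelism: 4,
//                 is_streaming: true,
//                 is_serving: true,
//                 is_unschedulable: false,
//             },
//         )
//         .await?;
//     ctl.activate_worker(worker_id).await?;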
+ +use std::cmp; +use std::cmp::Ordering; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::Add; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + +use itertools::Itertools; +use risingwave_common::hash::ParallelUnitId; +use risingwave_hummock_sdk::HummockSstableObjectId; +use risingwave_pb::common::worker_node::{PbProperty, PbState}; +use risingwave_pb::common::{ + HostAddress, ParallelUnit, PbHostAddress, PbParallelUnit, PbWorkerNode, PbWorkerType, +}; +use risingwave_pb::meta::add_worker_node_request::Property as AddNodeProperty; +use risingwave_pb::meta::heartbeat_request; +use risingwave_pb::meta::subscribe_response::{Info, Operation}; +use risingwave_pb::meta::update_worker_node_schedulability_request::Schedulability; +use sea_orm::prelude::Expr; +use sea_orm::{ + ActiveModelTrait, ActiveValue, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter, + QuerySelect, TransactionTrait, +}; +use tokio::sync::oneshot::Sender; +use tokio::sync::{RwLock, RwLockReadGuard}; +use tokio::task::JoinHandle; + +use crate::manager::prelude::{Worker, WorkerProperty}; +use crate::manager::{LocalNotification, MetaSrvEnv, WorkerKey}; +use crate::model_v2::worker::{WorkerStatus, WorkerType}; +use crate::model_v2::{worker, worker_property, I32Array, TransactionId, WorkerId}; +use crate::{MetaError, MetaResult}; + +pub type ClusterControllerRef = Arc; + +pub struct ClusterController { + env: MetaSrvEnv, + max_heartbeat_interval: Duration, + inner: RwLock, +} + +struct WorkerInfo(worker::Model, Option); + +impl From for PbWorkerNode { + fn from(info: WorkerInfo) -> Self { + Self { + id: info.0.worker_id, + r#type: PbWorkerType::from(info.0.worker_type) as _, + host: Some(PbHostAddress { + host: info.0.host, + port: info.0.port, + }), + state: PbState::from(info.0.status) as _, + parallel_units: info + .1 + .as_ref() + .map(|p| { + p.parallel_unit_ids + .0 + .iter() + .map(|&id| PbParallelUnit { + id: id as _, + worker_node_id: info.0.worker_id, + }) + .collect_vec() + }) + .unwrap_or_default(), + property: info.1.as_ref().map(|p| PbProperty { + is_streaming: p.is_streaming, + is_serving: p.is_serving, + is_unschedulable: p.is_unschedulable, + }), + transactional_id: info.0.transaction_id, + } + } +} + +impl From for WorkerType { + fn from(worker_type: PbWorkerType) -> Self { + match worker_type { + PbWorkerType::Unspecified => unreachable!("unspecified worker type"), + PbWorkerType::Frontend => Self::Frontend, + PbWorkerType::ComputeNode => Self::ComputeNode, + PbWorkerType::RiseCtl => Self::RiseCtl, + PbWorkerType::Compactor => Self::Compactor, + PbWorkerType::Meta => Self::Meta, + } + } +} + +impl From for PbWorkerType { + fn from(worker_type: WorkerType) -> Self { + match worker_type { + WorkerType::Frontend => Self::Frontend, + WorkerType::ComputeNode => Self::ComputeNode, + WorkerType::RiseCtl => Self::RiseCtl, + WorkerType::Compactor => Self::Compactor, + WorkerType::Meta => Self::Meta, + } + } +} + +impl From for WorkerStatus { + fn from(state: PbState) -> Self { + match state { + PbState::Unspecified => unreachable!("unspecified worker status"), + PbState::Starting => Self::Starting, + PbState::Running => Self::Running, + } + } +} + +impl From for PbState { + fn from(status: WorkerStatus) -> Self { + match status { + WorkerStatus::Starting => Self::Starting, + WorkerStatus::Running => Self::Running, + } + } +} + +impl From<&PbWorkerNode> for worker::ActiveModel { + fn from(worker: &PbWorkerNode) -> Self { + let host = worker.host.clone().unwrap(); + Self { + 
worker_id: ActiveValue::Set(worker.id), + worker_type: ActiveValue::Set(worker.r#type().into()), + host: ActiveValue::Set(host.host), + port: ActiveValue::Set(host.port), + status: ActiveValue::Set(worker.state().into()), + ..Default::default() + } + } +} + +impl ClusterController { + pub async fn new(env: MetaSrvEnv, max_heartbeat_interval: Duration) -> MetaResult { + let meta_store = env + .sql_meta_store() + .expect("sql meta store is not initialized"); + let inner = ClusterControllerInner::new(meta_store.conn).await?; + Ok(Self { + env, + max_heartbeat_interval, + inner: RwLock::new(inner), + }) + } + + /// Used in `NotificationService::subscribe`. + /// Need to pay attention to the order of acquiring locks to prevent deadlock problems. + pub async fn get_inner_guard(&self) -> RwLockReadGuard<'_, ClusterControllerInner> { + self.inner.read().await + } + + pub async fn count_worker_by_type(&self) -> MetaResult> { + self.inner.read().await.count_worker_by_type().await + } + + /// A worker node will immediately register itself to meta when it bootstraps. + /// The meta will assign it with a unique ID and set its state as `Starting`. + /// When the worker node is fully ready to serve, it will request meta again + /// (via `activate_worker_node`) to set its state to `Running`. + pub async fn add_worker( + &self, + r#type: PbWorkerType, + host_address: HostAddress, + property: AddNodeProperty, + ) -> MetaResult { + self.inner + .write() + .await + .add_worker(r#type, host_address, property, self.max_heartbeat_interval) + .await + } + + pub async fn activate_worker(&self, worker_id: WorkerId) -> MetaResult<()> { + let inner = self.inner.write().await; + let worker = inner.activate_worker(worker_id).await?; + + // Notify frontends of new compute node. + // Always notify because a running worker's property may have been changed. + if worker.r#type() == PbWorkerType::ComputeNode { + self.env + .notification_manager() + .notify_frontend(Operation::Add, Info::Node(worker.clone())) + .await; + } + self.env + .notification_manager() + .notify_local_subscribers(LocalNotification::WorkerNodeActivated(worker)) + .await; + + Ok(()) + } + + pub async fn delete_worker(&self, host_address: HostAddress) -> MetaResult { + let mut inner = self.inner.write().await; + let worker = inner.delete_worker(host_address).await?; + if worker.r#type() == PbWorkerType::ComputeNode { + self.env + .notification_manager() + .notify_frontend(Operation::Delete, Info::Node(worker.clone())) + .await; + } + + // Notify local subscribers. + // Note: Any type of workers may pin some hummock resource. So `HummockManager` expect this + // local notification. + self.env + .notification_manager() + .notify_local_subscribers(LocalNotification::WorkerNodeDeleted(worker.clone())) + .await; + + Ok(worker) + } + + pub async fn update_schedulability( + &self, + worker_ids: Vec, + schedulability: Schedulability, + ) -> MetaResult<()> { + self.inner + .write() + .await + .update_schedulability(worker_ids, schedulability) + .await + } + + /// Invoked when it receives a heartbeat from a worker node. 
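/// Heartbeats only refresh the worker's in-memory expiration time and Hummock info kept in
/// `WorkerExtraInfo`; nothing is written to the database on this path.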
+ pub async fn heartbeat( + &self, + worker_id: WorkerId, + info: Vec, + ) { + tracing::trace!(target: "events::meta::server_heartbeat", worker_id = worker_id, "receive heartbeat"); + self.inner + .write() + .await + .heartbeat(worker_id, self.max_heartbeat_interval, info) + } + + pub fn start_heartbeat_checker( + cluster_controller: ClusterController, + check_interval: Duration, + ) -> (JoinHandle<()>, Sender<()>) { + let (shutdown_tx, mut shutdown_rx) = tokio::sync::oneshot::channel(); + let join_handle = tokio::spawn(async move { + let mut min_interval = tokio::time::interval(check_interval); + min_interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + loop { + tokio::select! { + // Wait for interval + _ = min_interval.tick() => {}, + // Shutdown + _ = &mut shutdown_rx => { + tracing::info!("Heartbeat checker is stopped"); + return; + } + } + + let mut inner = cluster_controller.inner.write().await; + // 1. Initialize new workers' TTL. + for worker in inner + .worker_extra_info + .values_mut() + .filter(|worker| worker.expire_at.is_none()) + { + worker.update_ttl(cluster_controller.max_heartbeat_interval); + } + + // 2. Collect expired workers. + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .expect("Clock may have gone backwards") + .as_secs(); + let worker_to_delete = inner + .worker_extra_info + .iter() + .filter(|(_, info)| info.expire_at.unwrap() < now) + .map(|(id, _)| *id) + .collect_vec(); + + // 3. Delete expired workers. + let worker_infos = match Worker::find() + .select_only() + .column(worker::Column::WorkerType) + .column(worker::Column::Host) + .column(worker::Column::Port) + .into_tuple::<(WorkerType, String, i32)>() + .all(&inner.db) + .await + { + Ok(keys) => keys, + Err(err) => { + tracing::warn!("Failed to load expire worker info from db: {}", err); + continue; + } + }; + + if let Err(err) = Worker::delete_many() + .filter(worker::Column::WorkerId.is_in(worker_to_delete)) + .exec(&inner.db) + .await + { + tracing::warn!("Failed to delete expire workers from db: {}", err); + continue; + } + + for (worker_type, host, port) in worker_infos { + match worker_type { + WorkerType::Frontend + | WorkerType::ComputeNode + | WorkerType::Compactor + | WorkerType::RiseCtl => { + cluster_controller + .env + .notification_manager() + .delete_sender( + worker_type.into(), + WorkerKey(HostAddress { host, port }), + ) + .await + } + _ => {} + }; + } + } + }); + + (join_handle, shutdown_tx) + } + + /// Get live nodes with the specified type and state. + /// # Arguments + /// * `worker_type` `WorkerType` of the nodes + /// * `worker_state` Filter by this state if it is not None. + pub async fn list_workers( + &self, + worker_type: WorkerType, + worker_status: Option, + ) -> MetaResult> { + self.inner + .read() + .await + .list_workers(worker_type, worker_status) + .await + } + + /// A convenient method to get all running compute nodes that may have running actors on them + /// i.e. 
CNs which are running + pub async fn list_active_streaming_workers(&self) -> MetaResult> { + self.inner + .read() + .await + .list_active_streaming_workers() + .await + } + + pub async fn list_active_parallel_units(&self) -> MetaResult> { + self.inner.read().await.list_active_parallel_units().await + } + + /// Get the cluster info used for scheduling a streaming job, containing all nodes that are + /// running and schedulable + pub async fn list_active_serving_workers(&self) -> MetaResult> { + self.inner.read().await.list_active_serving_workers().await + } + + /// Get the cluster info used for scheduling a streaming job. + pub async fn get_streaming_cluster_info(&self) -> MetaResult { + self.inner.read().await.get_streaming_cluster_info().await + } + + pub async fn get_worker_by_id(&self, worker_id: WorkerId) -> MetaResult> { + self.inner.read().await.get_worker_by_id(worker_id).await + } + + pub async fn get_worker_info_by_id(&self, worker_id: WorkerId) -> Option { + self.inner + .read() + .await + .get_worker_extra_info_by_id(worker_id) + } +} + +#[derive(Default, Clone)] +pub struct WorkerExtraInfo { + // Volatile values updated by meta node as follows. + // + // Unix timestamp that the worker will expire at. + expire_at: Option, + // Monotonic increasing id since meta node bootstrap. + info_version_id: u64, + // GC watermark. + hummock_gc_watermark: Option, +} + +impl WorkerExtraInfo { + fn update_ttl(&mut self, ttl: Duration) { + let expire = cmp::max( + self.expire_at.unwrap_or_default(), + SystemTime::now() + .add(ttl) + .duration_since(SystemTime::UNIX_EPOCH) + .expect("Clock may have gone backwards") + .as_secs(), + ); + self.expire_at = Some(expire); + } + + fn update_hummock_info(&mut self, info: Vec) { + self.info_version_id += 1; + for i in info { + match i { + heartbeat_request::extra_info::Info::HummockGcWatermark(watermark) => { + self.hummock_gc_watermark = Some(watermark); + } + } + } + } +} + +/// The cluster info used for scheduling a streaming job. +#[derive(Debug, Clone)] +pub struct StreamingClusterInfo { + /// All **active** compute nodes in the cluster. + pub worker_nodes: HashMap, + + /// All parallel units of the **active** compute nodes in the cluster. + pub parallel_units: HashMap, + + /// All unschedulable parallel units of compute nodes in the cluster. + pub unschedulable_parallel_units: HashMap, +} + +pub struct ClusterControllerInner { + db: DatabaseConnection, + /// Record for tracking available machine ids, one is available. 
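/// An id is handed out from the front of the queue when a compute or frontend node registers
/// and is pushed back when the worker is deleted, bounding the number of ids in use by
/// `MAX_WORKER_REUSABLE_ID_COUNT`.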
+ available_transactional_ids: VecDeque, + worker_extra_info: HashMap, +} + +impl ClusterControllerInner { + pub const MAX_WORKER_REUSABLE_ID_BITS: usize = 10; + pub const MAX_WORKER_REUSABLE_ID_COUNT: usize = 1 << Self::MAX_WORKER_REUSABLE_ID_BITS; + + pub async fn new(db: DatabaseConnection) -> MetaResult { + let workers: Vec<(WorkerId, Option)> = Worker::find() + .select_only() + .column(worker::Column::WorkerId) + .column(worker::Column::TransactionId) + .into_tuple() + .all(&db) + .await?; + let inuse_txn_ids: HashSet<_> = workers + .iter() + .cloned() + .filter_map(|(_, txn_id)| txn_id) + .collect(); + let available_transactional_ids = (0..Self::MAX_WORKER_REUSABLE_ID_COUNT as TransactionId) + .filter(|id| !inuse_txn_ids.contains(id)) + .collect(); + + let worker_extra_info = workers + .into_iter() + .map(|(w, _)| (w, WorkerExtraInfo::default())) + .collect(); + + Ok(Self { + db, + available_transactional_ids, + worker_extra_info, + }) + } + + pub async fn count_worker_by_type(&self) -> MetaResult> { + let workers: Vec<(WorkerType, i32)> = Worker::find() + .select_only() + .column(worker::Column::WorkerType) + .column_as(worker::Column::WorkerId.count(), "count") + .group_by(worker::Column::WorkerType) + .into_tuple() + .all(&self.db) + .await?; + + Ok(workers.into_iter().collect()) + } + + pub fn update_worker_ttl(&mut self, worker_id: WorkerId, ttl: Duration) -> MetaResult<()> { + if let Some(info) = self.worker_extra_info.get_mut(&worker_id) { + let expire = cmp::max( + info.expire_at.unwrap_or_default(), + SystemTime::now() + .add(ttl) + .duration_since(SystemTime::UNIX_EPOCH) + .expect("Clock may have gone backwards") + .as_secs(), + ); + info.expire_at = Some(expire); + Ok(()) + } else { + Err(MetaError::invalid_worker( + worker_id, + "worker not found".into(), + )) + } + } + + fn apply_transaction_id(&self, r#type: PbWorkerType) -> MetaResult> { + match (self.available_transactional_ids.front(), r#type) { + (None, _) => Err(MetaError::unavailable( + "no available reusable machine id".to_string(), + )), + // We only assign transactional id to compute node and frontend. + (Some(id), PbWorkerType::ComputeNode | PbWorkerType::Frontend) => Ok(Some(*id)), + _ => Ok(None), + } + } + + pub async fn add_worker( + &mut self, + r#type: PbWorkerType, + host_address: HostAddress, + add_property: AddNodeProperty, + ttl: Duration, + ) -> MetaResult { + let txn = self.db.begin().await?; + + // TODO: remove this workaround when we deprecate parallel unit ids. + let derive_parallel_units = |txn_id: TransactionId, start: u32, end: u32| { + (start..end) + .map(|idx| ((idx << Self::MAX_WORKER_REUSABLE_ID_BITS) + txn_id) as i32) + .collect_vec() + }; + + let worker = Worker::find() + .filter( + worker::Column::Host + .eq(host_address.host.clone()) + .and(worker::Column::Port.eq(host_address.port)), + ) + .find_also_related(WorkerProperty) + .one(&txn) + .await?; + // Worker already exist. + if let Some((worker, property)) = worker { + assert_eq!(worker.worker_type, r#type.into()); + return if worker.worker_type == WorkerType::ComputeNode { + let property = property.unwrap(); + let txn_id = worker.transaction_id.unwrap(); + let mut current_parallelism = property.parallel_unit_ids.0.clone(); + let new_parallelism = add_property.worker_node_parallelism as usize; + + match new_parallelism.cmp(¤t_parallelism.len()) { + Ordering::Less => { + // Warn and keep the original parallelism if the worker registered with a + // smaller parallelism. 
+ tracing::warn!(
+ "worker {} parallelism is less than current, current is {}, but received {}",
+ worker.worker_id,
+ current_parallelism.len(),
+ new_parallelism
+ );
+ }
+ Ordering::Greater => {
+ tracing::info!(
+ "worker {} parallelism updated from {} to {}",
+ worker.worker_id,
+ current_parallelism.len(),
+ new_parallelism
+ );
+ current_parallelism.extend(derive_parallel_units(
+ txn_id,
+ current_parallelism.len() as _,
+ new_parallelism as _,
+ ));
+ }
+ Ordering::Equal => {}
+ }
+ let mut property: worker_property::ActiveModel = property.into();
+
+ // keep `is_unschedulable` unchanged.
+ property.is_streaming = ActiveValue::Set(add_property.is_streaming);
+ property.is_serving = ActiveValue::Set(add_property.is_serving);
+ property.parallel_unit_ids = ActiveValue::Set(I32Array(current_parallelism));
+
+ WorkerProperty::update(property).exec(&txn).await?;
+ txn.commit().await?;
+ self.update_worker_ttl(worker.worker_id, ttl)?;
+ Ok(worker.worker_id)
+ } else {
+ self.update_worker_ttl(worker.worker_id, ttl)?;
+ Ok(worker.worker_id)
+ };
+ }
+ let txn_id = self.apply_transaction_id(r#type)?;
+
+ let worker = worker::ActiveModel {
+ worker_id: Default::default(),
+ worker_type: ActiveValue::Set(r#type.into()),
+ host: ActiveValue::Set(host_address.host),
+ port: ActiveValue::Set(host_address.port),
+ status: ActiveValue::Set(WorkerStatus::Starting),
+ transaction_id: ActiveValue::Set(txn_id),
+ };
+ let insert_res = Worker::insert(worker).exec(&txn).await?;
+ let worker_id = insert_res.last_insert_id as WorkerId;
+ if r#type == PbWorkerType::ComputeNode {
+ let property = worker_property::ActiveModel {
+ worker_id: ActiveValue::Set(worker_id),
+ parallel_unit_ids: ActiveValue::Set(I32Array(derive_parallel_units(
+ *txn_id.as_ref().unwrap(),
+ 0,
+ add_property.worker_node_parallelism as _,
+ ))),
+ is_streaming: ActiveValue::Set(add_property.is_streaming),
+ is_serving: ActiveValue::Set(add_property.is_serving),
+ is_unschedulable: ActiveValue::Set(add_property.is_unschedulable),
+ };
+ WorkerProperty::insert(property).exec(&txn).await?;
+ }
+
+ txn.commit().await?;
+ if let Some(txn_id) = txn_id {
+ self.available_transactional_ids.retain(|id| *id != txn_id);
+ }
+ self.worker_extra_info
+ .insert(worker_id, WorkerExtraInfo::default());
+
+ Ok(worker_id)
+ }
+
+ pub async fn activate_worker(&self, worker_id: WorkerId) -> MetaResult<PbWorkerNode> {
+ let worker = worker::ActiveModel {
+ worker_id: ActiveValue::Set(worker_id),
+ status: ActiveValue::Set(WorkerStatus::Running),
+ ..Default::default()
+ };
+
+ let worker = worker.update(&self.db).await?;
+ let worker_property = WorkerProperty::find_by_id(worker.worker_id)
+ .one(&self.db)
+ .await?;
+ Ok(WorkerInfo(worker, worker_property).into())
+ }
+
+ pub async fn update_schedulability(
+ &self,
+ worker_ids: Vec<WorkerId>,
+ schedulability: Schedulability,
+ ) -> MetaResult<()> {
+ let is_unschedulable = schedulability == Schedulability::Unschedulable;
+ WorkerProperty::update_many()
+ .col_expr(
+ worker_property::Column::IsUnschedulable,
+ Expr::value(is_unschedulable),
+ )
+ .filter(worker_property::Column::WorkerId.is_in(worker_ids))
+ .exec(&self.db)
+ .await?;
+
+ Ok(())
+ }
+
+ pub async fn delete_worker(&mut self, host_addr: HostAddress) -> MetaResult<PbWorkerNode> {
+ let worker = Worker::find()
+ .filter(
+ worker::Column::Host
+ .eq(host_addr.host)
+ .and(worker::Column::Port.eq(host_addr.port)),
+ )
+ .find_also_related(WorkerProperty)
+ .one(&self.db)
+ .await?;
+ let Some((worker, property)) = worker else {
+ return
Err(MetaError::invalid_parameter("worker not found!")); + }; + + let res = Worker::delete_by_id(worker.worker_id) + .exec(&self.db) + .await?; + if res.rows_affected == 0 { + return Err(MetaError::invalid_parameter("worker not found!")); + } + + self.worker_extra_info.remove(&worker.worker_id); + if let Some(txn_id) = &worker.transaction_id { + self.available_transactional_ids.push_back(*txn_id); + } + Ok(WorkerInfo(worker, property).into()) + } + + pub fn heartbeat( + &mut self, + worker_id: WorkerId, + ttl: Duration, + info: Vec, + ) { + if let Some(worker_info) = self.worker_extra_info.get_mut(&worker_id) { + worker_info.update_ttl(ttl); + worker_info.update_hummock_info(info); + } + } + + pub async fn list_workers( + &self, + worker_type: WorkerType, + worker_status: Option, + ) -> MetaResult> { + let workers = if let Some(status) = worker_status { + Worker::find() + .filter( + worker::Column::WorkerType + .eq(worker_type) + .and(worker::Column::Status.eq(status)), + ) + .find_also_related(WorkerProperty) + .all(&self.db) + .await? + } else { + Worker::find() + .filter(worker::Column::WorkerType.eq(worker_type)) + .find_also_related(WorkerProperty) + .all(&self.db) + .await? + }; + + Ok(workers + .into_iter() + .map(|(worker, property)| WorkerInfo(worker, property).into()) + .collect_vec()) + } + + pub async fn list_active_streaming_workers(&self) -> MetaResult> { + let workers = Worker::find() + .filter( + worker::Column::WorkerType + .eq(WorkerType::ComputeNode) + .and(worker::Column::Status.eq(WorkerStatus::Running)), + ) + .inner_join(WorkerProperty) + .select_also(WorkerProperty) + .filter(worker_property::Column::IsStreaming.eq(true)) + .all(&self.db) + .await?; + + Ok(workers + .into_iter() + .map(|(worker, property)| WorkerInfo(worker, property).into()) + .collect_vec()) + } + + pub async fn list_active_parallel_units(&self) -> MetaResult> { + let parallel_units: Vec<(WorkerId, I32Array)> = WorkerProperty::find() + .select_only() + .column(worker_property::Column::WorkerId) + .column(worker_property::Column::ParallelUnitIds) + .inner_join(Worker) + .filter(worker::Column::Status.eq(WorkerStatus::Running)) + .into_tuple() + .all(&self.db) + .await?; + Ok(parallel_units + .into_iter() + .flat_map(|(id, pu)| { + pu.0.into_iter().map(move |parallel_unit_id| ParallelUnit { + id: parallel_unit_id as _, + worker_node_id: id, + }) + }) + .collect_vec()) + } + + pub async fn list_active_serving_workers(&self) -> MetaResult> { + let workers = Worker::find() + .filter( + worker::Column::WorkerType + .eq(WorkerType::ComputeNode) + .and(worker::Column::Status.eq(WorkerStatus::Running)), + ) + .inner_join(WorkerProperty) + .select_also(WorkerProperty) + .filter(worker_property::Column::IsServing.eq(true)) + .all(&self.db) + .await?; + + Ok(workers + .into_iter() + .map(|(worker, property)| WorkerInfo(worker, property).into()) + .collect_vec()) + } + + pub async fn get_streaming_cluster_info(&self) -> MetaResult { + let mut streaming_workers = self.list_active_streaming_workers().await?; + + let unschedulable_worker_node = streaming_workers + .extract_if(|worker| { + worker + .property + .as_ref() + .map_or(false, |p| p.is_unschedulable) + }) + .collect_vec(); + + let active_workers: HashMap<_, _> = + streaming_workers.into_iter().map(|w| (w.id, w)).collect(); + + let active_parallel_units = active_workers + .values() + .flat_map(|worker| worker.parallel_units.iter().map(|p| (p.id, p.clone()))) + .collect(); + + let unschedulable_parallel_units = unschedulable_worker_node + .iter() + 
.flat_map(|worker| worker.parallel_units.iter().map(|p| (p.id, p.clone()))) + .collect(); + + Ok(StreamingClusterInfo { + worker_nodes: active_workers, + parallel_units: active_parallel_units, + unschedulable_parallel_units, + }) + } + + pub async fn get_worker_by_id(&self, worker_id: WorkerId) -> MetaResult> { + let worker = Worker::find_by_id(worker_id) + .find_also_related(WorkerProperty) + .one(&self.db) + .await?; + Ok(worker.map(|(w, p)| WorkerInfo(w, p).into())) + } + + pub fn get_worker_extra_info_by_id(&self, worker_id: WorkerId) -> Option { + self.worker_extra_info.get(&worker_id).cloned() + } +} + +#[cfg(test)] +#[cfg(not(madsim))] +mod tests { + use super::*; + + fn mock_worker_hosts_for_test(count: usize) -> Vec { + (0..count) + .map(|i| HostAddress { + host: "localhost".to_string(), + port: 5000 + i as i32, + }) + .collect_vec() + } + + #[tokio::test] + async fn test_cluster_controller() -> MetaResult<()> { + let env = MetaSrvEnv::for_test().await; + let cluster_ctl = ClusterController::new(env, Duration::from_secs(1)).await?; + + let parallelism_num = 4_usize; + let worker_count = 5_usize; + let property = AddNodeProperty { + worker_node_parallelism: parallelism_num as _, + is_streaming: true, + is_serving: true, + is_unschedulable: false, + }; + let hosts = mock_worker_hosts_for_test(worker_count); + let mut worker_ids = vec![]; + for host in &hosts { + worker_ids.push( + cluster_ctl + .add_worker(PbWorkerType::ComputeNode, host.clone(), property.clone()) + .await?, + ); + } + + // Since no worker is active, the parallel unit count should be 0. + assert_eq!(cluster_ctl.list_active_parallel_units().await?.len(), 0); + + for id in &worker_ids { + cluster_ctl.activate_worker(*id).await?; + } + let worker_cnt_map = cluster_ctl.count_worker_by_type().await?; + assert_eq!( + *worker_cnt_map.get(&WorkerType::ComputeNode).unwrap() as usize, + worker_count + ); + assert_eq!( + cluster_ctl.list_active_streaming_workers().await?.len(), + worker_count + ); + assert_eq!( + cluster_ctl.list_active_serving_workers().await?.len(), + worker_count + ); + assert_eq!( + cluster_ctl.list_active_parallel_units().await?.len(), + parallelism_num * worker_count + ); + + // re-register existing worker node with larger parallelism and change its serving mode. + let mut new_property = property.clone(); + new_property.worker_node_parallelism = (parallelism_num * 2) as _; + new_property.is_serving = false; + cluster_ctl + .add_worker(PbWorkerType::ComputeNode, hosts[0].clone(), new_property) + .await?; + + assert_eq!( + cluster_ctl.list_active_streaming_workers().await?.len(), + worker_count + ); + assert_eq!( + cluster_ctl.list_active_serving_workers().await?.len(), + worker_count - 1 + ); + let parallel_units = cluster_ctl.list_active_parallel_units().await?; + assert!(parallel_units.iter().map(|pu| pu.id).all_unique()); + assert_eq!(parallel_units.len(), parallelism_num * (worker_count + 1)); + + // delete workers. 
+ for host in hosts { + cluster_ctl.delete_worker(host).await?; + } + assert_eq!(cluster_ctl.list_active_streaming_workers().await?.len(), 0); + assert_eq!(cluster_ctl.list_active_serving_workers().await?.len(), 0); + assert_eq!(cluster_ctl.list_active_parallel_units().await?.len(), 0); + + Ok(()) + } + + #[tokio::test] + async fn test_update_schedulability() -> MetaResult<()> { + let env = MetaSrvEnv::for_test().await; + let cluster_ctl = ClusterController::new(env, Duration::from_secs(1)).await?; + + let host = HostAddress { + host: "localhost".to_string(), + port: 5001, + }; + let mut property = AddNodeProperty { + worker_node_parallelism: 4, + is_streaming: true, + is_serving: true, + is_unschedulable: false, + }; + let worker_id = cluster_ctl + .add_worker(PbWorkerType::ComputeNode, host.clone(), property.clone()) + .await?; + + cluster_ctl.activate_worker(worker_id).await?; + cluster_ctl + .update_schedulability(vec![worker_id], Schedulability::Unschedulable) + .await?; + + let workers = cluster_ctl.list_active_streaming_workers().await?; + assert_eq!(workers.len(), 1); + assert!(workers[0].property.as_ref().unwrap().is_unschedulable); + + // re-register existing worker node and change its serving mode, the schedulable state should not be changed. + property.is_unschedulable = false; + property.is_serving = false; + let new_worker_id = cluster_ctl + .add_worker(PbWorkerType::ComputeNode, host.clone(), property) + .await?; + assert_eq!(worker_id, new_worker_id); + + let workers = cluster_ctl.list_active_streaming_workers().await?; + assert_eq!(workers.len(), 1); + assert!(workers[0].property.as_ref().unwrap().is_unschedulable); + + cluster_ctl.delete_worker(host).await?; + + Ok(()) + } +} diff --git a/src/meta/src/controller/mod.rs b/src/meta/src/controller/mod.rs new file mode 100644 index 0000000000000..07793e30a17fe --- /dev/null +++ b/src/meta/src/controller/mod.rs @@ -0,0 +1,267 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::anyhow; +use risingwave_common::util::epoch::Epoch; +use risingwave_pb::catalog::connection::PbInfo as PbConnectionInfo; +use risingwave_pb::catalog::source::PbOptionalAssociatedTableId; +use risingwave_pb::catalog::table::{PbOptionalAssociatedSourceId, PbTableType}; +use risingwave_pb::catalog::{ + PbConnection, PbCreateType, PbDatabase, PbHandleConflictBehavior, PbIndex, PbSchema, PbSink, + PbSinkType, PbSource, PbStreamJobStatus, PbTable, PbView, +}; +use sea_orm::{ActiveValue, DatabaseConnection, ModelTrait}; + +use crate::model_v2::{connection, database, index, object, schema, sink, source, table, view}; +use crate::MetaError; + +#[allow(dead_code)] +pub mod catalog; +pub mod cluster; +pub mod rename; +pub mod system_param; +pub mod utils; + +// todo: refine the error transform. 
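// At present every `DbErr` is wrapped through `anyhow`; the SQL-level error is unwrapped first
// when available so that constraint-violation messages are preserved.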
+impl From for MetaError { + fn from(err: sea_orm::DbErr) -> Self { + if let Some(err) = err.sql_err() { + return anyhow!(err).into(); + } + anyhow!(err).into() + } +} + +#[derive(Clone)] +pub struct SqlMetaStore { + pub conn: DatabaseConnection, +} + +impl SqlMetaStore { + pub fn new(conn: DatabaseConnection) -> Self { + Self { conn } + } + + #[cfg(any(test, feature = "test"))] + #[cfg(not(madsim))] + pub async fn for_test() -> Self { + use model_migration::{Migrator, MigratorTrait}; + let conn = sea_orm::Database::connect("sqlite::memory:").await.unwrap(); + Migrator::up(&conn, None).await.unwrap(); + Self { conn } + } +} + +pub struct ObjectModel(M, object::Model); + +impl From> for PbDatabase { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.database_id, + name: value.0.name, + owner: value.1.owner_id, + } + } +} + +impl From for database::ActiveModel { + fn from(db: PbDatabase) -> Self { + Self { + database_id: ActiveValue::Set(db.id), + name: ActiveValue::Set(db.name), + } + } +} + +impl From for schema::ActiveModel { + fn from(schema: PbSchema) -> Self { + Self { + schema_id: ActiveValue::Set(schema.id), + name: ActiveValue::Set(schema.name), + } + } +} + +impl From> for PbSchema { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.schema_id, + name: value.0.name, + database_id: value.1.database_id.unwrap(), + owner: value.1.owner_id, + } + } +} + +impl From> for PbTable { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.table_id, + schema_id: value.1.schema_id.unwrap(), + database_id: value.1.database_id.unwrap(), + name: value.0.name, + columns: value.0.columns.0, + pk: value.0.pk.0, + dependent_relations: vec![], // todo: deprecate it. + table_type: PbTableType::from(value.0.table_type) as _, + distribution_key: value.0.distribution_key.0, + stream_key: value.0.stream_key.0, + append_only: value.0.append_only, + owner: value.1.owner_id, + properties: value.0.properties.0, + fragment_id: value.0.fragment_id as u32, + vnode_col_index: value.0.vnode_col_index, + row_id_index: value.0.row_id_index, + value_indices: value.0.value_indices.0, + definition: value.0.definition, + handle_pk_conflict_behavior: PbHandleConflictBehavior::from( + value.0.handle_pk_conflict_behavior, + ) as _, + read_prefix_len_hint: value.0.read_prefix_len_hint, + watermark_indices: value.0.watermark_indices.0, + dist_key_in_pk: value.0.dist_key_in_pk.0, + dml_fragment_id: value.0.dml_fragment_id.map(|id| id as u32), + cardinality: value.0.cardinality.map(|cardinality| cardinality.0), + initialized_at_epoch: Some( + Epoch::from_unix_millis(value.1.initialized_at.timestamp_millis() as _).0, + ), + created_at_epoch: Some( + Epoch::from_unix_millis(value.1.created_at.timestamp_millis() as _).0, + ), + cleaned_by_watermark: value.0.cleaned_by_watermark, + stream_job_status: PbStreamJobStatus::from(value.0.job_status) as _, + create_type: PbCreateType::from(value.0.create_type) as _, + version: Some(value.0.version.0), + optional_associated_source_id: value + .0 + .optional_associated_source_id + .map(PbOptionalAssociatedSourceId::AssociatedSourceId), + } + } +} + +impl From> for PbSource { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.source_id, + schema_id: value.1.schema_id.unwrap(), + database_id: value.1.database_id.unwrap(), + name: value.0.name, + row_id_index: value.0.row_id_index, + columns: value.0.columns.0, + pk_column_ids: value.0.pk_column_ids.0, + properties: value.0.properties.0, + owner: value.1.owner_id, + info: 
value.0.source_info.map(|info| info.0), + watermark_descs: value.0.watermark_descs.0, + definition: value.0.definition, + connection_id: value.0.connection_id, + // todo: using the timestamp from the database directly. + initialized_at_epoch: Some( + Epoch::from_unix_millis(value.1.initialized_at.timestamp_millis() as _).0, + ), + created_at_epoch: Some( + Epoch::from_unix_millis(value.1.created_at.timestamp_millis() as _).0, + ), + version: value.0.version, + optional_associated_table_id: value + .0 + .optional_associated_table_id + .map(PbOptionalAssociatedTableId::AssociatedTableId), + } + } +} + +impl From> for PbSink { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.sink_id, + schema_id: value.1.schema_id.unwrap(), + database_id: value.1.database_id.unwrap(), + name: value.0.name, + columns: value.0.columns.0, + plan_pk: value.0.plan_pk.0, + dependent_relations: vec![], // todo: deprecate it. + distribution_key: value.0.distribution_key.0, + downstream_pk: value.0.downstream_pk.0, + sink_type: PbSinkType::from(value.0.sink_type) as _, + owner: value.1.owner_id, + properties: value.0.properties.0, + definition: value.0.definition, + connection_id: value.0.connection_id, + initialized_at_epoch: Some( + Epoch::from_unix_millis(value.1.initialized_at.timestamp_millis() as _).0, + ), + created_at_epoch: Some( + Epoch::from_unix_millis(value.1.created_at.timestamp_millis() as _).0, + ), + db_name: value.0.db_name, + sink_from_name: value.0.sink_from_name, + stream_job_status: PbStreamJobStatus::from(value.0.job_status) as _, + format_desc: value.0.sink_format_desc.map(|desc| desc.0), + } + } +} + +impl From> for PbIndex { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.index_id, + schema_id: value.1.schema_id.unwrap(), + database_id: value.1.database_id.unwrap(), + name: value.0.name, + owner: value.1.owner_id, + index_table_id: value.0.index_table_id, + primary_table_id: value.0.primary_table_id, + index_item: value.0.index_items.0, + original_columns: value.0.original_columns.0, + initialized_at_epoch: Some( + Epoch::from_unix_millis(value.1.initialized_at.timestamp_millis() as _).0, + ), + created_at_epoch: Some( + Epoch::from_unix_millis(value.1.created_at.timestamp_millis() as _).0, + ), + stream_job_status: PbStreamJobStatus::from(value.0.job_status) as _, + } + } +} + +impl From> for PbView { + fn from(value: ObjectModel) -> Self { + Self { + id: value.0.view_id, + schema_id: value.1.schema_id.unwrap(), + database_id: value.1.database_id.unwrap(), + name: value.0.name, + owner: value.1.owner_id, + properties: value.0.properties.0, + sql: value.0.definition, + dependent_relations: vec![], // todo: deprecate it. + columns: value.0.columns.0, + } + } +} + +impl From> for PbConnection { + fn from(value: ObjectModel) -> Self { + Self { + id: value.1.oid, + schema_id: value.1.schema_id.unwrap(), + database_id: value.1.database_id.unwrap(), + name: value.0.name, + owner: value.1.owner_id, + info: Some(PbConnectionInfo::PrivateLinkService(value.0.info.0)), + } + } +} diff --git a/src/meta/src/controller/rename.rs b/src/meta/src/controller/rename.rs new file mode 100644 index 0000000000000..254565efb391c --- /dev/null +++ b/src/meta/src/controller/rename.rs @@ -0,0 +1,430 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use itertools::Itertools; +use risingwave_common::util::column_index_mapping::ColIndexMapping; +use risingwave_pb::expr::expr_node::RexNode; +use risingwave_pb::expr::{ExprNode, FunctionCall, UserDefinedFunction}; +use risingwave_sqlparser::ast::{ + Array, CreateSink, CreateSinkStatement, CreateSourceStatement, Distinct, Expr, Function, + FunctionArg, FunctionArgExpr, Ident, ObjectName, Query, SelectItem, SetExpr, Statement, + TableAlias, TableFactor, TableWithJoins, +}; +use risingwave_sqlparser::parser::Parser; + +/// `alter_relation_rename` renames a relation to a new name in its `Create` statement, and returns +/// the updated definition raw sql. Note that the `definition` must be a `Create` statement and the +/// `new_name` must be a valid identifier, it should be validated before calling this function. To +/// update all relations that depend on the renamed one, use `alter_relation_rename_refs`. +pub fn alter_relation_rename(definition: &str, new_name: &str) -> String { + // This happens when we try to rename a table that's created by `CREATE TABLE AS`. Remove it + // when we support `SHOW CREATE TABLE` for `CREATE TABLE AS`. + if definition.is_empty() { + tracing::warn!("found empty definition when renaming relation, ignored."); + return definition.into(); + } + let ast = Parser::parse_sql(definition).expect("failed to parse relation definition"); + let mut stmt = ast + .into_iter() + .exactly_one() + .expect("should contains only one statement"); + + match &mut stmt { + Statement::CreateTable { name, .. } + | Statement::CreateView { name, .. } + | Statement::CreateIndex { name, .. } + | Statement::CreateSource { + stmt: CreateSourceStatement { + source_name: name, .. + }, + } + | Statement::CreateSink { + stmt: CreateSinkStatement { + sink_name: name, .. + }, + } => replace_table_name(name, new_name), + _ => unreachable!(), + }; + + stmt.to_string() +} + +/// `alter_relation_rename_refs` updates all references of renamed-relation in the definition of +/// target relation's `Create` statement. +pub fn alter_relation_rename_refs(definition: &str, from: &str, to: &str) -> String { + let ast = Parser::parse_sql(definition).expect("failed to parse relation definition"); + let mut stmt = ast + .into_iter() + .exactly_one() + .expect("should contains only one statement"); + + match &mut stmt { + Statement::CreateTable { + query: Some(query), .. + } + | Statement::CreateView { query, .. } + | Statement::Query(query) // Used by view, actually we store a query as the definition of view. + | Statement::CreateSink { + stmt: + CreateSinkStatement { + sink_from: CreateSink::AsQuery(query), + .. + }, + } => { + QueryRewriter::rewrite_query(query, from, to); + } + Statement::CreateIndex { table_name, .. } + | Statement::CreateSink { + stmt: + CreateSinkStatement { + sink_from: CreateSink::From(table_name), + .. + }, + } => replace_table_name(table_name, to), + _ => unreachable!(), + }; + stmt.to_string() +} + +/// Replace the last ident in the `table_name` with the given name, the object name is ensured to be +/// non-empty. e.g. `schema.table` or `database.schema.table`. 
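/// For example, renaming to `bar` turns `db.schema.foo` into `db.schema.bar`.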
+fn replace_table_name(table_name: &mut ObjectName, to: &str) { + let idx = table_name.0.len() - 1; + table_name.0[idx] = Ident::new_unchecked(to); +} + +/// `QueryRewriter` is a visitor that updates all references of relation named `from` to `to` in the +/// given query, which is the part of create statement of `relation`. +struct QueryRewriter<'a> { + from: &'a str, + to: &'a str, +} + +impl QueryRewriter<'_> { + fn rewrite_query(query: &mut Query, from: &str, to: &str) { + let rewriter = QueryRewriter { from, to }; + rewriter.visit_query(query) + } + + /// Visit the query and update all references of relation named `from` to `to`. + fn visit_query(&self, query: &mut Query) { + if let Some(with) = &mut query.with { + for cte_table in &mut with.cte_tables { + self.visit_query(&mut cte_table.query); + } + } + self.visit_set_expr(&mut query.body); + for expr in &mut query.order_by { + self.visit_expr(&mut expr.expr); + } + } + + /// Visit table factor and update all references of relation named `from` to `to`. + /// Rewrite idents(i.e. `schema.table`, `table`) that contains the old name in the + /// following pattern: + /// 1. `FROM a` to `FROM new_a AS a` + /// 2. `FROM a AS b` to `FROM new_a AS b` + /// + /// So that we DON'T have to: + /// 1. rewrite the select and expr part like `schema.table.column`, `table.column`, + /// `alias.column` etc. + /// 2. handle the case that the old name is used as alias. + /// 3. handle the case that the new name is used as alias. + fn visit_table_factor(&self, table_factor: &mut TableFactor) { + match table_factor { + TableFactor::Table { name, alias, .. } => { + let idx = name.0.len() - 1; + if name.0[idx].real_value() == self.from { + if alias.is_none() { + *alias = Some(TableAlias { + name: Ident::new_unchecked(self.from), + columns: vec![], + }); + } + name.0[idx] = Ident::new_unchecked(self.to); + } + } + TableFactor::Derived { subquery, .. } => self.visit_query(subquery), + TableFactor::TableFunction { args, .. } => { + for arg in args { + self.visit_function_args(arg); + } + } + TableFactor::NestedJoin(table_with_joins) => { + self.visit_table_with_joins(table_with_joins); + } + } + } + + /// Visit table with joins and update all references of relation named `from` to `to`. + fn visit_table_with_joins(&self, table_with_joins: &mut TableWithJoins) { + self.visit_table_factor(&mut table_with_joins.relation); + for join in &mut table_with_joins.joins { + self.visit_table_factor(&mut join.relation); + } + } + + /// Visit query body expression and update all references. + fn visit_set_expr(&self, set_expr: &mut SetExpr) { + match set_expr { + SetExpr::Select(select) => { + if let Distinct::DistinctOn(exprs) = &mut select.distinct { + for expr in exprs { + self.visit_expr(expr); + } + } + for select_item in &mut select.projection { + self.visit_select_item(select_item); + } + for from_item in &mut select.from { + self.visit_table_with_joins(from_item); + } + if let Some(where_clause) = &mut select.selection { + self.visit_expr(where_clause); + } + for expr in &mut select.group_by { + self.visit_expr(expr); + } + if let Some(having) = &mut select.having { + self.visit_expr(having); + } + } + SetExpr::Query(query) => self.visit_query(query), + SetExpr::SetOperation { left, right, .. } => { + self.visit_set_expr(left); + self.visit_set_expr(right); + } + SetExpr::Values(_) => {} + } + } + + /// Visit function arguments and update all references. 
+ fn visit_function_args(&self, function_args: &mut FunctionArg) { + match function_args { + FunctionArg::Unnamed(arg) | FunctionArg::Named { arg, .. } => match arg { + FunctionArgExpr::Expr(expr) | FunctionArgExpr::ExprQualifiedWildcard(expr, _) => { + self.visit_expr(expr) + } + FunctionArgExpr::QualifiedWildcard(_, None) | FunctionArgExpr::Wildcard(None) => {} + FunctionArgExpr::QualifiedWildcard(_, Some(exprs)) + | FunctionArgExpr::Wildcard(Some(exprs)) => { + for expr in exprs { + self.visit_expr(expr); + } + } + }, + } + } + + /// Visit function and update all references. + fn visit_function(&self, function: &mut Function) { + for arg in &mut function.args { + self.visit_function_args(arg); + } + } + + /// Visit expression and update all references. + fn visit_expr(&self, expr: &mut Expr) { + match expr { + Expr::FieldIdentifier(expr, ..) + | Expr::IsNull(expr) + | Expr::IsNotNull(expr) + | Expr::IsTrue(expr) + | Expr::IsNotTrue(expr) + | Expr::IsFalse(expr) + | Expr::IsNotFalse(expr) + | Expr::IsUnknown(expr) + | Expr::IsNotUnknown(expr) + | Expr::IsJson { expr, .. } + | Expr::InList { expr, .. } + | Expr::SomeOp(expr) + | Expr::AllOp(expr) + | Expr::UnaryOp { expr, .. } + | Expr::Cast { expr, .. } + | Expr::TryCast { expr, .. } + | Expr::AtTimeZone { + timestamp: expr, .. + } + | Expr::Extract { expr, .. } + | Expr::Substring { expr, .. } + | Expr::Overlay { expr, .. } + | Expr::Trim { expr, .. } + | Expr::Nested(expr) + | Expr::ArrayIndex { obj: expr, .. } + | Expr::ArrayRangeIndex { obj: expr, .. } => self.visit_expr(expr), + + Expr::Position { substring, string } => { + self.visit_expr(substring); + self.visit_expr(string); + } + + Expr::InSubquery { expr, subquery, .. } => { + self.visit_expr(expr); + self.visit_query(subquery); + } + Expr::Between { + expr, low, high, .. + } => { + self.visit_expr(expr); + self.visit_expr(low); + self.visit_expr(high); + } + + Expr::IsDistinctFrom(expr1, expr2) + | Expr::IsNotDistinctFrom(expr1, expr2) + | Expr::BinaryOp { + left: expr1, + right: expr2, + .. + } => { + self.visit_expr(expr1); + self.visit_expr(expr2); + } + Expr::Function(function) => self.visit_function(function), + Expr::Exists(query) | Expr::Subquery(query) | Expr::ArraySubquery(query) => { + self.visit_query(query) + } + + Expr::GroupingSets(exprs_vec) | Expr::Cube(exprs_vec) | Expr::Rollup(exprs_vec) => { + for exprs in exprs_vec { + for expr in exprs { + self.visit_expr(expr); + } + } + } + + Expr::Row(exprs) | Expr::Array(Array { elem: exprs, .. }) => { + for expr in exprs { + self.visit_expr(expr); + } + } + + Expr::LambdaFunction { body, args: _ } => self.visit_expr(body), + + // No need to visit. + Expr::Identifier(_) + | Expr::CompoundIdentifier(_) + | Expr::Collate { .. } + | Expr::Value(_) + | Expr::Parameter { .. } + | Expr::TypedString { .. } + | Expr::Case { .. } => {} + } + } + + /// Visit select item and update all references. + fn visit_select_item(&self, select_item: &mut SelectItem) { + match select_item { + SelectItem::UnnamedExpr(expr) + | SelectItem::ExprQualifiedWildcard(expr, _) + | SelectItem::ExprWithAlias { expr, .. 
} => self.visit_expr(expr), + SelectItem::QualifiedWildcard(_, None) | SelectItem::Wildcard(None) => {} + SelectItem::QualifiedWildcard(_, Some(exprs)) | SelectItem::Wildcard(Some(exprs)) => { + for expr in exprs { + self.visit_expr(expr); + } + } + } + } +} + +pub struct ReplaceTableExprRewriter { + pub table_col_index_mapping: ColIndexMapping, +} + +impl ReplaceTableExprRewriter { + pub fn rewrite_expr(&self, expr: &mut ExprNode) { + let rex_node = expr.rex_node.as_mut().unwrap(); + match rex_node { + RexNode::InputRef(input_col_idx) => { + *input_col_idx = self.table_col_index_mapping.map(*input_col_idx as usize) as u32 + } + RexNode::Constant(_) => {} + RexNode::Udf(udf) => self.rewrite_udf(udf), + RexNode::FuncCall(function_call) => self.rewrite_function_call(function_call), + RexNode::Now(_) => {} + } + } + + fn rewrite_udf(&self, udf: &mut UserDefinedFunction) { + udf.children + .iter_mut() + .for_each(|expr| self.rewrite_expr(expr)); + } + + fn rewrite_function_call(&self, function_call: &mut FunctionCall) { + function_call + .children + .iter_mut() + .for_each(|expr| self.rewrite_expr(expr)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_alter_table_rename() { + let definition = "CREATE TABLE foo (a int, b int)"; + let new_name = "bar"; + let expected = "CREATE TABLE bar (a INT, b INT)"; + let actual = alter_relation_rename(definition, new_name); + assert_eq!(expected, actual); + } + + #[test] + fn test_rename_index_refs() { + let definition = "CREATE INDEX idx1 ON foo(v1 DESC, v2)"; + let from = "foo"; + let to = "bar"; + let expected = "CREATE INDEX idx1 ON bar(v1 DESC, v2)"; + let actual = alter_relation_rename_refs(definition, from, to); + assert_eq!(expected, actual); + } + + #[test] + fn test_rename_sink_refs() { + let definition = + "CREATE SINK sink_t FROM foo WITH (connector = 'kafka', format = 'append_only')"; + let from = "foo"; + let to = "bar"; + let expected = + "CREATE SINK sink_t FROM bar WITH (connector = 'kafka', format = 'append_only')"; + let actual = alter_relation_rename_refs(definition, from, to); + assert_eq!(expected, actual); + } + + #[test] + fn test_rename_with_alias_refs() { + let definition = + "CREATE MATERIALIZED VIEW mv1 AS SELECT foo.v1 AS m1v, foo.v2 AS m2v FROM foo"; + let from = "foo"; + let to = "bar"; + let expected = + "CREATE MATERIALIZED VIEW mv1 AS SELECT foo.v1 AS m1v, foo.v2 AS m2v FROM bar AS foo"; + let actual = alter_relation_rename_refs(definition, from, to); + assert_eq!(expected, actual); + + let definition = "CREATE MATERIALIZED VIEW mv1 AS SELECT foo.v1 AS m1v, (foo.v2).v3 AS m2v FROM foo WHERE foo.v1 = 1 AND (foo.v2).v3 IS TRUE"; + let expected = "CREATE MATERIALIZED VIEW mv1 AS SELECT foo.v1 AS m1v, (foo.v2).v3 AS m2v FROM bar AS foo WHERE foo.v1 = 1 AND (foo.v2).v3 IS TRUE"; + let actual = alter_relation_rename_refs(definition, from, to); + assert_eq!(expected, actual); + + let definition = "CREATE MATERIALIZED VIEW mv1 AS SELECT bar.v1 AS m1v, (bar.v2).v3 AS m2v FROM foo AS bar WHERE bar.v1 = 1"; + let expected = "CREATE MATERIALIZED VIEW mv1 AS SELECT bar.v1 AS m1v, (bar.v2).v3 AS m2v FROM bar AS bar WHERE bar.v1 = 1"; + let actual = alter_relation_rename_refs(definition, from, to); + assert_eq!(expected, actual); + } +} diff --git a/src/meta/src/controller/system_param.rs b/src/meta/src/controller/system_param.rs new file mode 100644 index 0000000000000..0656da5ea9a46 --- /dev/null +++ b/src/meta/src/controller/system_param.rs @@ -0,0 +1,316 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use anyhow::anyhow; +use risingwave_common::system_param::reader::SystemParamsReader; +use risingwave_common::system_param::{ + check_missing_params, derive_missing_fields, set_system_param, +}; +use risingwave_common::{for_all_params, key_of}; +use risingwave_pb::meta::subscribe_response::{Info, Operation}; +use risingwave_pb::meta::PbSystemParams; +use sea_orm::{ActiveModelTrait, ActiveValue, DatabaseConnection, EntityTrait, TransactionTrait}; +use tokio::sync::oneshot::Sender; +use tokio::sync::RwLock; +use tokio::task::JoinHandle; +use tracing::info; + +use crate::controller::SqlMetaStore; +use crate::manager::{LocalNotification, NotificationManagerRef}; +use crate::model_v2::prelude::SystemParameter; +use crate::model_v2::system_parameter; +use crate::{MetaError, MetaResult}; + +pub type SystemParamsControllerRef = Arc; + +pub struct SystemParamsController { + db: DatabaseConnection, + // Notify workers and local subscribers of parameter change. + notification_manager: NotificationManagerRef, + // Cached parameters. + params: RwLock, +} + +/// Derive system params from db models. +macro_rules! impl_system_params_from_db { + ($({ $field:ident, $type:ty, $default:expr, $is_mutable:expr },)*) => { + /// Try to deserialize deprecated fields as well. + /// Warn if there are unrecognized fields. + pub fn system_params_from_db(mut models: Vec) -> MetaResult { + let mut params = PbSystemParams::default(); + models.retain(|model| { + match model.name.as_str() { + $( + key_of!($field) => { + params.$field = Some(model.value.parse::<$type>().unwrap()); + false + } + )* + _ => true, + } + }); + derive_missing_fields(&mut params); + if !models.is_empty() { + let unrecognized_params = models.into_iter().map(|model| model.name).collect::>(); + tracing::warn!("unrecognized system params {:?}", unrecognized_params); + } + Ok(params) + } + }; +} + +/// Derive serialization to db models. +macro_rules! impl_system_params_to_models { + ($({ $field:ident, $type:ty, $default:expr, $is_mutable:expr },)*) => { + #[allow(clippy::vec_init_then_push)] + pub fn system_params_to_model(params: &PbSystemParams) -> MetaResult> { + check_missing_params(params).map_err(|e| anyhow!(e))?; + let mut models = Vec::new(); + $( + let value = params.$field.as_ref().unwrap().to_string(); + models.push(system_parameter::ActiveModel { + name: ActiveValue::Set(key_of!($field).to_string()), + value: ActiveValue::Set(value), + is_mutable: ActiveValue::Set($is_mutable), + description: ActiveValue::Set(None), + }); + )* + Ok(models) + } + }; +} + +// For each field in `persisted` and `init` +// 1. Some, None: The persisted field is deprecated, so just ignore it. +// 2. Some, Some: Check equality and warn if they differ. +// 3. None, Some: A new version of RW cluster is launched for the first time and newly introduced +// params are not set. Use init value. +// 4. 
None, None: A new version of RW cluster is launched for the first time and newly introduced +// params are not set. The new field is not initialized either, just leave it as `None`. +macro_rules! impl_merge_params { + ($({ $field:ident, $type:ty, $default:expr, $is_mutable:expr },)*) => { + fn merge_params(mut persisted: PbSystemParams, init: PbSystemParams) -> PbSystemParams { + $( + match (persisted.$field.as_ref(), init.$field) { + (Some(persisted), Some(init)) => { + if persisted != &init { + tracing::warn!( + "The initializing value of \"{:?}\" ({}) differ from persisted ({}), using persisted value", + key_of!($field), + init, + persisted + ); + } + }, + (None, Some(init)) => persisted.$field = Some(init), + _ => {}, + } + )* + persisted + } + }; +} + +for_all_params!(impl_system_params_from_db); +for_all_params!(impl_merge_params); +for_all_params!(impl_system_params_to_models); + +impl SystemParamsController { + pub async fn new( + sql_meta_store: SqlMetaStore, + notification_manager: NotificationManagerRef, + init_params: PbSystemParams, + ) -> MetaResult { + let db = sql_meta_store.conn; + let params = SystemParameter::find().all(&db).await?; + let params = merge_params(system_params_from_db(params)?, init_params); + + info!("system parameters: {:?}", params); + check_missing_params(¶ms).map_err(|e| anyhow!(e))?; + + let ctl = Self { + db, + notification_manager, + params: RwLock::new(params), + }; + // flush to db. + ctl.flush_params().await?; + + Ok(ctl) + } + + pub async fn get_pb_params(&self) -> PbSystemParams { + self.params.read().await.clone() + } + + pub async fn get_params(&self) -> SystemParamsReader { + self.params.read().await.clone().into() + } + + async fn flush_params(&self) -> MetaResult<()> { + let params = self.params.read().await; + let models = system_params_to_model(¶ms)?; + let txn = self.db.begin().await?; + // delete all params first and then insert all params. It follows the same logic + // as the old code, we'd better change it to another way later to keep consistency. + SystemParameter::delete_many().exec(&txn).await?; + + for model in models { + model.insert(&txn).await?; + } + txn.commit().await?; + Ok(()) + } + + pub async fn set_param(&self, name: &str, value: Option) -> MetaResult { + let mut params_guard = self.params.write().await; + + let Some(param) = SystemParameter::find_by_id(name.to_string()) + .one(&self.db) + .await? + else { + return Err(MetaError::system_param(format!( + "unrecognized system parameter {}", + name + ))); + }; + let mut params = params_guard.clone(); + let mut param: system_parameter::ActiveModel = param.into(); + param.value = ActiveValue::Set( + set_system_param(&mut params, name, value).map_err(MetaError::system_param)?, + ); + param.update(&self.db).await?; + *params_guard = params.clone(); + + // Sync params to other managers on the meta node only once, since it's infallible. + self.notification_manager + .notify_local_subscribers(LocalNotification::SystemParamsChange(params.clone().into())) + .await; + + // Sync params to worker nodes. + self.notify_workers(¶ms).await; + + Ok(params) + } + + // Periodically sync params to worker nodes. 
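// The notifier below follows the common tokio "interval tick + oneshot shutdown" shape;
// a minimal, self-contained sketch of that pattern (illustrative only; the variable
// names and the period are assumptions):
//
//     let (shutdown_tx, mut shutdown_rx) = tokio::sync::oneshot::channel::<()>();
//     let handle = tokio::spawn(async move {
//         let mut interval = tokio::time::interval(std::time::Duration::from_secs(5));
//         interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
//         loop {
//             tokio::select! {
//                 _ = interval.tick() => { /* notify workers here */ }
//                 _ = &mut shutdown_rx => break,
//             }
//         }
//     });
//     // later, on shutdown: let _ = shutdown_tx.send(()); handle.await.unwrap();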
+ pub fn start_params_notifier( + system_params_controller: Arc, + ) -> (JoinHandle<()>, Sender<()>) { + const NOTIFY_INTERVAL: Duration = Duration::from_millis(5000); + + let (shutdown_tx, mut shutdown_rx) = tokio::sync::oneshot::channel(); + let join_handle = tokio::spawn(async move { + let mut interval = tokio::time::interval(NOTIFY_INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + loop { + tokio::select! { + _ = interval.tick() => {}, + _ = &mut shutdown_rx => { + tracing::info!("System params notifier is stopped"); + return; + } + } + system_params_controller + .notify_workers(&*system_params_controller.params.read().await) + .await; + } + }); + + (join_handle, shutdown_tx) + } + + // Notify workers of parameter change. + async fn notify_workers(&self, params: &PbSystemParams) { + self.notification_manager + .notify_frontend(Operation::Update, Info::SystemParams(params.clone())) + .await; + self.notification_manager + .notify_compute(Operation::Update, Info::SystemParams(params.clone())) + .await; + self.notification_manager + .notify_compactor(Operation::Update, Info::SystemParams(params.clone())) + .await; + } +} + +#[cfg(test)] +mod tests { + use risingwave_common::system_param::system_params_for_test; + + use super::*; + use crate::manager::MetaSrvEnv; + + #[tokio::test] + #[cfg(not(madsim))] + async fn test_system_params() { + let env = MetaSrvEnv::for_test().await; + let meta_store = env.sql_meta_store().unwrap(); + let init_params = system_params_for_test(); + + // init system parameter controller as first launch. + let system_param_ctl = SystemParamsController::new( + meta_store.clone(), + env.notification_manager_ref(), + init_params.clone(), + ) + .await + .unwrap(); + let params = system_param_ctl.get_pb_params().await; + assert_eq!(params, system_params_for_test()); + + // set parameter. + let new_params = system_param_ctl + .set_param("pause_on_next_bootstrap", Some("true".into())) + .await + .unwrap(); + + // insert deprecated params. + let deprecated_param = system_parameter::ActiveModel { + name: ActiveValue::Set("deprecated_param".into()), + value: ActiveValue::Set("foo".into()), + is_mutable: ActiveValue::Set(true), + description: ActiveValue::Set(None), + }; + deprecated_param.insert(&system_param_ctl.db).await.unwrap(); + + // init system parameter controller as not first launch. + let system_param_ctl = SystemParamsController::new( + meta_store, + env.notification_manager_ref(), + init_params.clone(), + ) + .await + .unwrap(); + // check deprecated params are cleaned up. + assert!(SystemParameter::find_by_id("deprecated_param".to_string()) + .one(&system_param_ctl.db) + .await + .unwrap() + .is_none()); + // check new params are set. + let params = system_param_ctl.get_pb_params().await; + assert_eq!(params, new_params); + // check db consistency. + let models = SystemParameter::find() + .all(&system_param_ctl.db) + .await + .unwrap(); + let db_params = system_params_from_db(models).unwrap(); + assert_eq!(db_params, new_params); + } +} diff --git a/src/meta/src/controller/utils.rs b/src/meta/src/controller/utils.rs new file mode 100644 index 0000000000000..d36918db3820d --- /dev/null +++ b/src/meta/src/controller/utils.rs @@ -0,0 +1,356 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::anyhow; +use model_migration::WithQuery; +use risingwave_pb::catalog::{PbConnection, PbFunction}; +use sea_orm::sea_query::{ + Alias, CommonTableExpression, Expr, Query, QueryStatementBuilder, SelectStatement, UnionType, + WithClause, +}; +use sea_orm::{ + ColumnTrait, ConnectionTrait, DerivePartialModel, EntityTrait, FromQueryResult, JoinType, + Order, PaginatorTrait, QueryFilter, QuerySelect, RelationTrait, Statement, +}; + +use crate::model_v2::object::ObjectType; +use crate::model_v2::prelude::*; +use crate::model_v2::{ + connection, function, index, object, object_dependency, schema, sink, source, table, view, + DataTypeArray, DatabaseId, ObjectId, SchemaId, UserId, +}; +use crate::{MetaError, MetaResult}; + +/// This function will construct a query using recursive cte to find all objects[(id, `obj_type`)] that are used by the given object. +/// +/// # Examples +/// +/// ``` +/// use risingwave_meta::controller::utils::construct_obj_dependency_query; +/// use sea_orm::sea_query::*; +/// use sea_orm::*; +/// +/// let query = construct_obj_dependency_query(1); +/// +/// assert_eq!( +/// query.to_string(MysqlQueryBuilder), +/// r#"WITH RECURSIVE `used_by_object_ids` (`used_by`) AS (SELECT `used_by` FROM `object_dependency` WHERE `object_dependency`.`oid` = 1 UNION ALL (SELECT `object_dependency`.`used_by` FROM `object_dependency` INNER JOIN `used_by_object_ids` ON `used_by_object_ids`.`used_by` = `oid`)) SELECT DISTINCT `oid`, `obj_type`, `schema_id`, `database_id` FROM `used_by_object_ids` INNER JOIN `object` ON `used_by_object_ids`.`used_by` = `oid` ORDER BY `oid` DESC"# +/// ); +/// assert_eq!( +/// query.to_string(PostgresQueryBuilder), +/// r#"WITH RECURSIVE "used_by_object_ids" ("used_by") AS (SELECT "used_by" FROM "object_dependency" WHERE "object_dependency"."oid" = 1 UNION ALL (SELECT "object_dependency"."used_by" FROM "object_dependency" INNER JOIN "used_by_object_ids" ON "used_by_object_ids"."used_by" = "oid")) SELECT DISTINCT "oid", "obj_type", "schema_id", "database_id" FROM "used_by_object_ids" INNER JOIN "object" ON "used_by_object_ids"."used_by" = "oid" ORDER BY "oid" DESC"# +/// ); +/// assert_eq!( +/// query.to_string(SqliteQueryBuilder), +/// r#"WITH RECURSIVE "used_by_object_ids" ("used_by") AS (SELECT "used_by" FROM "object_dependency" WHERE "object_dependency"."oid" = 1 UNION ALL SELECT "object_dependency"."used_by" FROM "object_dependency" INNER JOIN "used_by_object_ids" ON "used_by_object_ids"."used_by" = "oid") SELECT DISTINCT "oid", "obj_type", "schema_id", "database_id" FROM "used_by_object_ids" INNER JOIN "object" ON "used_by_object_ids"."used_by" = "oid" ORDER BY "oid" DESC"# +/// ); +/// ``` +pub fn construct_obj_dependency_query(obj_id: ObjectId) -> WithQuery { + let cte_alias = Alias::new("used_by_object_ids"); + let cte_return_alias = Alias::new("used_by"); + + let mut base_query = SelectStatement::new() + .column(object_dependency::Column::UsedBy) + .from(ObjectDependency) + .and_where(object_dependency::Column::Oid.eq(obj_id)) + .to_owned(); + + let cte_referencing = Query::select() + .column((ObjectDependency, 
object_dependency::Column::UsedBy)) + .from(ObjectDependency) + .inner_join( + cte_alias.clone(), + Expr::col((cte_alias.clone(), cte_return_alias.clone())) + .equals(object_dependency::Column::Oid), + ) + .to_owned(); + + let common_table_expr = CommonTableExpression::new() + .query(base_query.union(UnionType::All, cte_referencing).to_owned()) + .column(cte_return_alias.clone()) + .table_name(cte_alias.clone()) + .to_owned(); + + SelectStatement::new() + .distinct() + .columns([ + object::Column::Oid, + object::Column::ObjType, + object::Column::SchemaId, + object::Column::DatabaseId, + ]) + .from(cte_alias.clone()) + .inner_join( + Object, + Expr::col((cte_alias, cte_return_alias.clone())).equals(object::Column::Oid), + ) + .order_by(object::Column::Oid, Order::Desc) + .to_owned() + .with( + WithClause::new() + .recursive(true) + .cte(common_table_expr) + .to_owned(), + ) + .to_owned() +} + +#[derive(Clone, DerivePartialModel, FromQueryResult)] +#[sea_orm(entity = "Object")] +pub struct PartialObject { + pub oid: ObjectId, + pub obj_type: ObjectType, + pub schema_id: Option, + pub database_id: Option, +} + +/// List all objects that are using the given one in a cascade way. It runs a recursive CTE to find all the dependencies. +pub async fn get_referring_objects_cascade( + obj_id: ObjectId, + db: &C, +) -> MetaResult> +where + C: ConnectionTrait, +{ + let query = construct_obj_dependency_query(obj_id); + let (sql, values) = query.build_any(&*db.get_database_backend().get_query_builder()); + let objects = PartialObject::find_by_statement(Statement::from_sql_and_values( + db.get_database_backend(), + sql, + values, + )) + .all(db) + .await?; + Ok(objects) +} + +/// `ensure_object_id` ensures the existence of target object in the cluster. +pub async fn ensure_object_id( + object_type: ObjectType, + obj_id: ObjectId, + db: &C, +) -> MetaResult<()> +where + C: ConnectionTrait, +{ + let count = Object::find_by_id(obj_id).count(db).await?; + if count == 0 { + return Err(MetaError::catalog_id_not_found( + object_type.as_str(), + obj_id, + )); + } + Ok(()) +} + +/// `ensure_user_id` ensures the existence of target user in the cluster. +pub async fn ensure_user_id(user_id: UserId, db: &C) -> MetaResult<()> +where + C: ConnectionTrait, +{ + let count = User::find_by_id(user_id).count(db).await?; + if count == 0 { + return Err(anyhow!("user {} was concurrently dropped", user_id).into()); + } + Ok(()) +} + +/// `check_function_signature_duplicate` checks whether the function name and its signature is already used in the target namespace. +pub async fn check_function_signature_duplicate( + pb_function: &PbFunction, + db: &C, +) -> MetaResult<()> +where + C: ConnectionTrait, +{ + let count = Function::find() + .inner_join(Object) + .filter( + object::Column::DatabaseId + .eq(pb_function.database_id as DatabaseId) + .and(object::Column::SchemaId.eq(pb_function.schema_id as SchemaId)) + .and(function::Column::Name.eq(&pb_function.name)) + .and(function::Column::ArgTypes.eq(DataTypeArray(pb_function.arg_types.clone()))), + ) + .count(db) + .await?; + if count > 0 { + assert_eq!(count, 1); + return Err(MetaError::catalog_duplicated("function", &pb_function.name)); + } + Ok(()) +} + +/// `check_connection_name_duplicate` checks whether the connection name is already used in the target namespace. 
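/// Concretely, it issues a `COUNT` over `connection` inner-joined with `object`,
/// filtered by database id, schema id and connection name, and maps a non-zero
/// count to `MetaError::catalog_duplicated`.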
+pub async fn check_connection_name_duplicate( + pb_connection: &PbConnection, + db: &C, +) -> MetaResult<()> +where + C: ConnectionTrait, +{ + let count = Connection::find() + .inner_join(Object) + .filter( + object::Column::DatabaseId + .eq(pb_connection.database_id as DatabaseId) + .and(object::Column::SchemaId.eq(pb_connection.schema_id as SchemaId)) + .and(connection::Column::Name.eq(&pb_connection.name)), + ) + .count(db) + .await?; + if count > 0 { + assert_eq!(count, 1); + return Err(MetaError::catalog_duplicated( + "connection", + &pb_connection.name, + )); + } + Ok(()) +} + +/// `check_relation_name_duplicate` checks whether the relation name is already used in the target namespace. +pub async fn check_relation_name_duplicate( + name: &str, + database_id: DatabaseId, + schema_id: SchemaId, + db: &C, +) -> MetaResult<()> +where + C: ConnectionTrait, +{ + macro_rules! check_duplicated { + ($obj_type:expr, $entity:ident, $table:ident) => { + let count = Object::find() + .inner_join($entity) + .filter( + object::Column::DatabaseId + .eq(Some(database_id)) + .and(object::Column::SchemaId.eq(Some(schema_id))) + .and($table::Column::Name.eq(name)), + ) + .count(db) + .await?; + if count != 0 { + return Err(MetaError::catalog_duplicated($obj_type.as_str(), name)); + } + }; + } + check_duplicated!(ObjectType::Table, Table, table); + check_duplicated!(ObjectType::Source, Source, source); + check_duplicated!(ObjectType::Sink, Sink, sink); + check_duplicated!(ObjectType::Index, Index, index); + check_duplicated!(ObjectType::View, View, view); + + Ok(()) +} + +/// `check_schema_name_duplicate` checks whether the schema name is already used in the target database. +pub async fn check_schema_name_duplicate( + name: &str, + database_id: DatabaseId, + db: &C, +) -> MetaResult<()> +where + C: ConnectionTrait, +{ + let count = Object::find() + .inner_join(Schema) + .filter( + object::Column::ObjType + .eq(ObjectType::Schema) + .and(object::Column::DatabaseId.eq(Some(database_id))) + .and(schema::Column::Name.eq(name)), + ) + .count(db) + .await?; + if count != 0 { + return Err(MetaError::catalog_duplicated("schema", name)); + } + + Ok(()) +} + +/// `ensure_object_not_refer` ensures that object are not used by any other ones except indexes. +pub async fn ensure_object_not_refer( + object_type: ObjectType, + object_id: ObjectId, + db: &C, +) -> MetaResult<()> +where + C: ConnectionTrait, +{ + // Ignore indexes. + let count = if object_type == ObjectType::Table { + ObjectDependency::find() + .join( + JoinType::InnerJoin, + object_dependency::Relation::Object1.def(), + ) + .filter( + object_dependency::Column::Oid + .eq(object_id) + .and(object::Column::ObjType.ne(ObjectType::Index)), + ) + .count(db) + .await? + } else { + ObjectDependency::find() + .filter(object_dependency::Column::Oid.eq(object_id)) + .count(db) + .await? + }; + if count != 0 { + return Err(MetaError::permission_denied(format!( + "{} used by {} other objects.", + object_type.as_str(), + count + ))); + } + Ok(()) +} + +/// List all objects that are using the given one. +pub async fn get_referring_objects(object_id: ObjectId, db: &C) -> MetaResult> +where + C: ConnectionTrait, +{ + let objs = ObjectDependency::find() + .filter(object_dependency::Column::Oid.eq(object_id)) + .join( + JoinType::InnerJoin, + object_dependency::Relation::Object1.def(), + ) + .into_partial_model() + .all(db) + .await?; + + Ok(objs) +} + +/// `ensure_schema_empty` ensures that the schema is empty, used by `DROP SCHEMA`. 
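/// It counts the `object` rows whose `schema_id` matches the given schema and
/// rejects the drop with a permission-denied error if any remain.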
+pub async fn ensure_schema_empty(schema_id: SchemaId, db: &C) -> MetaResult<()> +where + C: ConnectionTrait, +{ + let count = Object::find() + .filter(object::Column::SchemaId.eq(Some(schema_id))) + .count(db) + .await?; + if count != 0 { + return Err(MetaError::permission_denied("schema is not empty".into())); + } + + Ok(()) +} diff --git a/src/meta/src/dashboard/mod.rs b/src/meta/src/dashboard/mod.rs index 2ebc9d924f50b..6a10dd9c02bb3 100644 --- a/src/meta/src/dashboard/mod.rs +++ b/src/meta/src/dashboard/mod.rs @@ -57,10 +57,10 @@ pub(super) mod handlers { use axum::Json; use itertools::Itertools; use risingwave_common::bail; - use risingwave_common::heap_profiling::COLLAPSED_SUFFIX; + use risingwave_common_heap_profiling::COLLAPSED_SUFFIX; use risingwave_pb::catalog::table::TableType; use risingwave_pb::catalog::{Sink, Source, Table}; - use risingwave_pb::common::WorkerNode; + use risingwave_pb::common::{WorkerNode, WorkerType}; use risingwave_pb::meta::{ActorLocation, PbTableFragments}; use risingwave_pb::monitor_service::{ HeapProfilingResponse, ListHeapProfilingResponse, StackTraceResponse, @@ -101,12 +101,11 @@ pub(super) mod handlers { Path(ty): Path, Extension(srv): Extension, ) -> Result>> { - use risingwave_pb::common::WorkerType; let mut result = srv .cluster_manager .list_worker_node( - WorkerType::from_i32(ty) - .ok_or_else(|| anyhow!("invalid worker type")) + WorkerType::try_from(ty) + .map_err(|_| anyhow!("invalid worker type")) .map_err(err)?, None, ) @@ -198,6 +197,39 @@ pub(super) mod handlers { Ok(Json(table_fragments)) } + async fn dump_await_tree_inner( + worker_nodes: impl IntoIterator, + compute_clients: &ComputeClientPool, + ) -> Result> { + let mut all = Default::default(); + + fn merge(a: &mut StackTraceResponse, b: StackTraceResponse) { + a.actor_traces.extend(b.actor_traces); + a.rpc_traces.extend(b.rpc_traces); + a.compaction_task_traces.extend(b.compaction_task_traces); + } + + for worker_node in worker_nodes { + let client = compute_clients.get(worker_node).await.map_err(err)?; + let result = client.stack_trace().await.map_err(err)?; + + merge(&mut all, result); + } + + Ok(all.into()) + } + + pub async fn dump_await_tree_all( + Extension(srv): Extension, + ) -> Result> { + let worker_nodes = srv + .cluster_manager + .list_worker_node(WorkerType::ComputeNode, None) + .await; + + dump_await_tree_inner(&worker_nodes, &srv.compute_clients).await + } + pub async fn dump_await_tree( Path(worker_id): Path, Extension(srv): Extension, @@ -210,11 +242,7 @@ pub(super) mod handlers { .map_err(err)? 
.worker_node; - let client = srv.compute_clients.get(&worker_node).await.map_err(err)?; - - let result = client.stack_trace().await.map_err(err)?; - - Ok(result.into()) + dump_await_tree_inner(std::iter::once(&worker_node), &srv.compute_clients).await } pub async fn heap_profile( @@ -325,6 +353,7 @@ impl DashboardService { get(prometheus::list_prometheus_actor_back_pressure), ) .route("/monitor/await_tree/:worker_id", get(dump_await_tree)) + .route("/monitor/await_tree/", get(dump_await_tree_all)) .route("/monitor/dump_heap_profile/:worker_id", get(heap_profile)) .route( "/monitor/list_heap_profile/:worker_id", diff --git a/src/meta/src/dashboard/prometheus.rs b/src/meta/src/dashboard/prometheus.rs index 49431a29afd65..24709348c7865 100644 --- a/src/meta/src/dashboard/prometheus.rs +++ b/src/meta/src/dashboard/prometheus.rs @@ -134,7 +134,7 @@ pub async fn list_prometheus_actor_back_pressure( ) -> Result> { if let Some(ref client) = srv.prometheus_client { let now = SystemTime::now(); - let back_pressure_query = "rate(stream_actor_output_buffer_blocking_duration_ns{job=~\"compute\"}[60s]) / 1000000000"; + let back_pressure_query = "avg(rate(stream_actor_output_buffer_blocking_duration_ns[60s])) by (fragment_id, downstream_fragment_id) / 1000000000"; let result = client .query_range( back_pressure_query, diff --git a/src/meta/src/error.rs b/src/meta/src/error.rs index b056fea0e35e7..03323d53fa0af 100644 --- a/src/meta/src/error.rs +++ b/src/meta/src/error.rs @@ -20,7 +20,6 @@ use risingwave_common::error::BoxedError; use risingwave_connector::sink::SinkError; use risingwave_pb::PbFieldNotFound; use risingwave_rpc_client::error::RpcError; -use sqlx::Error; use crate::hummock::error::Error as HummockError; use crate::manager::WorkerId; @@ -56,6 +55,9 @@ enum MetaErrorInner { #[error("{0} id not found: {1}")] CatalogIdNotFound(&'static str, u32), + #[error("table_fragment not exist: id={0}")] + FragmentNotFound(u32), + #[error("{0} with name {1} exists")] Duplicated(&'static str, String), @@ -135,6 +137,14 @@ impl MetaError { MetaErrorInner::CatalogIdNotFound(relation, id.into()).into() } + pub fn fragment_not_found>(id: T) -> Self { + MetaErrorInner::FragmentNotFound(id.into()).into() + } + + pub fn is_fragment_not_found(&self) -> bool { + matches!(self.inner.as_ref(), &MetaErrorInner::FragmentNotFound(..)) + } + pub fn catalog_duplicated>(relation: &'static str, name: T) -> Self { MetaErrorInner::Duplicated(relation, name.into()).into() } @@ -170,12 +180,6 @@ impl From for MetaError { } } -impl From for MetaError { - fn from(value: Error) -> Self { - MetaErrorInner::Election(value.to_string()).into() - } -} - impl From for MetaError { fn from(e: RpcError) -> Self { MetaErrorInner::RpcError(e).into() diff --git a/src/meta/src/hummock/compaction/mod.rs b/src/meta/src/hummock/compaction/mod.rs index 23585da8999a9..a056414034243 100644 --- a/src/meta/src/hummock/compaction/mod.rs +++ b/src/meta/src/hummock/compaction/mod.rs @@ -15,41 +15,36 @@ #![expect(clippy::arc_with_non_send_sync, reason = "FIXME: later")] pub mod compaction_config; -mod level_selector; mod overlap_strategy; -mod tombstone_compaction_selector; use risingwave_common::catalog::TableOption; -use risingwave_hummock_sdk::compaction_group::StateTableId; use risingwave_hummock_sdk::prost_key_range::KeyRangeExt; -use risingwave_pb::hummock::compact_task::{self, TaskStatus}; +use risingwave_pb::hummock::compact_task::{self, TaskStatus, TaskType}; mod picker; +pub mod selector; + use std::collections::{HashMap, HashSet}; use 
std::fmt::{Debug, Formatter}; use std::sync::Arc; -use picker::{LevelCompactionPicker, ManualCompactionPicker, TierCompactionPicker}; +use picker::{LevelCompactionPicker, TierCompactionPicker}; use risingwave_hummock_sdk::{ - can_concat, CompactionGroupId, HummockCompactionTaskId, HummockEpoch, HummockSstableId, + can_concat, CompactionGroupId, HummockCompactionTaskId, HummockEpoch, }; use risingwave_pb::hummock::compaction_config::CompactionMode; use risingwave_pb::hummock::hummock_version::Levels; use risingwave_pb::hummock::{CompactTask, CompactionConfig, KeyRange, LevelType}; +pub use selector::CompactionSelector; -pub use crate::hummock::compaction::level_selector::{ - default_level_selector, DynamicLevelSelector, DynamicLevelSelectorCore, EmergencySelector, - LevelSelector, ManualCompactionSelector, SpaceReclaimCompactionSelector, TtlCompactionSelector, -}; +use self::selector::LocalSelectorStatistic; use crate::hummock::compaction::overlap_strategy::{OverlapStrategy, RangeOverlapStrategy}; -use crate::hummock::compaction::picker::{CompactionInput, LocalPickerStatistic}; -pub use crate::hummock::compaction::tombstone_compaction_selector::TombstoneCompactionSelector; +use crate::hummock::compaction::picker::CompactionInput; use crate::hummock::level_handler::LevelHandler; use crate::hummock::model::CompactionGroup; -use crate::rpc::metrics::MetaMetrics; pub struct CompactStatus { - pub(crate) compaction_group_id: CompactionGroupId, - pub(crate) level_handlers: Vec, + pub compaction_group_id: CompactionGroupId, + pub level_handlers: Vec, } impl Debug for CompactStatus { @@ -111,7 +106,7 @@ impl CompactStatus { task_id: HummockCompactionTaskId, group: &CompactionGroup, stats: &mut LocalSelectorStatistic, - selector: &mut Box, + selector: &mut Box, table_id_to_options: HashMap, ) -> Option { // When we compact the files, we must make the result of compaction meet the following @@ -161,6 +156,10 @@ impl CompactStatus { } pub fn is_trivial_move_task(task: &CompactTask) -> bool { + if task.task_type() != TaskType::Dynamic && task.task_type() != TaskType::Emergency { + return false; + } + if task.input_ssts.len() == 1 { return task.input_ssts[0].level_idx == 0 && can_concat(&task.input_ssts[0].table_infos); @@ -209,74 +208,6 @@ impl CompactStatus { } } -#[derive(Clone, Debug, PartialEq)] -pub struct ManualCompactionOption { - /// Filters out SSTs to pick. Has no effect if empty. - pub sst_ids: Vec, - /// Filters out SSTs to pick. - pub key_range: KeyRange, - /// Filters out SSTs to pick. Has no effect if empty. - pub internal_table_id: HashSet, - /// Input level. 
- pub level: usize, -} - -impl Default for ManualCompactionOption { - fn default() -> Self { - Self { - sst_ids: vec![], - key_range: KeyRange { - left: vec![], - right: vec![], - right_exclusive: false, - }, - internal_table_id: HashSet::default(), - level: 1, - } - } -} - -#[derive(Default)] -pub struct LocalSelectorStatistic { - skip_picker: Vec<(usize, usize, LocalPickerStatistic)>, -} - -impl LocalSelectorStatistic { - pub fn report_to_metrics(&self, group_id: u64, metrics: &MetaMetrics) { - for (start_level, target_level, stats) in &self.skip_picker { - let level_label = format!("cg{}-{}-to-{}", group_id, start_level, target_level); - if stats.skip_by_write_amp_limit > 0 { - metrics - .compact_skip_frequency - .with_label_values(&[level_label.as_str(), "write-amp"]) - .inc(); - } - if stats.skip_by_count_limit > 0 { - metrics - .compact_skip_frequency - .with_label_values(&[level_label.as_str(), "count"]) - .inc(); - } - if stats.skip_by_pending_files > 0 { - metrics - .compact_skip_frequency - .with_label_values(&[level_label.as_str(), "pending-files"]) - .inc(); - } - if stats.skip_by_overlapping > 0 { - metrics - .compact_skip_frequency - .with_label_values(&[level_label.as_str(), "overlapping"]) - .inc(); - } - metrics - .compact_skip_frequency - .with_label_values(&[level_label.as_str(), "picker"]) - .inc(); - } - } -} - pub fn create_compaction_task( compaction_config: &CompactionConfig, input: CompactionInput, diff --git a/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs index d9bd8d6020a23..6e1b33b1935d2 100644 --- a/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/base_level_compaction_picker.rs @@ -137,7 +137,7 @@ impl LevelCompactionPicker { min_compaction_bytes, // divide by 2 because we need to select files of base level and it need use the other // half quota. 
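// With `std::cmp::max` below, this bound never drops under `max_bytes_for_level_base`,
// whereas the previous `std::cmp::min` let `max_compaction_bytes / 2` shrink it below that.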
- std::cmp::min( + std::cmp::max( self.config.max_bytes_for_level_base, self.config.max_compaction_bytes / 2, ), @@ -240,12 +240,7 @@ pub mod tests { use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::compaction::level_selector::tests::{ - generate_l0_nonoverlapping_multi_sublevels, generate_l0_nonoverlapping_sublevels, - generate_l0_overlapping_sublevels, generate_level, generate_table, - push_table_level0_nonoverlapping, push_table_level0_overlapping, - push_tables_level0_nonoverlapping, - }; + use crate::hummock::compaction::selector::tests::*; use crate::hummock::compaction::{CompactionMode, TierCompactionPicker}; fn create_compaction_picker_for_test() -> LevelCompactionPicker { @@ -573,6 +568,7 @@ pub mod tests { let config = Arc::new( CompactionConfigBuilder::new() .max_compaction_bytes(100010) + .max_bytes_for_level_base(512) .level0_sub_level_compact_level_count(1) .build(), ); diff --git a/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs b/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs index eafe074a88288..7452f65d6503a 100644 --- a/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs +++ b/src/meta/src/hummock/compaction/picker/compaction_task_validator.rs @@ -89,32 +89,32 @@ struct TierCompactionTaskValidationRule { impl CompactionTaskValidationRule for TierCompactionTaskValidationRule { fn validate(&self, input: &CompactionInput, stats: &mut LocalPickerStatistic) -> bool { - // so the design here wants to merge multiple overlapping-levels in one compaction - let max_compaction_bytes = std::cmp::min( - self.config.max_compaction_bytes, - self.config.sub_level_max_compaction_bytes - * self.config.level0_overlapping_sub_level_compact_level_count as u64, - ); - // Limit sstable file count to avoid using too much memory. 
let overlapping_max_compact_file_numer = std::cmp::min( self.config.level0_max_compact_file_number, MAX_COMPACT_LEVEL_COUNT as u64, ); - let waiting_enough_files = { - if input.select_input_size > max_compaction_bytes { - false - } else { - input.total_file_count <= overlapping_max_compact_file_numer - } - }; + if input.total_file_count >= overlapping_max_compact_file_numer + || input.input_levels.len() >= MAX_COMPACT_LEVEL_COUNT + { + return true; + } + + // so the design here wants to merge multiple overlapping-levels in one compaction + let max_compaction_bytes = std::cmp::min( + self.config.max_compaction_bytes, + self.config.sub_level_max_compaction_bytes + * self.config.level0_overlapping_sub_level_compact_level_count as u64, + ); // If waiting_enough_files is not satisfied, we will raise the priority of the number of // levels to ensure that we can merge as many sub_levels as possible let tier_sub_level_compact_level_count = self.config.level0_overlapping_sub_level_compact_level_count as usize; - if input.input_levels.len() < tier_sub_level_compact_level_count && waiting_enough_files { + if input.input_levels.len() < tier_sub_level_compact_level_count + && input.select_input_size < max_compaction_bytes + { stats.skip_by_count_limit += 1; return false; } @@ -129,7 +129,9 @@ struct IntraCompactionTaskValidationRule { impl CompactionTaskValidationRule for IntraCompactionTaskValidationRule { fn validate(&self, input: &CompactionInput, stats: &mut LocalPickerStatistic) -> bool { - if input.total_file_count >= self.config.level0_max_compact_file_number { + if input.total_file_count >= self.config.level0_max_compact_file_number + || input.input_levels.len() >= MAX_COMPACT_LEVEL_COUNT + { return true; } @@ -175,6 +177,12 @@ struct BaseCompactionTaskValidationRule { impl CompactionTaskValidationRule for BaseCompactionTaskValidationRule { fn validate(&self, input: &CompactionInput, stats: &mut LocalPickerStatistic) -> bool { + if input.total_file_count >= self.config.level0_max_compact_file_number + || input.input_levels.len() >= MAX_COMPACT_LEVEL_COUNT + { + return true; + } + // The size of target level may be too large, we shall skip this compact task and wait // the data in base level compact to lower level. 
if input.target_input_size > self.config.max_compaction_bytes { diff --git a/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs index 541b93254172b..980c3030a98fb 100644 --- a/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/intra_compaction_picker.rs @@ -260,7 +260,7 @@ pub mod tests { use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::compaction::level_selector::tests::{ + use crate::hummock::compaction::selector::tests::{ generate_l0_nonoverlapping_multi_sublevels, generate_l0_nonoverlapping_sublevels, generate_l0_overlapping_sublevels, generate_level, generate_table, push_table_level0_overlapping, push_tables_level0_nonoverlapping, @@ -623,53 +623,4 @@ pub mod tests { assert!(is_l0_trivial_move(&ret)); assert_eq!(ret.input_levels[0].table_infos.len(), 1); } - - #[test] - fn test_issue_11154() { - let mut local_stats = LocalPickerStatistic::default(); - let mut l0 = generate_l0_overlapping_sublevels(vec![ - vec![ - generate_table(4, 1, 1, 200, 1), - generate_table(5, 1, 400, 600, 1), - ], - vec![ - generate_table(6, 1, 1, 200, 1), - generate_table(7, 1, 400, 600, 1), - ], - vec![ - generate_table(8, 1, 1, 200, 1), - generate_table(9, 1, 400, 600, 1), - ], - vec![generate_table(10, 1, 1, 600, 1)], - ]); - // We can set level_type only because the input above is valid. - for s in &mut l0.sub_levels { - s.level_type = LevelType::Nonoverlapping as i32; - } - let levels = Levels { - l0: Some(l0), - levels: vec![generate_level(1, vec![generate_table(3, 1, 0, 100000, 1)])], - member_table_ids: vec![1], - ..Default::default() - }; - let levels_handler = vec![LevelHandler::new(0), LevelHandler::new(1)]; - - // Pick with large max_compaction_bytes results all sub levels included in input. - let config = Arc::new( - CompactionConfigBuilder::new() - .max_compaction_bytes(800) - .sub_level_max_compaction_bytes(50000) - .max_bytes_for_level_base(500000) - .level0_sub_level_compact_level_count(1) - .build(), - ); - // Only include sub-level 0 results will violate MAX_WRITE_AMPLIFICATION. - // So all sub-levels are included to make write amplification < MAX_WRITE_AMPLIFICATION. 
- let mut picker = IntraCompactionPicker::new(config); - let ret = picker - .pick_compaction(&levels, &levels_handler, &mut local_stats) - .unwrap(); - // avoid add sst_10 and cause a big task - assert_eq!(3, ret.input_levels.len()); - } } diff --git a/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs index e8f8c908d0fd3..23b1f0b6a9960 100644 --- a/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/manual_compaction_picker.rs @@ -24,7 +24,7 @@ use super::{CompactionInput, CompactionPicker, LocalPickerStatistic}; use crate::hummock::compaction::overlap_strategy::{ OverlapInfo, OverlapStrategy, RangeOverlapInfo, }; -use crate::hummock::compaction::ManualCompactionOption; +use crate::hummock::compaction::selector::ManualCompactionOption; use crate::hummock::level_handler::LevelHandler; pub struct ManualCompactionPicker { @@ -333,12 +333,12 @@ pub mod tests { use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::compaction::level_selector::tests::{ + use crate::hummock::compaction::overlap_strategy::RangeOverlapStrategy; + use crate::hummock::compaction::selector::tests::{ assert_compaction_task, generate_l0_nonoverlapping_sublevels, generate_l0_overlapping_sublevels, generate_level, generate_table, }; - use crate::hummock::compaction::level_selector::{LevelSelector, ManualCompactionSelector}; - use crate::hummock::compaction::overlap_strategy::RangeOverlapStrategy; + use crate::hummock::compaction::selector::{CompactionSelector, ManualCompactionSelector}; use crate::hummock::compaction::LocalSelectorStatistic; use crate::hummock::model::CompactionGroup; use crate::hummock::test_utils::iterator_test_key_of_epoch; diff --git a/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs index 0cf44795e0acb..c17fa305be0e4 100644 --- a/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/min_overlap_compaction_picker.rs @@ -209,6 +209,13 @@ impl NonOverlapSubLevelPicker { break; } + // more than 1 sub_level + if ret.total_file_count > 1 && ret.total_file_size >= self.max_compaction_bytes + || ret.total_file_count >= self.max_file_count as usize + { + break; + } + let mut overlap_files_range = overlap_info.check_multiple_include(&target_level.table_infos); if overlap_files_range.is_empty() { @@ -288,15 +295,6 @@ impl NonOverlapSubLevelPicker { .map(|(_, files)| files.len()) .sum::(); - // more than 1 sub_level - if ret.total_file_count > 1 - && (ret.total_file_size + (add_files_size + current_level_size) - >= self.max_compaction_bytes - || ret.total_file_count + add_files_count >= self.max_file_count as usize) - { - break; - } - if ret .sstable_infos .iter() @@ -379,10 +377,10 @@ pub mod tests { pub use risingwave_pb::hummock::{KeyRange, Level, LevelType}; use super::*; - use crate::hummock::compaction::level_selector::tests::{ + use crate::hummock::compaction::overlap_strategy::RangeOverlapStrategy; + use crate::hummock::compaction::selector::tests::{ generate_l0_nonoverlapping_sublevels, generate_table, }; - use crate::hummock::compaction::overlap_strategy::RangeOverlapStrategy; #[test] fn test_compact_l1() { diff --git a/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs 
b/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs index a3ff21831fef8..7dc7a4688e644 100644 --- a/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/space_reclaim_compaction_picker.rs @@ -104,7 +104,6 @@ impl SpaceReclaimCompactionPicker { } while state.last_level <= levels.levels.len() { let mut is_trivial_task = true; - let mut select_file_size = 0; for sst in &levels.levels[state.last_level - 1].table_infos { let exist_count = self.exist_table_count(sst); let need_reclaim = exist_count < sst.table_ids.len(); @@ -122,15 +121,14 @@ impl SpaceReclaimCompactionPicker { } if !is_trivial_sst { - if !select_input_ssts.is_empty() && is_trivial_task { + if !select_input_ssts.is_empty() { break; } is_trivial_task = false; } select_input_ssts.push(sst.clone()); - select_file_size += sst.file_size; - if select_file_size > self.max_space_reclaim_bytes && !is_trivial_task { + if !is_trivial_task { break; } } @@ -174,12 +172,13 @@ mod test { use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::compaction::level_selector::tests::{ + use crate::hummock::compaction::selector::tests::{ assert_compaction_task, generate_l0_nonoverlapping_sublevels, generate_level, generate_table_with_ids_and_epochs, }; - use crate::hummock::compaction::level_selector::SpaceReclaimCompactionSelector; - use crate::hummock::compaction::{LevelSelector, LocalSelectorStatistic}; + use crate::hummock::compaction::selector::{ + CompactionSelector, LocalSelectorStatistic, SpaceReclaimCompactionSelector, + }; use crate::hummock::model::CompactionGroup; #[test] diff --git a/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs index 5b3058317a4b0..a64bf489a197a 100644 --- a/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/tier_compaction_picker.rs @@ -163,12 +163,12 @@ pub mod tests { use risingwave_pb::hummock::{LevelType, OverlappingLevel}; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::compaction::level_selector::tests::{ - generate_l0_overlapping_sublevels, generate_table, push_table_level0_overlapping, - }; use crate::hummock::compaction::picker::{ CompactionPicker, LocalPickerStatistic, TierCompactionPicker, }; + use crate::hummock::compaction::selector::tests::{ + generate_l0_overlapping_sublevels, generate_table, push_table_level0_overlapping, + }; use crate::hummock::level_handler::LevelHandler; #[test] diff --git a/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs index 994bfbc5ea557..04d8cb791c881 100644 --- a/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/tombstone_reclaim_compaction_picker.rs @@ -23,7 +23,6 @@ use crate::hummock::level_handler::LevelHandler; pub struct TombstoneReclaimCompactionPicker { overlap_strategy: Arc, - max_compaction_bytes: u64, delete_ratio: u64, range_delete_ratio: u64, } @@ -36,13 +35,11 @@ pub struct TombstoneReclaimPickerState { impl TombstoneReclaimCompactionPicker { pub fn new( overlap_strategy: Arc, - max_compaction_bytes: u64, delete_ratio: u64, range_delete_ratio: u64, ) -> Self { Self { overlap_strategy, - max_compaction_bytes, delete_ratio, range_delete_ratio, } @@ 
-55,34 +52,23 @@ impl TombstoneReclaimCompactionPicker { state: &mut TombstoneReclaimPickerState, ) -> Option { assert!(!levels.levels.is_empty()); - let mut select_input_ssts = vec![]; if state.last_level == 0 { state.last_level = 1; } while state.last_level <= levels.levels.len() { - let mut select_file_size = 0; + let mut select_input_ssts = vec![]; for sst in &levels.levels[state.last_level - 1].table_infos { let need_reclaim = (sst.range_tombstone_count * 100 >= sst.total_key_count * self.range_delete_ratio) || (sst.stale_key_count * 100 >= sst.total_key_count * self.delete_ratio); if !need_reclaim || level_handlers[state.last_level].is_pending_compact(&sst.sst_id) { - if !select_input_ssts.is_empty() { - // Our goal is to pick as many complete layers of data as possible and keep - // the picked files contiguous to avoid overlapping - // key_ranges, so the strategy is to pick as many - // contiguous files as possible (at least one) - break; - } continue; } select_input_ssts.push(sst.clone()); - select_file_size += sst.file_size; - if select_file_size > self.max_compaction_bytes { - break; - } + break; } // turn to next_round @@ -108,6 +94,7 @@ impl TombstoneReclaimCompactionPicker { } } if pending_compact { + state.last_level += 1; continue; } InputLevel { @@ -151,7 +138,7 @@ pub mod tests { use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; use crate::hummock::compaction::create_overlap_strategy; - use crate::hummock::compaction::level_selector::tests::{generate_level, generate_table}; + use crate::hummock::compaction::selector::tests::{generate_level, generate_table}; #[test] fn test_basic() { @@ -180,12 +167,7 @@ pub mod tests { let config = Arc::new(CompactionConfigBuilder::new().build()); let strategy = create_overlap_strategy(config.compaction_mode()); - let picker = TombstoneReclaimCompactionPicker::new( - strategy.clone(), - config.max_compaction_bytes, - 40, - 20, - ); + let picker = TombstoneReclaimCompactionPicker::new(strategy.clone(), 40, 20); let ret = picker.pick_compaction(&levels, &levels_handler, &mut state); assert!(ret.is_none()); let mut sst = generate_table(3, 1, 201, 300, 1); @@ -203,8 +185,7 @@ pub mod tests { sst.range_tombstone_count = 30; sst.total_key_count = 100; levels.levels[0].table_infos.push(sst); - let picker = - TombstoneReclaimCompactionPicker::new(strategy, config.max_compaction_bytes, 50, 10); + let picker = TombstoneReclaimCompactionPicker::new(strategy, 50, 10); let mut state = TombstoneReclaimPickerState::default(); let ret = picker .pick_compaction(&levels, &levels_handler, &mut state) diff --git a/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs b/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs index 9f84b99453f17..cc3b3ca41d84f 100644 --- a/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs +++ b/src/meta/src/hummock/compaction/picker/ttl_reclaim_compaction_picker.rs @@ -61,15 +61,11 @@ impl TtlPickerState { } pub struct TtlReclaimCompactionPicker { - max_ttl_reclaim_bytes: u64, table_id_to_ttl: HashMap, } impl TtlReclaimCompactionPicker { - pub fn new( - max_ttl_reclaim_bytes: u64, - table_id_to_options: HashMap, - ) -> Self { + pub fn new(table_id_to_options: HashMap) -> Self { let table_id_to_ttl: HashMap = table_id_to_options .iter() .filter(|id_to_option| { @@ -79,10 +75,7 @@ impl TtlReclaimCompactionPicker { .map(|id_to_option| (*id_to_option.0, id_to_option.1.retention_seconds.unwrap())) .collect(); - Self { - 
max_ttl_reclaim_bytes, - table_id_to_ttl, - } + Self { table_id_to_ttl } } fn filter(&self, sst: &SstableInfo, current_epoch_physical_time: u64) -> bool { @@ -154,7 +147,6 @@ impl TtlReclaimCompactionPicker { } let current_epoch_physical_time = Epoch::now().physical_time(); - let mut select_file_size = 0; for sst in &reclaimed_level.table_infos { let unmatched_sst = sst @@ -167,22 +159,11 @@ impl TtlReclaimCompactionPicker { || level_handler.is_pending_compact(&sst.sst_id) || self.filter(sst, current_epoch_physical_time) { - if !select_input_ssts.is_empty() { - // Our goal is to pick as many complete layers of data as possible and keep the - // picked files contiguous to avoid overlapping key_ranges, so the strategy is - // to pick as many contiguous files as possible (at least one) - break; - } - continue; } select_input_ssts.push(sst.clone()); - select_file_size += sst.file_size; - - if select_file_size > self.max_ttl_reclaim_bytes { - break; - } + break; } // turn to next_round @@ -227,11 +208,11 @@ mod test { use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::compaction::level_selector::tests::{ + use crate::hummock::compaction::selector::tests::{ assert_compaction_task, generate_l0_nonoverlapping_sublevels, generate_level, generate_table_with_ids_and_epochs, }; - use crate::hummock::compaction::level_selector::{LevelSelector, TtlCompactionSelector}; + use crate::hummock::compaction::selector::{CompactionSelector, TtlCompactionSelector}; use crate::hummock::compaction::LocalSelectorStatistic; use crate::hummock::model::CompactionGroup; @@ -402,7 +383,7 @@ mod test { assert_compaction_task(&task, &levels_handler); assert_eq!(task.input.input_levels.len(), 2); assert_eq!(task.input.input_levels[0].level_idx, 4); - assert_eq!(task.input.input_levels[0].table_infos.len(), 5); + assert_eq!(task.input.input_levels[0].table_infos.len(), 1); let mut start_id = 2; for sst in &task.input.input_levels[0].table_infos { @@ -453,9 +434,9 @@ mod test { assert_eq!(task.input.input_levels[0].level_idx, 4); // test select index, picker will select file from state - assert_eq!(task.input.input_levels[0].table_infos.len(), 4); + assert_eq!(task.input.input_levels[0].table_infos.len(), 1); - let mut start_id = 7; + let mut start_id = 3; for sst in &task.input.input_levels[0].table_infos { assert_eq!(start_id, sst.get_sst_id()); start_id += 1; @@ -495,17 +476,6 @@ mod test { assert_eq!(start_id, sst.get_sst_id()); start_id += 1; } - - assert!(selector - .pick_compaction( - 1, - &group_config, - &levels, - &mut levels_handler, - &mut local_stats, - table_id_to_options, - ) - .is_none()) } { @@ -633,8 +603,8 @@ mod test { }, ); - let expect_task_file_count = [3, 2, 1]; - let expect_task_sst_id_range = vec![vec![2, 3, 4], vec![6, 7], vec![10]]; + let expect_task_file_count = [1, 1, 1]; + let expect_task_sst_id_range = vec![vec![2], vec![3], vec![4]]; for (index, x) in expect_task_file_count.iter().enumerate() { // // pick ttl reclaim let task = selector @@ -715,8 +685,8 @@ mod test { }, ); - let expect_task_file_count = [3, 3]; - let expect_task_sst_id_range = vec![vec![2, 3, 4], vec![5, 6, 7]]; + let expect_task_file_count = [1, 1]; + let expect_task_sst_id_range = vec![vec![2], vec![3]]; for (index, x) in expect_task_file_count.iter().enumerate() { if index == expect_task_file_count.len() - 1 { table_id_to_options.insert( diff --git a/src/meta/src/hummock/compaction/selector/emergency_selector.rs 
b/src/meta/src/hummock/compaction/selector/emergency_selector.rs new file mode 100644 index 0000000000000..3f5a81e264956 --- /dev/null +++ b/src/meta/src/hummock/compaction/selector/emergency_selector.rs @@ -0,0 +1,70 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use risingwave_common::catalog::TableOption; +use risingwave_hummock_sdk::HummockCompactionTaskId; +use risingwave_pb::hummock::compact_task; +use risingwave_pb::hummock::hummock_version::Levels; + +use super::{CompactionSelector, DynamicLevelSelectorCore, LocalSelectorStatistic}; +use crate::hummock::compaction::picker::{EmergencyCompactionPicker, LocalPickerStatistic}; +use crate::hummock::compaction::{create_compaction_task, CompactionTask}; +use crate::hummock::level_handler::LevelHandler; +use crate::hummock::model::CompactionGroup; + +#[derive(Default)] +pub struct EmergencySelector {} + +impl CompactionSelector for EmergencySelector { + fn pick_compaction( + &mut self, + task_id: HummockCompactionTaskId, + group: &CompactionGroup, + levels: &Levels, + level_handlers: &mut [LevelHandler], + selector_stats: &mut LocalSelectorStatistic, + _table_id_to_options: HashMap, + ) -> Option { + let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); + let ctx = dynamic_level_core.calculate_level_base_size(levels); + let picker = + EmergencyCompactionPicker::new(ctx.base_level, group.compaction_config.clone()); + + let mut stats = LocalPickerStatistic::default(); + if let Some(compaction_input) = picker.pick_compaction(levels, level_handlers, &mut stats) { + compaction_input.add_pending_task(task_id, level_handlers); + + return Some(create_compaction_task( + group.compaction_config.as_ref(), + compaction_input, + ctx.base_level, + self.task_type(), + )); + } + + selector_stats.skip_picker.push((0, ctx.base_level, stats)); + + None + } + + fn name(&self) -> &'static str { + "EmergencyCompaction" + } + + fn task_type(&self) -> compact_task::TaskType { + compact_task::TaskType::Emergency + } +} diff --git a/src/meta/src/hummock/compaction/level_selector.rs b/src/meta/src/hummock/compaction/selector/level_selector.rs similarity index 63% rename from src/meta/src/hummock/compaction/level_selector.rs rename to src/meta/src/hummock/compaction/selector/level_selector.rs index 05975f64da1b1..7c57b2ecf00d4 100644 --- a/src/meta/src/hummock/compaction/level_selector.rs +++ b/src/meta/src/hummock/compaction/selector/level_selector.rs @@ -25,23 +25,17 @@ use risingwave_hummock_sdk::HummockCompactionTaskId; use risingwave_pb::hummock::hummock_version::Levels; use risingwave_pb::hummock::{compact_task, CompactionConfig, LevelType}; -use super::picker::{ - CompactionTaskValidator, EmergencyCompactionPicker, IntraCompactionPicker, - SpaceReclaimCompactionPicker, SpaceReclaimPickerState, TtlPickerState, - TtlReclaimCompactionPicker, -}; use super::{ - create_compaction_task, LevelCompactionPicker, ManualCompactionOption, 
ManualCompactionPicker, - TierCompactionPicker, + create_compaction_task, CompactionSelector, LevelCompactionPicker, TierCompactionPicker, }; use crate::hummock::compaction::overlap_strategy::OverlapStrategy; use crate::hummock::compaction::picker::{ - CompactionPicker, LocalPickerStatistic, MinOverlappingPicker, + CompactionPicker, CompactionTaskValidator, IntraCompactionPicker, LocalPickerStatistic, + MinOverlappingPicker, }; use crate::hummock::compaction::{create_overlap_strategy, CompactionTask, LocalSelectorStatistic}; use crate::hummock::level_handler::LevelHandler; use crate::hummock::model::CompactionGroup; -use crate::rpc::metrics::MetaMetrics; pub const SCORE_BASE: u64 = 100; @@ -54,30 +48,23 @@ pub enum PickerType { BottomLevel, } -#[derive(Default, Debug)] -pub struct PickerInfo { - score: u64, - select_level: usize, - target_level: usize, - picker_type: PickerType, +impl ToString for PickerType { + fn to_string(&self) -> String { + match self { + PickerType::Tier => String::from("Tier"), + PickerType::Intra => String::from("Intra"), + PickerType::ToBase => String::from("ToBase"), + PickerType::BottomLevel => String::from("BottomLevel"), + } + } } -pub trait LevelSelector: Sync + Send { - fn pick_compaction( - &mut self, - task_id: HummockCompactionTaskId, - group: &CompactionGroup, - levels: &Levels, - level_handlers: &mut [LevelHandler], - selector_stats: &mut LocalSelectorStatistic, - table_id_to_options: HashMap, - ) -> Option; - - fn report_statistic_metrics(&self, _metrics: &MetaMetrics) {} - - fn name(&self) -> &'static str; - - fn task_type(&self) -> compact_task::TaskType; +#[derive(Default, Debug)] +pub struct PickerInfo { + pub score: u64, + pub select_level: usize, + pub target_level: usize, + pub picker_type: PickerType, } #[derive(Default, Debug)] @@ -202,7 +189,11 @@ impl DynamicLevelSelectorCore { ctx } - fn get_priority_levels(&self, levels: &Levels, handlers: &[LevelHandler]) -> SelectContext { + pub(crate) fn get_priority_levels( + &self, + levels: &Levels, + handlers: &[LevelHandler], + ) -> SelectContext { let mut ctx = self.calculate_level_base_size(levels); let idle_file_count = levels @@ -279,7 +270,7 @@ impl DynamicLevelSelectorCore { // Reduce the level num of l0 non-overlapping sub_level ctx.score_levels.push({ PickerInfo { - score: non_overlapping_score, + score: non_overlapping_score + 1, select_level: 0, target_level: ctx.base_level, picker_type: PickerType::ToBase, @@ -410,7 +401,7 @@ impl DynamicLevelSelectorCore { } } -impl LevelSelector for DynamicLevelSelector { +impl CompactionSelector for DynamicLevelSelector { fn pick_compaction( &mut self, task_id: HummockCompactionTaskId, @@ -467,441 +458,26 @@ impl LevelSelector for DynamicLevelSelector { } } -pub struct ManualCompactionSelector { - option: ManualCompactionOption, -} - -impl ManualCompactionSelector { - pub fn new(option: ManualCompactionOption) -> Self { - Self { option } - } -} - -impl LevelSelector for ManualCompactionSelector { - fn pick_compaction( - &mut self, - task_id: HummockCompactionTaskId, - group: &CompactionGroup, - levels: &Levels, - level_handlers: &mut [LevelHandler], - _selector_stats: &mut LocalSelectorStatistic, - _table_id_to_options: HashMap, - ) -> Option { - let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); - let overlap_strategy = create_overlap_strategy(group.compaction_config.compaction_mode()); - let ctx = dynamic_level_core.calculate_level_base_size(levels); - let (mut picker, base_level) = { - let target_level = if 
self.option.level == 0 { - ctx.base_level - } else if self.option.level == group.compaction_config.max_level as usize { - self.option.level - } else { - self.option.level + 1 - }; - if self.option.level > 0 && self.option.level < ctx.base_level { - return None; - } - ( - ManualCompactionPicker::new(overlap_strategy, self.option.clone(), target_level), - ctx.base_level, - ) - }; - - let compaction_input = - picker.pick_compaction(levels, level_handlers, &mut LocalPickerStatistic::default())?; - compaction_input.add_pending_task(task_id, level_handlers); - - Some(create_compaction_task( - group.compaction_config.as_ref(), - compaction_input, - base_level, - self.task_type(), - )) - } - - fn name(&self) -> &'static str { - "ManualCompactionSelector" - } - - fn task_type(&self) -> compact_task::TaskType { - compact_task::TaskType::Manual - } -} - -#[derive(Default)] -pub struct SpaceReclaimCompactionSelector { - state: HashMap, -} - -impl LevelSelector for SpaceReclaimCompactionSelector { - fn pick_compaction( - &mut self, - task_id: HummockCompactionTaskId, - group: &CompactionGroup, - levels: &Levels, - level_handlers: &mut [LevelHandler], - _selector_stats: &mut LocalSelectorStatistic, - _table_id_to_options: HashMap, - ) -> Option { - let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); - let mut picker = SpaceReclaimCompactionPicker::new( - group.compaction_config.max_space_reclaim_bytes, - levels.member_table_ids.iter().cloned().collect(), - ); - let ctx = dynamic_level_core.calculate_level_base_size(levels); - let state = self.state.entry(group.group_id).or_default(); - - let compaction_input = picker.pick_compaction(levels, level_handlers, state)?; - compaction_input.add_pending_task(task_id, level_handlers); - - Some(create_compaction_task( - dynamic_level_core.get_config(), - compaction_input, - ctx.base_level, - self.task_type(), - )) - } - - fn name(&self) -> &'static str { - "SpaceReclaimCompaction" - } - - fn task_type(&self) -> compact_task::TaskType { - compact_task::TaskType::SpaceReclaim - } -} - -#[derive(Default)] -pub struct TtlCompactionSelector { - state: HashMap, -} - -impl LevelSelector for TtlCompactionSelector { - fn pick_compaction( - &mut self, - task_id: HummockCompactionTaskId, - group: &CompactionGroup, - levels: &Levels, - level_handlers: &mut [LevelHandler], - _selector_stats: &mut LocalSelectorStatistic, - table_id_to_options: HashMap, - ) -> Option { - let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); - let ctx = dynamic_level_core.calculate_level_base_size(levels); - let picker = TtlReclaimCompactionPicker::new( - group.compaction_config.max_space_reclaim_bytes, - table_id_to_options, - ); - let state = self.state.entry(group.group_id).or_default(); - let compaction_input = picker.pick_compaction(levels, level_handlers, state)?; - compaction_input.add_pending_task(task_id, level_handlers); - - Some(create_compaction_task( - group.compaction_config.as_ref(), - compaction_input, - ctx.base_level, - self.task_type(), - )) - } - - fn name(&self) -> &'static str { - "TtlCompaction" - } - - fn task_type(&self) -> compact_task::TaskType { - compact_task::TaskType::Ttl - } -} - -pub fn default_level_selector() -> Box { - Box::::default() -} - -#[derive(Default)] -pub struct EmergencySelector {} - -impl LevelSelector for EmergencySelector { - fn pick_compaction( - &mut self, - task_id: HummockCompactionTaskId, - group: &CompactionGroup, - levels: &Levels, - level_handlers: &mut 
[LevelHandler], - selector_stats: &mut LocalSelectorStatistic, - _table_id_to_options: HashMap, - ) -> Option { - let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); - let ctx = dynamic_level_core.calculate_level_base_size(levels); - let picker = - EmergencyCompactionPicker::new(ctx.base_level, group.compaction_config.clone()); - - let mut stats = LocalPickerStatistic::default(); - if let Some(compaction_input) = picker.pick_compaction(levels, level_handlers, &mut stats) { - compaction_input.add_pending_task(task_id, level_handlers); - - return Some(create_compaction_task( - group.compaction_config.as_ref(), - compaction_input, - ctx.base_level, - self.task_type(), - )); - } - - selector_stats.skip_picker.push((0, ctx.base_level, stats)); - - None - } - - fn name(&self) -> &'static str { - "EmergencyCompaction" - } - - fn task_type(&self) -> compact_task::TaskType { - compact_task::TaskType::Emergency - } -} - #[cfg(test)] pub mod tests { - use std::ops::Range; + use std::collections::HashMap; + use std::sync::Arc; use itertools::Itertools; use risingwave_common::constants::hummock::CompactionFilterFlag; use risingwave_pb::hummock::compaction_config::CompactionMode; - use risingwave_pb::hummock::{KeyRange, Level, LevelType, OverlappingLevel, SstableInfo}; + use risingwave_pb::hummock::hummock_version::Levels; - use super::*; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::test_utils::iterator_test_key_of_epoch; - - pub fn push_table_level0_overlapping(levels: &mut Levels, sst: SstableInfo) { - levels.l0.as_mut().unwrap().total_file_size += sst.file_size; - levels.l0.as_mut().unwrap().sub_levels.push(Level { - level_idx: 0, - level_type: LevelType::Overlapping as i32, - total_file_size: sst.file_size, - uncompressed_file_size: sst.uncompressed_file_size, - sub_level_id: sst.get_sst_id(), - table_infos: vec![sst], - }); - } - - pub fn push_table_level0_nonoverlapping(levels: &mut Levels, sst: SstableInfo) { - push_table_level0_overlapping(levels, sst); - levels - .l0 - .as_mut() - .unwrap() - .sub_levels - .last_mut() - .unwrap() - .level_type = LevelType::Nonoverlapping as i32; - } - - pub fn push_tables_level0_nonoverlapping(levels: &mut Levels, table_infos: Vec) { - let total_file_size = table_infos.iter().map(|table| table.file_size).sum::(); - let uncompressed_file_size = table_infos - .iter() - .map(|table| table.uncompressed_file_size) - .sum(); - let sub_level_id = table_infos[0].get_sst_id(); - levels.l0.as_mut().unwrap().total_file_size += total_file_size; - levels.l0.as_mut().unwrap().sub_levels.push(Level { - level_idx: 0, - level_type: LevelType::Nonoverlapping as i32, - total_file_size, - sub_level_id, - table_infos, - uncompressed_file_size, - }); - } - - pub fn generate_table( - id: u64, - table_prefix: u64, - left: usize, - right: usize, - epoch: u64, - ) -> SstableInfo { - SstableInfo { - object_id: id, - sst_id: id, - key_range: Some(KeyRange { - left: iterator_test_key_of_epoch(table_prefix, left, epoch), - right: iterator_test_key_of_epoch(table_prefix, right, epoch), - right_exclusive: false, - }), - file_size: (right - left + 1) as u64, - table_ids: vec![table_prefix as u32], - uncompressed_file_size: (right - left + 1) as u64, - total_key_count: (right - left + 1) as u64, - ..Default::default() - } - } - - #[allow(clippy::too_many_arguments)] - pub fn generate_table_with_ids_and_epochs( - id: u64, - table_prefix: u64, - left: usize, - right: usize, - epoch: u64, - table_ids: Vec, - 
min_epoch: u64, - max_epoch: u64, - ) -> SstableInfo { - SstableInfo { - object_id: id, - sst_id: id, - key_range: Some(KeyRange { - left: iterator_test_key_of_epoch(table_prefix, left, epoch), - right: iterator_test_key_of_epoch(table_prefix, right, epoch), - right_exclusive: false, - }), - file_size: (right - left + 1) as u64, - table_ids, - uncompressed_file_size: (right - left + 1) as u64, - min_epoch, - max_epoch, - ..Default::default() - } - } - - pub fn generate_tables( - ids: Range, - keys: Range, - epoch: u64, - file_size: u64, - ) -> Vec { - let step = (keys.end - keys.start) / (ids.end - ids.start) as usize; - let mut start = keys.start; - let mut tables = vec![]; - for id in ids { - let mut table = generate_table(id, 1, start, start + step - 1, epoch); - table.file_size = file_size; - tables.push(table); - start += step; - } - tables - } - - pub fn generate_level(level_idx: u32, table_infos: Vec) -> Level { - let total_file_size = table_infos.iter().map(|sst| sst.file_size).sum(); - let uncompressed_file_size = table_infos - .iter() - .map(|sst| sst.uncompressed_file_size) - .sum(); - Level { - level_idx, - level_type: LevelType::Nonoverlapping as i32, - table_infos, - total_file_size, - sub_level_id: 0, - uncompressed_file_size, - } - } - - /// Returns a `OverlappingLevel`, with each `table_infos`'s element placed in a nonoverlapping - /// sub-level. - pub fn generate_l0_nonoverlapping_sublevels(table_infos: Vec) -> OverlappingLevel { - let total_file_size = table_infos.iter().map(|table| table.file_size).sum::(); - let uncompressed_file_size = table_infos - .iter() - .map(|table| table.uncompressed_file_size) - .sum::(); - OverlappingLevel { - sub_levels: table_infos - .into_iter() - .enumerate() - .map(|(idx, table)| Level { - level_idx: 0, - level_type: LevelType::Nonoverlapping as i32, - total_file_size: table.file_size, - uncompressed_file_size: table.uncompressed_file_size, - sub_level_id: idx as u64, - table_infos: vec![table], - }) - .collect_vec(), - total_file_size, - uncompressed_file_size, - } - } - - pub fn generate_l0_nonoverlapping_multi_sublevels( - table_infos: Vec>, - ) -> OverlappingLevel { - let mut l0 = OverlappingLevel { - sub_levels: table_infos - .into_iter() - .enumerate() - .map(|(idx, table)| Level { - level_idx: 0, - level_type: LevelType::Nonoverlapping as i32, - total_file_size: table.iter().map(|table| table.file_size).sum::(), - uncompressed_file_size: table - .iter() - .map(|sst| sst.uncompressed_file_size) - .sum::(), - sub_level_id: idx as u64, - table_infos: table, - }) - .collect_vec(), - total_file_size: 0, - uncompressed_file_size: 0, - }; - - l0.total_file_size = l0.sub_levels.iter().map(|l| l.total_file_size).sum::(); - l0.uncompressed_file_size = l0 - .sub_levels - .iter() - .map(|l| l.uncompressed_file_size) - .sum::(); - l0 - } - - /// Returns a `OverlappingLevel`, with each `table_infos`'s element placed in a overlapping - /// sub-level. 
- pub fn generate_l0_overlapping_sublevels( - table_infos: Vec>, - ) -> OverlappingLevel { - let mut l0 = OverlappingLevel { - sub_levels: table_infos - .into_iter() - .enumerate() - .map(|(idx, table)| Level { - level_idx: 0, - level_type: LevelType::Overlapping as i32, - total_file_size: table.iter().map(|table| table.file_size).sum::(), - sub_level_id: idx as u64, - table_infos: table.clone(), - uncompressed_file_size: table - .iter() - .map(|sst| sst.uncompressed_file_size) - .sum::(), - }) - .collect_vec(), - total_file_size: 0, - uncompressed_file_size: 0, - }; - l0.total_file_size = l0.sub_levels.iter().map(|l| l.total_file_size).sum::(); - l0.uncompressed_file_size = l0 - .sub_levels - .iter() - .map(|l| l.uncompressed_file_size) - .sum::(); - l0 - } - - pub(crate) fn assert_compaction_task( - compact_task: &CompactionTask, - level_handlers: &[LevelHandler], - ) { - for i in &compact_task.input.input_levels { - for t in &i.table_infos { - assert!(level_handlers[i.level_idx as usize].is_pending_compact(&t.sst_id)); - } - } - } + use crate::hummock::compaction::selector::tests::{ + assert_compaction_task, generate_l0_nonoverlapping_sublevels, generate_level, + generate_tables, push_tables_level0_nonoverlapping, + }; + use crate::hummock::compaction::selector::{ + CompactionSelector, DynamicLevelSelector, DynamicLevelSelectorCore, LocalSelectorStatistic, + }; + use crate::hummock::level_handler::LevelHandler; + use crate::hummock::model::CompactionGroup; #[test] fn test_dynamic_level() { diff --git a/src/meta/src/hummock/compaction/selector/manual_selector.rs b/src/meta/src/hummock/compaction/selector/manual_selector.rs new file mode 100644 index 0000000000000..a00565a9807cd --- /dev/null +++ b/src/meta/src/hummock/compaction/selector/manual_selector.rs @@ -0,0 +1,122 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +use std::collections::{HashMap, HashSet}; + +use risingwave_common::catalog::TableOption; +use risingwave_hummock_sdk::compaction_group::StateTableId; +use risingwave_hummock_sdk::{HummockCompactionTaskId, HummockSstableId}; +use risingwave_pb::hummock::hummock_version::Levels; +use risingwave_pb::hummock::{compact_task, KeyRange}; + +use super::{CompactionSelector, DynamicLevelSelectorCore, LocalSelectorStatistic}; +use crate::hummock::compaction::picker::{ + CompactionPicker, LocalPickerStatistic, ManualCompactionPicker, +}; +use crate::hummock::compaction::{create_compaction_task, create_overlap_strategy, CompactionTask}; +use crate::hummock::level_handler::LevelHandler; +use crate::hummock::model::CompactionGroup; + +#[derive(Clone, Debug, PartialEq)] +pub struct ManualCompactionOption { + /// Filters out SSTs to pick. Has no effect if empty. 
+ pub sst_ids: Vec, + /// Filters out SSTs to pick. + pub key_range: KeyRange, + /// Filters out SSTs to pick. Has no effect if empty. + pub internal_table_id: HashSet, + /// Input level. + pub level: usize, +} + +impl Default for ManualCompactionOption { + fn default() -> Self { + Self { + sst_ids: vec![], + key_range: KeyRange { + left: vec![], + right: vec![], + right_exclusive: false, + }, + internal_table_id: HashSet::default(), + level: 1, + } + } +} + +pub struct ManualCompactionSelector { + option: ManualCompactionOption, +} + +impl ManualCompactionSelector { + pub fn new(option: ManualCompactionOption) -> Self { + Self { option } + } +} + +impl CompactionSelector for ManualCompactionSelector { + fn pick_compaction( + &mut self, + task_id: HummockCompactionTaskId, + group: &CompactionGroup, + levels: &Levels, + level_handlers: &mut [LevelHandler], + _selector_stats: &mut LocalSelectorStatistic, + _table_id_to_options: HashMap, + ) -> Option { + let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); + let overlap_strategy = create_overlap_strategy(group.compaction_config.compaction_mode()); + let ctx = dynamic_level_core.calculate_level_base_size(levels); + let (mut picker, base_level) = { + let target_level = if self.option.level == 0 { + ctx.base_level + } else if self.option.level == group.compaction_config.max_level as usize { + self.option.level + } else { + self.option.level + 1 + }; + if self.option.level > 0 && self.option.level < ctx.base_level { + return None; + } + ( + ManualCompactionPicker::new(overlap_strategy, self.option.clone(), target_level), + ctx.base_level, + ) + }; + + let compaction_input = + picker.pick_compaction(levels, level_handlers, &mut LocalPickerStatistic::default())?; + compaction_input.add_pending_task(task_id, level_handlers); + + Some(create_compaction_task( + group.compaction_config.as_ref(), + compaction_input, + base_level, + self.task_type(), + )) + } + + fn name(&self) -> &'static str { + "ManualCompactionSelector" + } + + fn task_type(&self) -> compact_task::TaskType { + compact_task::TaskType::Manual + } +} diff --git a/src/meta/src/hummock/compaction/selector/mod.rs b/src/meta/src/hummock/compaction/selector/mod.rs new file mode 100644 index 0000000000000..ef640b5e611ab --- /dev/null +++ b/src/meta/src/hummock/compaction/selector/mod.rs @@ -0,0 +1,345 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
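To make the ManualCompactionOption just defined concrete: all three filters are optional, and the Default impl targets level 1. The hypothetical helper below builds an option that compacts everything belonging to an illustrative state table 42 out of L0; it assumes only the option type and the KeyRange proto shown above, with field type parameters reconstructed where the diff strips them.

use std::collections::HashSet;

use risingwave_pb::hummock::KeyRange;

use crate::hummock::compaction::selector::ManualCompactionOption;

/// Builds an option that compacts everything belonging to (hypothetical) state table 42
/// out of L0; empty `sst_ids` and `key_range` mean "no filter".
fn l0_option_for_table_42() -> ManualCompactionOption {
    ManualCompactionOption {
        sst_ids: vec![],
        key_range: KeyRange {
            left: vec![],
            right: vec![],
            right_exclusive: false,
        },
        internal_table_id: HashSet::from([42u32]),
        level: 0, // input level 0 is routed to the computed base level by ManualCompactionSelector
    }
}

The resulting option would then be handed to the manual-compaction entry point on HummockManager shown later in this diff, which wraps it in a ManualCompactionSelector.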
+ +mod emergency_selector; +pub(crate) mod level_selector; +mod manual_selector; +mod space_reclaim_selector; +mod tombstone_compaction_selector; +mod ttl_selector; + +use std::collections::HashMap; + +pub use emergency_selector::EmergencySelector; +pub use level_selector::{DynamicLevelSelector, DynamicLevelSelectorCore}; +pub use manual_selector::{ManualCompactionOption, ManualCompactionSelector}; +use risingwave_common::catalog::TableOption; +use risingwave_hummock_sdk::HummockCompactionTaskId; +use risingwave_pb::hummock::compact_task; +use risingwave_pb::hummock::hummock_version::Levels; +pub use space_reclaim_selector::SpaceReclaimCompactionSelector; +pub use tombstone_compaction_selector::TombstoneCompactionSelector; +pub use ttl_selector::TtlCompactionSelector; + +use super::picker::LocalPickerStatistic; +use super::{create_compaction_task, LevelCompactionPicker, TierCompactionPicker}; +use crate::hummock::compaction::CompactionTask; +use crate::hummock::level_handler::LevelHandler; +use crate::hummock::model::CompactionGroup; +use crate::rpc::metrics::MetaMetrics; + +pub trait CompactionSelector: Sync + Send { + fn pick_compaction( + &mut self, + task_id: HummockCompactionTaskId, + group: &CompactionGroup, + levels: &Levels, + level_handlers: &mut [LevelHandler], + selector_stats: &mut LocalSelectorStatistic, + table_id_to_options: HashMap, + ) -> Option; + + fn report_statistic_metrics(&self, _metrics: &MetaMetrics) {} + + fn name(&self) -> &'static str; + + fn task_type(&self) -> compact_task::TaskType; +} + +pub fn default_compaction_selector() -> Box { + Box::::default() +} + +#[derive(Default)] +pub struct LocalSelectorStatistic { + skip_picker: Vec<(usize, usize, LocalPickerStatistic)>, +} + +impl LocalSelectorStatistic { + pub fn report_to_metrics(&self, group_id: u64, metrics: &MetaMetrics) { + for (start_level, target_level, stats) in &self.skip_picker { + let level_label = format!("cg{}-{}-to-{}", group_id, start_level, target_level); + if stats.skip_by_write_amp_limit > 0 { + metrics + .compact_skip_frequency + .with_label_values(&[level_label.as_str(), "write-amp"]) + .inc(); + } + if stats.skip_by_count_limit > 0 { + metrics + .compact_skip_frequency + .with_label_values(&[level_label.as_str(), "count"]) + .inc(); + } + if stats.skip_by_pending_files > 0 { + metrics + .compact_skip_frequency + .with_label_values(&[level_label.as_str(), "pending-files"]) + .inc(); + } + if stats.skip_by_overlapping > 0 { + metrics + .compact_skip_frequency + .with_label_values(&[level_label.as_str(), "overlapping"]) + .inc(); + } + metrics + .compact_skip_frequency + .with_label_values(&[level_label.as_str(), "picker"]) + .inc(); + } + } +} + +#[cfg(test)] +pub mod tests { + use std::ops::Range; + + use itertools::Itertools; + use risingwave_pb::hummock::{KeyRange, Level, LevelType, OverlappingLevel, SstableInfo}; + + use super::*; + use crate::hummock::test_utils::iterator_test_key_of_epoch; + + pub fn push_table_level0_overlapping(levels: &mut Levels, sst: SstableInfo) { + levels.l0.as_mut().unwrap().total_file_size += sst.file_size; + levels.l0.as_mut().unwrap().sub_levels.push(Level { + level_idx: 0, + level_type: LevelType::Overlapping as i32, + total_file_size: sst.file_size, + uncompressed_file_size: sst.uncompressed_file_size, + sub_level_id: sst.get_sst_id(), + table_infos: vec![sst], + }); + } + + pub fn push_table_level0_nonoverlapping(levels: &mut Levels, sst: SstableInfo) { + push_table_level0_overlapping(levels, sst); + levels + .l0 + .as_mut() + .unwrap() + .sub_levels + 
.last_mut() + .unwrap() + .level_type = LevelType::Nonoverlapping as i32; + } + + pub fn push_tables_level0_nonoverlapping(levels: &mut Levels, table_infos: Vec) { + let total_file_size = table_infos.iter().map(|table| table.file_size).sum::(); + let uncompressed_file_size = table_infos + .iter() + .map(|table| table.uncompressed_file_size) + .sum(); + let sub_level_id = table_infos[0].get_sst_id(); + levels.l0.as_mut().unwrap().total_file_size += total_file_size; + levels.l0.as_mut().unwrap().sub_levels.push(Level { + level_idx: 0, + level_type: LevelType::Nonoverlapping as i32, + total_file_size, + sub_level_id, + table_infos, + uncompressed_file_size, + }); + } + + pub fn generate_table( + id: u64, + table_prefix: u64, + left: usize, + right: usize, + epoch: u64, + ) -> SstableInfo { + SstableInfo { + object_id: id, + sst_id: id, + key_range: Some(KeyRange { + left: iterator_test_key_of_epoch(table_prefix, left, epoch), + right: iterator_test_key_of_epoch(table_prefix, right, epoch), + right_exclusive: false, + }), + file_size: (right - left + 1) as u64, + table_ids: vec![table_prefix as u32], + uncompressed_file_size: (right - left + 1) as u64, + total_key_count: (right - left + 1) as u64, + ..Default::default() + } + } + + #[allow(clippy::too_many_arguments)] + pub fn generate_table_with_ids_and_epochs( + id: u64, + table_prefix: u64, + left: usize, + right: usize, + epoch: u64, + table_ids: Vec, + min_epoch: u64, + max_epoch: u64, + ) -> SstableInfo { + SstableInfo { + object_id: id, + sst_id: id, + key_range: Some(KeyRange { + left: iterator_test_key_of_epoch(table_prefix, left, epoch), + right: iterator_test_key_of_epoch(table_prefix, right, epoch), + right_exclusive: false, + }), + file_size: (right - left + 1) as u64, + table_ids, + uncompressed_file_size: (right - left + 1) as u64, + min_epoch, + max_epoch, + ..Default::default() + } + } + + pub fn generate_tables( + ids: Range, + keys: Range, + epoch: u64, + file_size: u64, + ) -> Vec { + let step = (keys.end - keys.start) / (ids.end - ids.start) as usize; + let mut start = keys.start; + let mut tables = vec![]; + for id in ids { + let mut table = generate_table(id, 1, start, start + step - 1, epoch); + table.file_size = file_size; + tables.push(table); + start += step; + } + tables + } + + pub fn generate_level(level_idx: u32, table_infos: Vec) -> Level { + let total_file_size = table_infos.iter().map(|sst| sst.file_size).sum(); + let uncompressed_file_size = table_infos + .iter() + .map(|sst| sst.uncompressed_file_size) + .sum(); + Level { + level_idx, + level_type: LevelType::Nonoverlapping as i32, + table_infos, + total_file_size, + sub_level_id: 0, + uncompressed_file_size, + } + } + + /// Returns a `OverlappingLevel`, with each `table_infos`'s element placed in a nonoverlapping + /// sub-level. 
+ pub fn generate_l0_nonoverlapping_sublevels(table_infos: Vec) -> OverlappingLevel { + let total_file_size = table_infos.iter().map(|table| table.file_size).sum::(); + let uncompressed_file_size = table_infos + .iter() + .map(|table| table.uncompressed_file_size) + .sum::(); + OverlappingLevel { + sub_levels: table_infos + .into_iter() + .enumerate() + .map(|(idx, table)| Level { + level_idx: 0, + level_type: LevelType::Nonoverlapping as i32, + total_file_size: table.file_size, + uncompressed_file_size: table.uncompressed_file_size, + sub_level_id: idx as u64, + table_infos: vec![table], + }) + .collect_vec(), + total_file_size, + uncompressed_file_size, + } + } + + pub fn generate_l0_nonoverlapping_multi_sublevels( + table_infos: Vec>, + ) -> OverlappingLevel { + let mut l0 = OverlappingLevel { + sub_levels: table_infos + .into_iter() + .enumerate() + .map(|(idx, table)| Level { + level_idx: 0, + level_type: LevelType::Nonoverlapping as i32, + total_file_size: table.iter().map(|table| table.file_size).sum::(), + uncompressed_file_size: table + .iter() + .map(|sst| sst.uncompressed_file_size) + .sum::(), + sub_level_id: idx as u64, + table_infos: table, + }) + .collect_vec(), + total_file_size: 0, + uncompressed_file_size: 0, + }; + + l0.total_file_size = l0.sub_levels.iter().map(|l| l.total_file_size).sum::(); + l0.uncompressed_file_size = l0 + .sub_levels + .iter() + .map(|l| l.uncompressed_file_size) + .sum::(); + l0 + } + + /// Returns a `OverlappingLevel`, with each `table_infos`'s element placed in a overlapping + /// sub-level. + pub fn generate_l0_overlapping_sublevels( + table_infos: Vec>, + ) -> OverlappingLevel { + let mut l0 = OverlappingLevel { + sub_levels: table_infos + .into_iter() + .enumerate() + .map(|(idx, table)| Level { + level_idx: 0, + level_type: LevelType::Overlapping as i32, + total_file_size: table.iter().map(|table| table.file_size).sum::(), + sub_level_id: idx as u64, + table_infos: table.clone(), + uncompressed_file_size: table + .iter() + .map(|sst| sst.uncompressed_file_size) + .sum::(), + }) + .collect_vec(), + total_file_size: 0, + uncompressed_file_size: 0, + }; + l0.total_file_size = l0.sub_levels.iter().map(|l| l.total_file_size).sum::(); + l0.uncompressed_file_size = l0 + .sub_levels + .iter() + .map(|l| l.uncompressed_file_size) + .sum::(); + l0 + } + + pub fn assert_compaction_task(compact_task: &CompactionTask, level_handlers: &[LevelHandler]) { + for i in &compact_task.input.input_levels { + for t in &i.table_infos { + assert!(level_handlers[i.level_idx as usize].is_pending_compact(&t.sst_id)); + } + } + } +} diff --git a/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs b/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs new file mode 100644 index 0000000000000..48941a4273d66 --- /dev/null +++ b/src/meta/src/hummock/compaction/selector/space_reclaim_selector.rs @@ -0,0 +1,74 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
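The test helpers consolidated into selector/mod.rs above (generate_table, generate_tables, generate_level, generate_l0_nonoverlapping_sublevels, assert_compaction_task) are what the relocated per-selector tests import. The hypothetical fixture below shows how they compose into a Levels value; it assumes Levels implements Default, which holds for prost-generated messages, and the ids, key range, epoch, and sizes are arbitrary.

#[cfg(test)]
mod fixture_example {
    use risingwave_pb::hummock::hummock_version::Levels;

    use crate::hummock::compaction::selector::tests::{
        generate_l0_nonoverlapping_sublevels, generate_level, generate_tables,
    };

    #[test]
    fn build_levels_fixture() {
        // Six nonoverlapping L0 sub-levels (sst ids 0..6 over keys 0..600, epoch 1,
        // 10 bytes each) plus a single 100-byte SST in L1.
        let levels = Levels {
            l0: Some(generate_l0_nonoverlapping_sublevels(generate_tables(
                0..6,
                0..600,
                1,
                10,
            ))),
            levels: vec![generate_level(1, generate_tables(10..11, 0..600, 1, 100))],
            ..Default::default()
        };
        assert_eq!(levels.l0.as_ref().unwrap().sub_levels.len(), 6);
        assert_eq!(levels.levels[0].total_file_size, 100);
    }
}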
+// +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +use std::collections::HashMap; + +use risingwave_common::catalog::TableOption; +use risingwave_hummock_sdk::HummockCompactionTaskId; +use risingwave_pb::hummock::compact_task; +use risingwave_pb::hummock::hummock_version::Levels; + +use super::{CompactionSelector, DynamicLevelSelectorCore}; +use crate::hummock::compaction::picker::{SpaceReclaimCompactionPicker, SpaceReclaimPickerState}; +use crate::hummock::compaction::{create_compaction_task, CompactionTask, LocalSelectorStatistic}; +use crate::hummock::level_handler::LevelHandler; +use crate::hummock::model::CompactionGroup; + +#[derive(Default)] +pub struct SpaceReclaimCompactionSelector { + state: HashMap, +} + +impl CompactionSelector for SpaceReclaimCompactionSelector { + fn pick_compaction( + &mut self, + task_id: HummockCompactionTaskId, + group: &CompactionGroup, + levels: &Levels, + level_handlers: &mut [LevelHandler], + _selector_stats: &mut LocalSelectorStatistic, + _table_id_to_options: HashMap, + ) -> Option { + let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); + let mut picker = SpaceReclaimCompactionPicker::new( + group.compaction_config.max_space_reclaim_bytes, + levels.member_table_ids.iter().cloned().collect(), + ); + let ctx = dynamic_level_core.calculate_level_base_size(levels); + let state = self.state.entry(group.group_id).or_default(); + + let compaction_input = picker.pick_compaction(levels, level_handlers, state)?; + compaction_input.add_pending_task(task_id, level_handlers); + + Some(create_compaction_task( + dynamic_level_core.get_config(), + compaction_input, + ctx.base_level, + self.task_type(), + )) + } + + fn name(&self) -> &'static str { + "SpaceReclaimCompaction" + } + + fn task_type(&self) -> compact_task::TaskType { + compact_task::TaskType::SpaceReclaim + } +} diff --git a/src/meta/src/hummock/compaction/tombstone_compaction_selector.rs b/src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs similarity index 88% rename from src/meta/src/hummock/compaction/tombstone_compaction_selector.rs rename to src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs index f6a26dcc13013..505c9b47e30c9 100644 --- a/src/meta/src/hummock/compaction/tombstone_compaction_selector.rs +++ b/src/meta/src/hummock/compaction/selector/tombstone_compaction_selector.rs @@ -19,12 +19,12 @@ use risingwave_hummock_sdk::HummockCompactionTaskId; use risingwave_pb::hummock::compact_task; use risingwave_pb::hummock::hummock_version::Levels; +use super::{CompactionSelector, DynamicLevelSelectorCore}; use crate::hummock::compaction::picker::{ TombstoneReclaimCompactionPicker, TombstoneReclaimPickerState, }; use crate::hummock::compaction::{ - create_compaction_task, create_overlap_strategy, CompactionTask, DynamicLevelSelectorCore, - LevelSelector, LocalSelectorStatistic, + create_compaction_task, create_overlap_strategy, CompactionTask, LocalSelectorStatistic, }; use crate::hummock::level_handler::LevelHandler; use crate::hummock::model::CompactionGroup; @@ -34,7 +34,7 @@ pub struct TombstoneCompactionSelector { state: HashMap, } -impl LevelSelector for TombstoneCompactionSelector { +impl CompactionSelector for TombstoneCompactionSelector { fn pick_compaction( &mut self, task_id: 
HummockCompactionTaskId, @@ -44,11 +44,15 @@ impl LevelSelector for TombstoneCompactionSelector { _selector_stats: &mut LocalSelectorStatistic, _table_id_to_options: HashMap, ) -> Option { + if group.compaction_config.tombstone_reclaim_ratio == 0 { + // it might cause full-compaction when tombstone_reclaim_ratio == 0 + return None; + } + let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); let ctx = dynamic_level_core.calculate_level_base_size(levels); let picker = TombstoneReclaimCompactionPicker::new( create_overlap_strategy(group.compaction_config.compaction_mode()), - group.compaction_config.max_compaction_bytes, group.compaction_config.tombstone_reclaim_ratio as u64, group.compaction_config.tombstone_reclaim_ratio as u64 / 2, ); diff --git a/src/meta/src/hummock/compaction/selector/ttl_selector.rs b/src/meta/src/hummock/compaction/selector/ttl_selector.rs new file mode 100644 index 0000000000000..ded292ef2021e --- /dev/null +++ b/src/meta/src/hummock/compaction/selector/ttl_selector.rs @@ -0,0 +1,70 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
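The early return added above disables TombstoneCompactionSelector whenever tombstone_reclaim_ratio is 0, since a zero ratio could otherwise degenerate into full compaction. Turning it on goes through the new MutableConfig::TombstoneReclaimRatio arm handled later in this diff; the fragment below is a hedged sketch, with the group id and ratio purely illustrative and the update_compaction_config entry point assumed from the compaction_group_manager.rs hunk.

use risingwave_pb::hummock::rise_ctl_update_compaction_config_request::mutable_config::MutableConfig;

// Enable tombstone-reclaim compaction for group 2 with an illustrative ratio of 40
// (0 keeps the selector disabled, per the guard above).
let updated_groups = hummock_manager
    .update_compaction_config(&[2], &[MutableConfig::TombstoneReclaimRatio(40)])
    .await?;
// After this patch the call returns the updated CompactionGroup configs, which the
// caller can use to refresh any cached state.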
+ +use std::collections::HashMap; + +use risingwave_common::catalog::TableOption; +use risingwave_hummock_sdk::HummockCompactionTaskId; +use risingwave_pb::hummock::compact_task; +use risingwave_pb::hummock::hummock_version::Levels; + +use super::{CompactionSelector, DynamicLevelSelectorCore}; +use crate::hummock::compaction::picker::{TtlPickerState, TtlReclaimCompactionPicker}; +use crate::hummock::compaction::{create_compaction_task, CompactionTask, LocalSelectorStatistic}; +use crate::hummock::level_handler::LevelHandler; +use crate::hummock::model::CompactionGroup; + +#[derive(Default)] +pub struct TtlCompactionSelector { + state: HashMap, +} + +impl CompactionSelector for TtlCompactionSelector { + fn pick_compaction( + &mut self, + task_id: HummockCompactionTaskId, + group: &CompactionGroup, + levels: &Levels, + level_handlers: &mut [LevelHandler], + _selector_stats: &mut LocalSelectorStatistic, + table_id_to_options: HashMap, + ) -> Option { + let dynamic_level_core = DynamicLevelSelectorCore::new(group.compaction_config.clone()); + let ctx = dynamic_level_core.calculate_level_base_size(levels); + let picker = TtlReclaimCompactionPicker::new(table_id_to_options); + let state = self.state.entry(group.group_id).or_default(); + let compaction_input = picker.pick_compaction(levels, level_handlers, state)?; + compaction_input.add_pending_task(task_id, level_handlers); + + Some(create_compaction_task( + group.compaction_config.as_ref(), + compaction_input, + ctx.base_level, + self.task_type(), + )) + } + + fn name(&self) -> &'static str { + "TtlCompaction" + } + + fn task_type(&self) -> compact_task::TaskType { + compact_task::TaskType::Ttl + } +} diff --git a/src/meta/src/hummock/compactor_manager.rs b/src/meta/src/hummock/compactor_manager.rs index c6dd5e2f82387..c3e62d98a190f 100644 --- a/src/meta/src/hummock/compactor_manager.rs +++ b/src/meta/src/hummock/compactor_manager.rs @@ -453,7 +453,7 @@ mod tests { use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId; use risingwave_pb::hummock::CompactTaskProgress; - use crate::hummock::compaction::default_level_selector; + use crate::hummock::compaction::selector::default_compaction_selector; use crate::hummock::test_utils::{ add_ssts, register_table_ids_to_compaction_group, setup_compute_env, }; @@ -477,7 +477,7 @@ mod tests { hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() diff --git a/src/meta/src/hummock/manager/checkpoint.rs b/src/meta/src/hummock/manager/checkpoint.rs index 4e6bb094d5a59..6aa64292b9db1 100644 --- a/src/meta/src/hummock/manager/checkpoint.rs +++ b/src/meta/src/hummock/manager/checkpoint.rs @@ -36,7 +36,7 @@ const HUMMOCK_INIT_FLAG_KEY: &[u8] = b"hummock_init_flag"; impl HummockManager { /// # Panics /// if checkpoint is not found. 
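The compactor_manager test just above shows the renamed entry point: callers now box a CompactionSelector and hand it to HummockManager::get_compact_task, so every former default_level_selector() call site becomes default_compaction_selector(). The fragment below restates that call shape; it assumes an async test body where hummock_manager comes from setup_compute_env(), with error handling trimmed.

use risingwave_hummock_sdk::compaction_group::StaticCompactionGroupId;

use crate::hummock::compaction::selector::default_compaction_selector;

// default_compaction_selector() boxes a DynamicLevelSelector.
let compact_task = hummock_manager
    .get_compact_task(
        StaticCompactionGroupId::StateDefault.into(),
        &mut default_compaction_selector(),
    )
    .await
    .unwrap();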
- pub(crate) async fn read_checkpoint(&self) -> Result { + pub async fn read_checkpoint(&self) -> Result { use prost::Message; let data = match self .object_store @@ -173,23 +173,23 @@ impl HummockManager { .map_err(Into::into) } - pub(crate) fn pause_version_checkpoint(&self) { + pub fn pause_version_checkpoint(&self) { self.pause_version_checkpoint.store(true, Ordering::Relaxed); tracing::info!("hummock version checkpoint is paused."); } - pub(crate) fn resume_version_checkpoint(&self) { + pub fn resume_version_checkpoint(&self) { self.pause_version_checkpoint .store(false, Ordering::Relaxed); tracing::info!("hummock version checkpoint is resumed."); } - pub(crate) fn is_version_checkpoint_paused(&self) -> bool { + pub fn is_version_checkpoint_paused(&self) -> bool { self.pause_version_checkpoint.load(Ordering::Relaxed) } #[named] - pub(crate) async fn get_checkpoint_version(&self) -> HummockVersion { + pub async fn get_checkpoint_version(&self) -> HummockVersion { let versioning_guard = read_lock!(self, versioning).await; versioning_guard .checkpoint diff --git a/src/meta/src/hummock/manager/compaction.rs b/src/meta/src/hummock/manager/compaction.rs index db34da26e62fd..3701daaa4c6ee 100644 --- a/src/meta/src/hummock/manager/compaction.rs +++ b/src/meta/src/hummock/manager/compaction.rs @@ -19,6 +19,8 @@ use itertools::Itertools; use risingwave_hummock_sdk::{CompactionGroupId, HummockCompactionTaskId}; use risingwave_pb::hummock::{CompactStatus as PbCompactStatus, CompactTaskAssignment}; +use crate::hummock::compaction::selector::level_selector::PickerInfo; +use crate::hummock::compaction::selector::DynamicLevelSelectorCore; use crate::hummock::compaction::CompactStatus; use crate::hummock::manager::read_lock; use crate::hummock::HummockManager; @@ -71,4 +73,29 @@ impl HummockManager { .collect(), ) } + + #[named] + pub async fn get_compaction_scores( + &self, + compaction_group_id: CompactionGroupId, + ) -> Vec { + let (status, levels, config) = { + let compaction = read_lock!(self, compaction).await; + let versioning = read_lock!(self, versioning).await; + let config_manager = self.compaction_group_manager.read().await; + match ( + compaction.compaction_statuses.get(&compaction_group_id), + versioning.current_version.levels.get(&compaction_group_id), + config_manager.try_get_compaction_group_config(compaction_group_id), + ) { + (Some(cs), Some(v), Some(cf)) => (cs.to_owned(), v.to_owned(), cf), + _ => { + return vec![]; + } + } + }; + let dynamic_level_core = DynamicLevelSelectorCore::new(config.compaction_config); + let ctx = dynamic_level_core.get_priority_levels(&levels, &status.level_handlers); + ctx.score_levels + } } diff --git a/src/meta/src/hummock/manager/compaction_group_manager.rs b/src/meta/src/hummock/manager/compaction_group_manager.rs index 3e83937897b3d..f3853c8d08df5 100644 --- a/src/meta/src/hummock/manager/compaction_group_manager.rs +++ b/src/meta/src/hummock/manager/compaction_group_manager.rs @@ -255,7 +255,7 @@ impl HummockManager { assert!(sst_split_info.is_empty()); let mut trx = Transaction::default(); - new_version_delta.apply_to_txn(&mut trx)?; + new_version_delta.apply_to_txn(&mut trx).await?; self.env.meta_store().txn(trx).await?; versioning.current_version = current_version; new_version_delta.commit(); @@ -350,7 +350,7 @@ impl HummockManager { assert!(sst_split_info.is_empty()); let mut trx = Transaction::default(); - new_version_delta.apply_to_txn(&mut trx)?; + new_version_delta.apply_to_txn(&mut trx).await?; self.env.meta_store().txn(trx).await?; for 
group_id in &groups_to_remove { let max_level = versioning @@ -386,8 +386,9 @@ impl HummockManager { &self, compaction_group_ids: &[CompactionGroupId], config_to_update: &[MutableConfig], - ) -> Result<()> { - self.compaction_group_manager + ) -> Result> { + let result = self + .compaction_group_manager .write() .await .update_compaction_config( @@ -402,7 +403,7 @@ impl HummockManager { { self.try_update_write_limits(compaction_group_ids).await; } - Ok(()) + Ok(result) } /// Gets complete compaction group info. @@ -475,6 +476,7 @@ impl HummockManager { ))); } } + if table_ids.len() == parent_group.member_table_ids.len() { return Err(Error::CompactionGroup(format!( "invalid split attempt for group {}: all member tables are moved", @@ -593,11 +595,13 @@ impl HummockManager { new_compaction_group_id } }; + let mut current_version = versioning.current_version.clone(); let sst_split_info = current_version.apply_version_delta(&new_version_delta); - let mut branched_ssts = BTreeMapTransaction::new(&mut versioning.branched_ssts); + + let mut branched_ssts = BTreeMapTransaction::<'_, _, _>::new(&mut versioning.branched_ssts); let mut trx = Transaction::default(); - new_version_delta.apply_to_txn(&mut trx)?; + new_version_delta.apply_to_txn(&mut trx).await?; if let Some((new_compaction_group_id, config)) = new_group { let mut compaction_group_manager = self.compaction_group_manager.write().await; let insert = BTreeMapEntryTransaction::new_insert( @@ -608,7 +612,7 @@ impl HummockManager { compaction_config: Arc::new(config), }, ); - insert.apply_to_txn(&mut trx)?; + insert.apply_to_txn(&mut trx).await?; self.env.meta_store().txn(trx).await?; insert.commit(); } else { @@ -652,10 +656,17 @@ impl HummockManager { } } } - for mut task in canceled_tasks { - task.set_task_status(TaskStatus::ManualCanceled); + + for task in canceled_tasks { if !self - .report_compact_task_impl(&mut task, &mut compaction_guard, None) + .report_compact_task_impl( + task.task_id, + None, + TaskStatus::ManualCanceled, + vec![], + &mut compaction_guard, + None, + ) .await .unwrap_or(false) { @@ -769,7 +780,7 @@ impl CompactionGroupManager { compaction_groups.insert(*id, new_entry); } let mut trx = Transaction::default(); - compaction_groups.apply_to_txn(&mut trx)?; + compaction_groups.apply_to_txn(&mut trx).await?; meta_store.txn(trx).await?; compaction_groups.commit(); let r = compaction_group_ids @@ -791,13 +802,14 @@ impl CompactionGroupManager { self.default_config.clone() } - async fn update_compaction_config( + pub async fn update_compaction_config( &mut self, compaction_group_ids: &[CompactionGroupId], config_to_update: &[MutableConfig], meta_store: &S, - ) -> Result<()> { + ) -> Result> { let mut compaction_groups = BTreeMapTransaction::new(&mut self.compaction_groups); + let mut result = Vec::with_capacity(compaction_group_ids.len()); for compaction_group_id in compaction_group_ids.iter().unique() { let group = compaction_groups.get(compaction_group_id).ok_or_else(|| { Error::CompactionGroup(format!("invalid group {}", *compaction_group_id)) @@ -809,14 +821,15 @@ impl CompactionGroupManager { } let mut new_group = group.clone(); new_group.compaction_config = Arc::new(config); - compaction_groups.insert(*compaction_group_id, new_group); + compaction_groups.insert(*compaction_group_id, new_group.clone()); + result.push(new_group); } let mut trx = Transaction::default(); - compaction_groups.apply_to_txn(&mut trx)?; + compaction_groups.apply_to_txn(&mut trx).await?; meta_store.txn(trx).await?; compaction_groups.commit(); - 
Ok(()) + Ok(result) } /// Initializes the config for a group. @@ -836,7 +849,7 @@ impl CompactionGroupManager { }, ); let mut trx = Transaction::default(); - insert.apply_to_txn(&mut trx)?; + insert.apply_to_txn(&mut trx).await?; meta_store.txn(trx).await?; insert.commit(); Ok(()) @@ -862,7 +875,7 @@ impl CompactionGroupManager { compaction_groups.remove(group); } let mut trx = Transaction::default(); - compaction_groups.apply_to_txn(&mut trx)?; + compaction_groups.apply_to_txn(&mut trx).await?; meta_store.txn(trx).await?; compaction_groups.commit(); Ok(()) @@ -914,6 +927,9 @@ fn update_compaction_config(target: &mut CompactionConfig, items: &[MutableConfi MutableConfig::EnableEmergencyPicker(c) => { target.enable_emergency_picker = *c; } + MutableConfig::TombstoneReclaimRatio(c) => { + target.tombstone_reclaim_ratio = *c; + } } } } diff --git a/src/meta/src/hummock/manager/context.rs b/src/meta/src/hummock/manager/context.rs index 21751bb968421..b069a31ce5bd3 100644 --- a/src/meta/src/hummock/manager/context.rs +++ b/src/meta/src/hummock/manager/context.rs @@ -112,7 +112,7 @@ impl HummockManager { Ok(invalid_context_ids) } - pub(crate) async fn commit_epoch_sanity_check( + pub async fn commit_epoch_sanity_check( &self, epoch: HummockEpoch, sstables: &Vec, diff --git a/src/meta/src/hummock/manager/mod.rs b/src/meta/src/hummock/manager/mod.rs index 4cc4c7bb6a771..2b0c3e3db87dc 100644 --- a/src/meta/src/hummock/manager/mod.rs +++ b/src/meta/src/hummock/manager/mod.rs @@ -28,6 +28,7 @@ use futures::stream::{BoxStream, FuturesUnordered}; use futures::{FutureExt, StreamExt}; use itertools::Itertools; use parking_lot::Mutex; +use risingwave_common::config::default::compaction_config; use risingwave_common::monitor::rwlock::MonitoredRwLock; use risingwave_common::util::epoch::{Epoch, INVALID_EPOCH}; use risingwave_common::util::{pending_on_none, select_all}; @@ -44,6 +45,7 @@ use risingwave_hummock_sdk::{ }; use risingwave_pb::hummock::compact_task::{self, TaskStatus, TaskType}; use risingwave_pb::hummock::group_delta::DeltaType; +use risingwave_pb::hummock::rise_ctl_update_compaction_config_request::mutable_config; use risingwave_pb::hummock::subscribe_compaction_event_request::{ Event as RequestEvent, HeartBeat, PullTask, ReportTask, }; @@ -54,7 +56,7 @@ use risingwave_pb::hummock::{ version_update_payload, CompactTask, CompactTaskAssignment, CompactionConfig, GroupDelta, HummockPinnedSnapshot, HummockPinnedVersion, HummockSnapshot, HummockVersion, HummockVersionCheckpoint, HummockVersionDelta, HummockVersionDeltas, HummockVersionStats, - IntraLevelDelta, SubscribeCompactionEventRequest, TableOption, + IntraLevelDelta, SstableInfo, SubscribeCompactionEventRequest, TableOption, }; use risingwave_pb::meta::subscribe_response::{Info, Operation}; use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; @@ -65,10 +67,11 @@ use tokio_stream::wrappers::IntervalStream; use tonic::Streaming; use tracing::warn; -use crate::hummock::compaction::{ - CompactStatus, EmergencySelector, LocalSelectorStatistic, ManualCompactionOption, - TombstoneCompactionSelector, +use crate::hummock::compaction::selector::{ + DynamicLevelSelector, LocalSelectorStatistic, ManualCompactionOption, ManualCompactionSelector, + SpaceReclaimCompactionSelector, TombstoneCompactionSelector, TtlCompactionSelector, }; +use crate::hummock::compaction::CompactStatus; use crate::hummock::error::{Error, Result}; use crate::hummock::metrics_utils::{ trigger_delta_log_stats, trigger_lsm_stat, trigger_mv_stat, 
trigger_pin_unpin_snapshot_state, @@ -114,14 +117,13 @@ pub struct HummockManager { catalog_manager: CatalogManagerRef, fragment_manager: FragmentManagerRef, - // `CompactionGroupManager` manages `CompactionGroup`'s members. - // Note that all hummock state store user should register to `CompactionGroupManager`. It - // includes all state tables of streaming jobs except sink. - compaction_group_manager: tokio::sync::RwLock, - // When trying to locks compaction and versioning at the same time, compaction lock should - // be requested before versioning lock. + /// Lock order: compaction, versioning, compaction_group_manager. + /// - Lock compaction first, then versioning, and finally compaction_group_manager. + /// - This order should be strictly followed to prevent deadlock. compaction: MonitoredRwLock, versioning: MonitoredRwLock, + /// `CompactionGroupManager` manages compaction configs for compaction groups. + compaction_group_manager: tokio::sync::RwLock, latest_snapshot: Snapshot, pub metrics: Arc, @@ -156,7 +158,7 @@ macro_rules! commit_multi_var { let mut trx = $trx_extern_part; // Apply the change in `ValTransaction` to trx $( - $val_txn.apply_to_txn(&mut trx)?; + $val_txn.apply_to_txn(&mut trx).await?; )* // Commit to state store $hummock_mgr.commit_trx($hummock_mgr.env.meta_store(), trx, $context_id) @@ -225,10 +227,6 @@ macro_rules! start_measure_real_process_timer { } pub(crate) use start_measure_real_process_timer; -use super::compaction::{ - DynamicLevelSelector, LevelSelector, ManualCompactionSelector, SpaceReclaimCompactionSelector, - TtlCompactionSelector, -}; use crate::hummock::manager::compaction_group_manager::CompactionGroupManager; use crate::hummock::manager::worker::HummockManagerEventSender; @@ -254,7 +252,7 @@ pub enum CompactionResumeTrigger { } impl HummockManager { - pub(crate) async fn new( + pub async fn new( env: MetaSrvEnv, cluster_manager: ClusterManagerRef, fragment_manager: FragmentManagerRef, @@ -354,7 +352,12 @@ impl HummockManager { if let risingwave_object_store::object::ObjectStoreImpl::S3(s3) = object_store.as_ref() && !env.opts.do_not_config_object_storage_lifecycle { - s3.inner().configure_bucket_lifecycle().await; + let is_bucket_expiration_configured = s3.inner().configure_bucket_lifecycle().await; + if is_bucket_expiration_configured{ + return Err(ObjectError::internal("Cluster cannot start with object expiration configured for bucket because RisingWave data will be lost when object expiration kicks in. + Please disable object expiration and restart the cluster.") + .into()); + } } } let checkpoint_path = version_checkpoint_path(state_store_dir); @@ -438,29 +441,16 @@ impl HummockManager { .collect(); let mut redo_state = if self.need_init().await? { - // For backward compatibility, try to read checkpoint from meta store. - let versions = HummockVersion::list(self.env.meta_store()).await?; - let checkpoint_version = if !versions.is_empty() { - let checkpoint = versions.into_iter().next().unwrap(); - tracing::warn!( - "read hummock version checkpoint from meta store: {:#?}", - checkpoint - ); - checkpoint - } else { - // As no record found in stores, create a initial version. 
- let default_compaction_config = self - .compaction_group_manager - .read() - .await - .default_compaction_config(); - let checkpoint = create_init_version(default_compaction_config); - tracing::info!("init hummock version checkpoint"); - HummockVersionStats::default() - .insert(self.env.meta_store()) - .await?; - checkpoint - }; + let default_compaction_config = self + .compaction_group_manager + .read() + .await + .default_compaction_config(); + let checkpoint_version = create_init_version(default_compaction_config); + tracing::info!("init hummock version checkpoint"); + HummockVersionStats::default() + .insert(self.env.meta_store()) + .await?; versioning_guard.checkpoint = HummockVersionCheckpoint { version: Some(checkpoint_version.clone()), stale_objects: Default::default(), @@ -514,7 +504,7 @@ impl HummockManager { versioning_guard.mark_objects_for_deletion(); let all_group_ids = get_compaction_group_ids(&versioning_guard.current_version); - let configs = self + let mut configs = self .compaction_group_manager .write() .await @@ -523,6 +513,46 @@ impl HummockManager { self.env.meta_store(), ) .await?; + + // We've already lowered the default limit for write limit in PR-12183, and to prevent older clusters from continuing to use the outdated configuration, we've introduced a new logic to rewrite it in a uniform way. + let mut rewrite_cg_ids = vec![]; + for (cg_id, compaction_group_config) in &mut configs { + // update write limit + let relaxed_default_write_stop_level_count = 1000; + if compaction_group_config + .compaction_config + .level0_sub_level_compact_level_count + == relaxed_default_write_stop_level_count + { + rewrite_cg_ids.push(*cg_id); + } + } + + if !rewrite_cg_ids.is_empty() { + tracing::info!("Compaction group {:?} configs rewrite ", rewrite_cg_ids); + + // update meta store + let result = self + .compaction_group_manager + .write() + .await + .update_compaction_config( + &rewrite_cg_ids, + &[ + mutable_config::MutableConfig::Level0StopWriteThresholdSubLevelNumber( + compaction_config::level0_stop_write_threshold_sub_level_number(), + ), + ], + self.env.meta_store(), + ) + .await?; + + // update memory + for new_config in result { + configs.insert(new_config.group_id(), new_config); + } + } + versioning_guard.write_limit = calc_new_write_limits(configs, HashMap::new(), &versioning_guard.current_version); trigger_write_stop_stats(&self.metrics, &versioning_guard.write_limit); @@ -777,7 +807,7 @@ impl HummockManager { pub async fn get_compact_task_impl( &self, compaction_group_id: CompactionGroupId, - selector: &mut Box, + selector: &mut Box, ) -> Result> { // TODO: `get_all_table_options` will hold catalog_manager async lock, to avoid holding the // lock in compaction_guard, take out all table_options in advance there may be a @@ -822,7 +852,6 @@ impl HummockManager { return Ok(None); } }; - let (current_version, watermark) = { let versioning_guard = read_lock!(self, versioning).await; let max_committed_epoch = versioning_guard.current_version.max_committed_epoch; @@ -831,6 +860,7 @@ impl HummockManager { .values() .map(|v| v.minimal_pinned_snapshot) .fold(max_committed_epoch, std::cmp::min); + (versioning_guard.current_version.clone(), watermark) }; if current_version.levels.get(&compaction_group_id).is_none() { @@ -865,6 +895,7 @@ impl HummockManager { } Some(task) => task, }; + compact_task.watermark = watermark; compact_task.existing_table_ids = current_version .levels @@ -877,8 +908,15 @@ impl HummockManager { if is_trivial_reclaim { 
compact_task.set_task_status(TaskStatus::Success); - self.report_compact_task_impl(&mut compact_task, &mut compaction_guard, None) - .await?; + self.report_compact_task_impl( + task_id, + Some(compact_task.clone()), + TaskStatus::Success, + vec![], + &mut compaction_guard, + None, + ) + .await?; tracing::debug!( "TrivialReclaim for compaction group {}: remove {} sstables, cost time: {:?}", compaction_group_id, @@ -890,11 +928,19 @@ impl HummockManager { start_time.elapsed() ); } else if is_trivial_move && can_trivial_move { - compact_task.sorted_output_ssts = compact_task.input_ssts[0].table_infos.clone(); // this task has been finished and `trivial_move_task` does not need to be schedule. compact_task.set_task_status(TaskStatus::Success); - self.report_compact_task_impl(&mut compact_task, &mut compaction_guard, None) - .await?; + compact_task.sorted_output_ssts = compact_task.input_ssts[0].table_infos.clone(); + self.report_compact_task_impl( + task_id, + Some(compact_task.clone()), + TaskStatus::Success, + compact_task.input_ssts[0].table_infos.clone(), + &mut compaction_guard, + None, + ) + .await?; + tracing::debug!( "TrivialMove for compaction group {}: pick up {} sstables in level {} to compact to target_level {} cost time: {:?}", compaction_group_id, @@ -1008,24 +1054,30 @@ impl HummockManager { } /// Cancels a compaction task no matter it's assigned or unassigned. - pub async fn cancel_compact_task( - &self, - compact_task: &mut CompactTask, - task_status: TaskStatus, - ) -> Result { - compact_task.set_task_status(task_status); + pub async fn cancel_compact_task(&self, task_id: u64, task_status: TaskStatus) -> Result { fail_point!("fp_cancel_compact_task", |_| Err(Error::MetaStore( anyhow::anyhow!("failpoint metastore err") ))); - self.cancel_compact_task_impl(compact_task).await + self.cancel_compact_task_impl(task_id, task_status).await } #[named] - pub async fn cancel_compact_task_impl(&self, compact_task: &mut CompactTask) -> Result { - assert!(CANCEL_STATUS_SET.contains(&compact_task.task_status())); + pub async fn cancel_compact_task_impl( + &self, + task_id: u64, + task_status: TaskStatus, + ) -> Result { + assert!(CANCEL_STATUS_SET.contains(&task_status)); let mut compaction_guard = write_lock!(self, compaction).await; let ret = self - .report_compact_task_impl(compact_task, &mut compaction_guard, None) + .report_compact_task_impl( + task_id, + None, + task_status, + vec![], + &mut compaction_guard, + None, + ) .await?; #[cfg(test)] { @@ -1057,7 +1109,7 @@ impl HummockManager { pub async fn get_compact_task( &self, compaction_group_id: CompactionGroupId, - selector: &mut Box, + selector: &mut Box, ) -> Result> { fail_point!("fp_get_compact_task", |_| Err(Error::MetaStore( anyhow::anyhow!("failpoint metastore error") @@ -1084,7 +1136,7 @@ impl HummockManager { compaction_group_id: CompactionGroupId, manual_compaction_option: ManualCompactionOption, ) -> Result> { - let mut selector: Box = + let mut selector: Box = Box::new(ManualCompactionSelector::new(manual_compaction_option)); self.get_compact_task(compaction_group_id, &mut selector) .await @@ -1112,19 +1164,21 @@ impl HummockManager { #[named] pub async fn report_compact_task( &self, - compact_task: &mut CompactTask, + task_id: u64, + task_status: TaskStatus, + sorted_output_ssts: Vec, table_stats_change: Option, ) -> Result { let mut guard = write_lock!(self, compaction).await; - let ret = self - .report_compact_task_impl(compact_task, &mut guard, table_stats_change) - .await?; - #[cfg(test)] - { - drop(guard); - 
self.check_state_consistency().await; - } - Ok(ret) + self.report_compact_task_impl( + task_id, + None, + task_status, + sorted_output_ssts, + &mut guard, + table_stats_change, + ) + .await } /// Finishes or cancels a compaction task, according to `task_status`. @@ -1137,7 +1191,10 @@ impl HummockManager { #[named] pub async fn report_compact_task_impl( &self, - compact_task: &mut CompactTask, + task_id: u64, + trivial_move_compact_task: Option, + task_status: TaskStatus, + sorted_output_ssts: Vec, compaction_guard: &mut RwLockWriteGuard<'_, Compaction>, table_stats_change: Option, ) -> Result { @@ -1149,18 +1206,28 @@ impl HummockManager { let mut compact_task_assignment = BTreeMapTransaction::new(&mut compaction.compact_task_assignment); - let is_trivial_reclaim = CompactStatus::is_trivial_reclaim(compact_task); - let is_trivial_move = CompactStatus::is_trivial_move_task(compact_task); - // remove task_assignment - if compact_task_assignment - .remove(compact_task.task_id) - .is_none() - && !(is_trivial_reclaim || is_trivial_move) + let mut compact_task = if let Some(input_task) = trivial_move_compact_task { + input_task + } else { + match compact_task_assignment.remove(task_id) { + Some(compact_task) => compact_task.compact_task.unwrap(), + None => { + tracing::warn!("{}", format!("compact task {} not found", task_id)); + return Ok(false); + } + } + }; + { - return Ok(false); + // apply result + compact_task.set_task_status(task_status); + compact_task.sorted_output_ssts = sorted_output_ssts; } + let is_trivial_reclaim = CompactStatus::is_trivial_reclaim(&compact_task); + let is_trivial_move = CompactStatus::is_trivial_move_task(&compact_task); + { // The compaction task is finished. let mut versioning_guard = write_lock!(self, versioning).await; @@ -1175,7 +1242,7 @@ impl HummockManager { match compact_statuses.get_mut(compact_task.compaction_group_id) { Some(mut compact_status) => { - compact_status.report_compact_task(compact_task); + compact_status.report_compact_task(&compact_task); } None => { compact_task.set_task_status(TaskStatus::InvalidGroupCanceled); @@ -1199,7 +1266,7 @@ impl HummockManager { let is_success = if let TaskStatus::Success = compact_task.task_status() { // if member_table_ids changes, the data of sstable may stale. let is_expired = - Self::is_compact_task_expired(compact_task, &versioning.branched_ssts); + Self::is_compact_task_expired(&compact_task, &versioning.branched_ssts); if is_expired { compact_task.set_task_status(TaskStatus::InputOutdatedCanceled); false @@ -1214,7 +1281,7 @@ impl HummockManager { compact_task.set_task_status(TaskStatus::InvalidGroupCanceled); warn!( "The task may be expired because of group split, task:\n {:?}", - compact_task_to_string(compact_task) + compact_task_to_string(&compact_task) ); } input_exist @@ -1230,7 +1297,7 @@ impl HummockManager { &mut hummock_version_deltas, &mut branched_ssts, ¤t_version, - compact_task, + &compact_task, deterministic_mode, ); let mut version_stats = VarTransaction::new(&mut versioning.version_stats); @@ -1298,7 +1365,7 @@ impl HummockManager { tracing::trace!( "Reported compaction task. {}. 
cost time: {:?}", - compact_task_to_string(compact_task), + compact_task_to_string(&compact_task), start_time.elapsed(), ); @@ -1427,7 +1494,7 @@ impl HummockManager { .id_gen_manager() .generate_interval::<{ IdCategory::HummockSstableId }>(new_sst_id_number as u64) .await?; - let mut branched_ssts = BTreeMapTransaction::new(&mut versioning.branched_ssts); + let mut branched_ssts = BTreeMapTransaction::<'_, _, _>::new(&mut versioning.branched_ssts); let original_sstables = std::mem::take(&mut sstables); sstables.reserve_exact(original_sstables.len() - incorrect_ssts.len() + new_sst_id_number); let mut incorrect_ssts = incorrect_ssts.into_iter(); @@ -1478,6 +1545,7 @@ impl HummockManager { .into_iter() .map(|ExtendedSstableInfo { sst_info, .. }| sst_info) .collect_vec(); + let group_deltas = &mut new_version_delta .group_deltas .entry(compaction_group_id) @@ -1905,10 +1973,12 @@ impl HummockManager { Ok(()) } + #[cfg(any(test, feature = "test"))] pub fn compactor_manager_ref_for_test(&self) -> CompactorManagerRef { self.compactor_manager.clone() } + #[cfg(any(test, feature = "test"))] #[named] pub async fn compaction_task_from_assignment_for_test( &self, @@ -1919,6 +1989,31 @@ impl HummockManager { assignment_ref.get(&task_id).cloned() } + #[cfg(any(test, feature = "test"))] + #[named] + pub async fn report_compact_task_for_test( + &self, + task_id: u64, + compact_task: Option, + task_status: TaskStatus, + sorted_output_ssts: Vec, + table_stats_change: Option, + ) -> Result { + let mut guard = write_lock!(self, compaction).await; + + // In the test, the contents of the compact task may have been modified directly, while the contents of compact_task_assignment were not modified. + // So we pass the modified compact_task directly into the `report_compact_task_impl` + self.report_compact_task_impl( + task_id, + compact_task, + task_status, + sorted_output_ssts, + &mut guard, + table_stats_change, + ) + .await + } + pub fn cluster_manager(&self) -> &ClusterManagerRef { &self.cluster_manager } @@ -2188,13 +2283,12 @@ impl HummockManager { // side, and meta is just used as a last resort to clean up the // tasks that compactor has expired. 
- // - for mut task in + for task in compactor_manager.get_expired_tasks(Some(INTERVAL_SEC)) { if let Err(e) = hummock_manager .cancel_compact_task( - &mut task, + task.task_id, TaskStatus::HeartbeatCanceled, ) .await @@ -2555,7 +2649,7 @@ impl HummockManager { assert_ne!(0, pull_task_count); if let Some(compactor) = hummock_manager.compactor_manager.get_compactor(context_id) { if let Some((group, task_type)) = hummock_manager.auto_pick_compaction_group_and_type().await { - let selector: &mut Box = { + let selector: &mut Box = { let versioning_guard = read_lock!(hummock_manager, versioning).await; let versioning = versioning_guard.deref(); @@ -2578,9 +2672,7 @@ impl HummockManager { } else { compaction_selectors.get_mut(&task_type).unwrap() } - }; - for _ in 0..pull_task_count { let compact_task = hummock_manager @@ -2636,15 +2728,14 @@ impl HummockManager { }, RequestEvent::ReportTask(ReportTask { - compact_task, + task_id, + task_status, + sorted_output_ssts, table_stats_change }) => { - if let Some(mut compact_task) = compact_task { - if let Err(e) = hummock_manager - .report_compact_task(&mut compact_task, Some(table_stats_change)) + if let Err(e) = hummock_manager.report_compact_task(task_id, TaskStatus::try_from(task_status).unwrap(), sorted_output_ssts, Some(table_stats_change)) .await { tracing::error!("report compact_tack fail {e:?}"); - } } }, @@ -2655,7 +2746,7 @@ impl HummockManager { let cancel_tasks = compactor_manager.update_task_heartbeats(&progress); // TODO: task cancellation can be batched - for mut task in cancel_tasks { + for task in cancel_tasks { tracing::info!( "Task with task_id {} with context_id {} has expired due to lack of visible progress", context_id, @@ -2664,7 +2755,7 @@ impl HummockManager { if let Err(e) = hummock_manager - .cancel_compact_task(&mut task, TaskStatus::HeartbeatCanceled) + .cancel_compact_task(task.task_id, TaskStatus::HeartbeatCanceled) .await { tracing::error!("Attempt to remove compaction task due to elapsed heartbeat failed. We will continue to track its heartbeat @@ -2846,6 +2937,7 @@ fn gen_version_delta<'a>( group_deltas.push(group_delta); version_delta.gc_object_ids.append(&mut gc_object_ids); version_delta.safe_epoch = std::cmp::max(old_version.safe_epoch, compact_task.watermark); + // Don't persist version delta generated by compaction to meta store in deterministic mode. // Because it will override existing version delta that has same ID generated in the data // ingestion phase. 
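The hunks above change compaction-task reporting and cancellation from passing `&mut CompactTask` around to identifying tasks by `task_id`: callers (the `RequestEvent::ReportTask` handler, heartbeat expiration, manual cancel) send the id plus a `TaskStatus` and the output SSTs, and `report_compact_task_impl` looks the task up in `compact_task_assignment` itself, warning and returning `false` for an unknown id. Below is a minimal, self-contained sketch of that lookup-by-id shape; `ToyManager` and the `u64` stand-ins for `SstableInfo` are illustrative only, not the real `HummockManager` API.

```rust
use std::collections::BTreeMap;

#[derive(Debug, Clone, Copy, PartialEq)]
enum TaskStatus {
    Pending,
    Success,
    HeartbeatCanceled,
}

#[derive(Debug, Clone)]
struct CompactTask {
    task_id: u64,
    task_status: TaskStatus,
    // Stand-in for `Vec<SstableInfo>`.
    sorted_output_ssts: Vec<u64>,
}

#[derive(Default)]
struct ToyManager {
    // Stand-in for the `compact_task_assignment` map owned by the manager.
    assignment: BTreeMap<u64, CompactTask>,
}

impl ToyManager {
    /// Id-based reporting: the task is looked up in the manager's own map
    /// instead of being passed in as `&mut CompactTask`, then the reported
    /// status and output SSTs are applied to it.
    fn report_compact_task(
        &mut self,
        task_id: u64,
        task_status: TaskStatus,
        sorted_output_ssts: Vec<u64>,
    ) -> bool {
        let Some(mut task) = self.assignment.remove(&task_id) else {
            eprintln!("compact task {task_id} not found");
            return false;
        };
        // Apply the reported result.
        task.task_status = task_status;
        task.sorted_output_ssts = sorted_output_ssts;
        // Version deltas, stats updates, etc. are elided in this sketch.
        println!(
            "task {} finished as {:?} with {} output SSTs",
            task.task_id,
            task.task_status,
            task.sorted_output_ssts.len()
        );
        task.task_status == TaskStatus::Success
    }

    /// Cancellation is just a report with a cancel status, keyed by id.
    fn cancel_compact_task(&mut self, task_id: u64, task_status: TaskStatus) -> bool {
        self.report_compact_task(task_id, task_status, vec![])
    }
}

fn main() {
    let mut mgr = ToyManager::default();
    mgr.assignment.insert(
        42,
        CompactTask {
            task_id: 42,
            task_status: TaskStatus::Pending,
            sorted_output_ssts: vec![],
        },
    );
    assert!(mgr.report_compact_task(42, TaskStatus::Success, vec![11, 12]));
    // Reporting an unknown id degrades to a warning and `false`.
    assert!(!mgr.cancel_compact_task(7, TaskStatus::HeartbeatCanceled));
}
```

The shape mirrored here is that the manager owns the authoritative copy of the task, so reporting an unknown or already-removed id degrades to a warning rather than mutating caller-held state.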
@@ -2884,8 +2976,8 @@ async fn write_exclusive_cluster_id( } } -fn init_selectors() -> HashMap> { - let mut compaction_selectors: HashMap> = +fn init_selectors() -> HashMap> { + let mut compaction_selectors: HashMap> = HashMap::default(); compaction_selectors.insert( compact_task::TaskType::Dynamic, @@ -2903,7 +2995,6 @@ fn init_selectors() -> HashMap> { compact_task::TaskType::Tombstone, Box::::default(), ); - compaction_selectors.insert( compact_task::TaskType::Emergency, Box::::default(), @@ -2914,6 +3005,9 @@ fn init_selectors() -> HashMap> { type CompactionRequestChannelItem = (CompactionGroupId, compact_task::TaskType); use tokio::sync::mpsc::error::SendError; +use super::compaction::selector::EmergencySelector; +use super::compaction::CompactionSelector; + #[derive(Debug, Default)] pub struct CompactionState { scheduled: Mutex>, @@ -2955,6 +3049,8 @@ impl CompactionState { Some(compact_task::TaskType::SpaceReclaim) } else if guard.contains(&(group, compact_task::TaskType::Ttl)) { Some(compact_task::TaskType::Ttl) + } else if guard.contains(&(group, compact_task::TaskType::Tombstone)) { + Some(compact_task::TaskType::Tombstone) } else if guard.contains(&(group, compact_task::TaskType::Dynamic)) { Some(compact_task::TaskType::Dynamic) } else { diff --git a/src/meta/src/hummock/manager/tests.rs b/src/meta/src/hummock/manager/tests.rs index 596149df3b8aa..4d5de0cc19011 100644 --- a/src/meta/src/hummock/manager/tests.rs +++ b/src/meta/src/hummock/manager/tests.rs @@ -39,10 +39,11 @@ use risingwave_pb::hummock::{ use risingwave_pb::meta::add_worker_node_request::Property; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; -use crate::hummock::compaction::{ - default_level_selector, CompactStatus, LevelSelector, ManualCompactionOption, +use crate::hummock::compaction::selector::{ + default_compaction_selector, CompactionSelector, ManualCompactionOption, SpaceReclaimCompactionSelector, }; +use crate::hummock::compaction::CompactStatus; use crate::hummock::error::Error; use crate::hummock::test_utils::*; use crate::hummock::{HummockManager, HummockManagerRef}; @@ -161,7 +162,7 @@ async fn test_hummock_compaction_task() { assert!(hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -189,10 +190,10 @@ async fn test_hummock_compaction_task() { .unwrap(); // Get a compaction task. - let mut compact_task = hummock_manager + let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -209,25 +210,24 @@ async fn test_hummock_compaction_task() { // Cancel the task and succeed. assert!(hummock_manager - .cancel_compact_task(&mut compact_task, TaskStatus::ManualCanceled) + .cancel_compact_task(compact_task.task_id, TaskStatus::ManualCanceled) .await .unwrap()); // Get a compaction task. - let mut compact_task = hummock_manager + let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() .unwrap(); assert_eq!(compact_task.get_task_id(), 3); // Finish the task and succeed. 
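For context on the selector changes above: `init_selectors()` now keys boxed `CompactionSelector` trait objects by `compact_task::TaskType`, and the auto-pick priority in `CompactionState` gains a `Tombstone` branch between `Ttl` and `Dynamic`. The sketch below illustrates that registry-plus-priority shape with toy types; `Selector`, `DynamicSelector`, `TombstoneSelector`, and `auto_pick_type` are hypothetical stand-ins, not the real trait or method names.

```rust
use std::collections::{HashMap, HashSet};

#[allow(dead_code)] // Not every variant is exercised in this sketch.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum TaskType {
    Dynamic,
    SpaceReclaim,
    Ttl,
    Tombstone,
    Emergency,
}

/// Toy stand-in for the boxed `CompactionSelector` trait object.
trait Selector {
    fn name(&self) -> &'static str;
}

struct DynamicSelector;
struct TombstoneSelector;

impl Selector for DynamicSelector {
    fn name(&self) -> &'static str {
        "dynamic"
    }
}

impl Selector for TombstoneSelector {
    fn name(&self) -> &'static str {
        "tombstone"
    }
}

/// One boxed selector per task type, mirroring the shape of `init_selectors()`.
fn init_selectors() -> HashMap<TaskType, Box<dyn Selector>> {
    let mut selectors: HashMap<TaskType, Box<dyn Selector>> = HashMap::new();
    selectors.insert(TaskType::Dynamic, Box::new(DynamicSelector));
    selectors.insert(TaskType::Tombstone, Box::new(TombstoneSelector));
    selectors
}

/// Priority chain for auto-picking a scheduled task type, with `Tombstone`
/// slotted between `Ttl` and `Dynamic` as in the `CompactionState` hunk above.
fn auto_pick_type(scheduled: &HashSet<TaskType>) -> Option<TaskType> {
    [
        TaskType::SpaceReclaim,
        TaskType::Ttl,
        TaskType::Tombstone,
        TaskType::Dynamic,
    ]
    .into_iter()
    .find(|t| scheduled.contains(t))
}

fn main() {
    let mut selectors = init_selectors();
    let scheduled = HashSet::from([TaskType::Dynamic, TaskType::Tombstone]);
    if let Some(task_type) = auto_pick_type(&scheduled) {
        let selector = selectors.get_mut(&task_type).expect("selector registered");
        println!("picked {:?} -> {}", task_type, selector.name());
    }
}
```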
- compact_task.set_task_status(TaskStatus::Success); assert!(hummock_manager - .report_compact_task(&mut compact_task, None) + .report_compact_task(compact_task.task_id, TaskStatus::Success, vec![], None) .await .unwrap()); } @@ -731,7 +731,7 @@ async fn test_print_compact_task() { let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -847,15 +847,6 @@ async fn test_trigger_manual_compaction() { assert!(result.is_ok()); } - let task_id: u64 = 4; - let compact_task = hummock_manager - .compaction_task_from_assignment_for_test(task_id) - .await - .unwrap() - .compact_task - .unwrap(); - assert_eq!(task_id, compact_task.task_id); - { let option = ManualCompactionOption::default(); // all sst pending , test no compaction avail @@ -887,7 +878,7 @@ async fn test_hummock_compaction_task_heartbeat() { assert!(hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -915,10 +906,10 @@ async fn test_hummock_compaction_task_heartbeat() { .unwrap(); // Get a compaction task. - let mut compact_task = hummock_manager + let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -946,17 +937,21 @@ async fn test_hummock_compaction_task_heartbeat() { } // Cancel the task immediately and succeed. - compact_task.set_task_status(TaskStatus::ExecuteFailed); assert!(hummock_manager - .report_compact_task(&mut compact_task, None) + .report_compact_task( + compact_task.task_id, + TaskStatus::ExecuteFailed, + vec![], + None + ) .await .unwrap()); // Get a compaction task. - let mut compact_task = hummock_manager + let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -965,14 +960,18 @@ async fn test_hummock_compaction_task_heartbeat() { assert_eq!(compact_task.get_task_id(), 3); // Cancel the task after heartbeat has triggered and fail. - compact_task.set_task_status(TaskStatus::ExecuteFailed); // do not send heartbeats to the task for 30s seconds (ttl = 1s, heartbeat check freq. 
= 1s) // default_interval = 30s tokio::time::sleep(std::time::Duration::from_secs(32)).await; assert!(!hummock_manager - .report_compact_task(&mut compact_task, None) + .report_compact_task( + compact_task.task_id, + TaskStatus::ExecuteFailed, + vec![], + None + ) .await .unwrap()); shutdown_tx.send(()).unwrap(); @@ -999,7 +998,7 @@ async fn test_hummock_compaction_task_heartbeat_removal_on_node_removal() { assert!(hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -1030,7 +1029,7 @@ async fn test_hummock_compaction_task_heartbeat_removal_on_node_removal() { let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -1193,15 +1192,15 @@ async fn test_version_stats() { .compactor_manager_ref_for_test() .add_compactor(worker_node.id); - let mut compact_task = hummock_manager + let compact_task = hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() .unwrap(); - compact_task.task_status = TaskStatus::Success as _; + // compact_task.task_status = TaskStatus::Success as _; let compact_table_stats_change = TableStatsMap::from([ ( 2, @@ -1222,7 +1221,9 @@ async fn test_version_stats() { ]); hummock_manager .report_compact_task( - &mut compact_task, + compact_task.task_id, + TaskStatus::Success, + vec![], Some(to_prost_table_stats_map(compact_table_stats_change)), ) .await @@ -1632,16 +1633,18 @@ async fn test_split_compaction_group_trivial_expired() { .register_table_ids(&[(102, 2)]) .await .unwrap(); - let mut task = hummock_manager - .get_compact_task(2, &mut default_level_selector()) + let task = hummock_manager + .get_compact_task(2, &mut default_compaction_selector()) .await .unwrap() .unwrap(); + hummock_manager .split_compaction_group(2, &[100]) .await .unwrap(); - let mut selector: Box = Box::::default(); + let mut selector: Box = + Box::::default(); let reclaim_task = hummock_manager .get_compact_task_impl(2, &mut selector) .await @@ -1666,30 +1669,32 @@ async fn test_split_compaction_group_trivial_expired() { vec![100] ); - let mut task2 = hummock_manager - .get_compact_task(new_group_id, &mut default_level_selector()) + let task2 = hummock_manager + .get_compact_task(new_group_id, &mut default_compaction_selector()) .await .unwrap() .unwrap(); - task2.sorted_output_ssts = vec![SstableInfo { - object_id: 12, - sst_id: 12, - key_range: None, - table_ids: vec![100], - min_epoch: 20, - max_epoch: 20, - ..Default::default() - }]; - // delete all reference of sst-10 - task2.task_status = TaskStatus::Success as i32; + let ret = hummock_manager - .report_compact_task(&mut task2, None) + .report_compact_task( + task2.task_id, + TaskStatus::Success, + vec![SstableInfo { + object_id: 12, + sst_id: 12, + key_range: None, + table_ids: vec![100], + min_epoch: 20, + max_epoch: 20, + ..Default::default() + }], + None, + ) .await .unwrap(); assert!(ret); - task.task_status = TaskStatus::Success as i32; let ret = hummock_manager - .report_compact_task(&mut task, None) + .report_compact_task(task.task_id, TaskStatus::Success, vec![], None) .await .unwrap(); // the task has been canceld @@ -1750,37 +1755,41 @@ async fn test_split_compaction_group_on_demand_bottom_levels() { .await .unwrap(); // Construct data via manual compaction - let 
mut compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; let base_level: usize = 6; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 1); assert_eq!(compaction_task.target_level, base_level as u32); - compaction_task.sorted_output_ssts = vec![ - SstableInfo { - object_id: 11, - sst_id: 11, - table_ids: vec![100, 101], - key_range: Some(KeyRange { - left: iterator_test_key_of_epoch(1, 1, 1), - right: iterator_test_key_of_epoch(1, 1, 1), - right_exclusive: false, - }), - ..Default::default() - }, - SstableInfo { - object_id: 12, - sst_id: 12, - table_ids: vec![100], - key_range: Some(KeyRange { - left: iterator_test_key_of_epoch(1, 2, 2), - right: iterator_test_key_of_epoch(1, 2, 2), - right_exclusive: false, - }), - ..Default::default() - }, - ]; - compaction_task.task_status = TaskStatus::Success.into(); + assert!(hummock_manager - .report_compact_task(&mut compaction_task, None) + .report_compact_task( + compaction_task.task_id, + TaskStatus::Success, + vec![ + SstableInfo { + object_id: 11, + sst_id: 11, + table_ids: vec![100, 101], + key_range: Some(KeyRange { + left: iterator_test_key_of_epoch(1, 1, 1), + right: iterator_test_key_of_epoch(1, 1, 1), + right_exclusive: false, + }), + ..Default::default() + }, + SstableInfo { + object_id: 12, + sst_id: 12, + table_ids: vec![100], + key_range: Some(KeyRange { + left: iterator_test_key_of_epoch(1, 2, 2), + right: iterator_test_key_of_epoch(1, 2, 2), + right_exclusive: false, + }), + ..Default::default() + }, + ], + None + ) .await .unwrap()); let current_version = hummock_manager.get_current_version().await; @@ -1911,7 +1920,7 @@ async fn test_compaction_task_expiration_due_to_split_group() { .await .unwrap(); - let mut compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 2); hummock_manager .split_compaction_group(2, &[100]) @@ -1919,9 +1928,9 @@ async fn test_compaction_task_expiration_due_to_split_group() { .unwrap(); let version_1 = hummock_manager.get_current_version().await; - compaction_task.task_status = TaskStatus::Success.into(); + // compaction_task.task_status = TaskStatus::Success.into(); assert!(!hummock_manager - .report_compact_task(&mut compaction_task, None) + .report_compact_task(compaction_task.task_id, TaskStatus::Success, vec![], None) .await .unwrap()); let version_2 = hummock_manager.get_current_version().await; @@ -1930,11 +1939,10 @@ async fn test_compaction_task_expiration_due_to_split_group() { "version should not change because compaction task has been cancelled" ); - let mut compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 2); - compaction_task.task_status = TaskStatus::Success.into(); hummock_manager - .report_compact_task(&mut compaction_task, None) + .report_compact_task(compaction_task.task_id, TaskStatus::Success, vec![], None) .await .unwrap(); @@ -1968,18 +1976,21 @@ async fn test_move_tables_between_compaction_group() { .await .unwrap(); // Construct data via manual compaction - let mut compaction_task = get_manual_compact_task(&hummock_manager, context_id).await; + let compaction_task = get_manual_compact_task(&hummock_manager, 
context_id).await; let base_level: usize = 6; assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 1); assert_eq!(compaction_task.target_level, base_level as u32); - compaction_task.sorted_output_ssts = vec![ - gen_sstable_info(11, 1, vec![100]), - gen_sstable_info(12, 2, vec![100, 101]), - gen_sstable_info(13, 3, vec![101, 102]), - ]; - compaction_task.task_status = TaskStatus::Success.into(); assert!(hummock_manager - .report_compact_task(&mut compaction_task, None) + .report_compact_task( + compaction_task.task_id, + TaskStatus::Success, + vec![ + gen_sstable_info(11, 1, vec![100]), + gen_sstable_info(12, 2, vec![100, 101]), + gen_sstable_info(13, 3, vec![101, 102]), + ], + None + ) .await .unwrap()); let sst_2 = gen_extend_sstable_info(14, 2, 1, vec![101, 102]); @@ -2021,9 +2032,10 @@ async fn test_move_tables_between_compaction_group() { let groups = info.keys().sorted().cloned().collect_vec(); assert_eq!(groups, vec![2, new_group_id]); - let mut selector: Box = Box::::default(); + let mut selector: Box = + Box::::default(); - let mut compaction_task = hummock_manager + let compaction_task = hummock_manager .get_compact_task(2, &mut selector) .await .unwrap() @@ -2031,11 +2043,14 @@ async fn test_move_tables_between_compaction_group() { assert_eq!(compaction_task.existing_table_ids, vec![101, 102]); assert_eq!(compaction_task.input_ssts[0].table_infos.len(), 1); assert_eq!(compaction_task.input_ssts[0].table_infos[0].object_id, 12); - compaction_task.sorted_output_ssts = vec![gen_sstable_info(20, 2, vec![101])]; - compaction_task.task_status = TaskStatus::Success.into(); let ret = hummock_manager - .report_compact_task(&mut compaction_task, None) + .report_compact_task( + compaction_task.task_id, + TaskStatus::Success, + vec![gen_sstable_info(20, 2, vec![101])], + None, + ) .await .unwrap(); assert!(ret); diff --git a/src/meta/src/hummock/manager/versioning.rs b/src/meta/src/hummock/manager/versioning.rs index 1e939513bbf3d..e1ed8a5d716c2 100644 --- a/src/meta/src/hummock/manager/versioning.rs +++ b/src/meta/src/hummock/manager/versioning.rs @@ -23,6 +23,7 @@ use risingwave_hummock_sdk::compaction_group::hummock_version_ext::{ HummockVersionExt, }; use risingwave_hummock_sdk::compaction_group::{StateTableId, StaticCompactionGroupId}; +use risingwave_hummock_sdk::table_stats::add_prost_table_stats_map; use risingwave_hummock_sdk::{ CompactionGroupId, HummockContextId, HummockSstableObjectId, HummockVersionId, FIRST_VERSION_ID, }; @@ -30,15 +31,18 @@ use risingwave_pb::common::WorkerNode; use risingwave_pb::hummock::write_limits::WriteLimit; use risingwave_pb::hummock::{ CompactionConfig, HummockPinnedSnapshot, HummockPinnedVersion, HummockVersion, - HummockVersionCheckpoint, HummockVersionDelta, HummockVersionStats, + HummockVersionCheckpoint, HummockVersionDelta, HummockVersionStats, SstableInfo, TableStats, }; use risingwave_pb::meta::subscribe_response::{Info, Operation}; +use crate::hummock::error::Result; use crate::hummock::manager::worker::{HummockManagerEvent, HummockManagerEventSender}; -use crate::hummock::manager::{read_lock, write_lock}; +use crate::hummock::manager::{commit_multi_var, read_lock, write_lock}; use crate::hummock::metrics_utils::{trigger_safepoint_stat, trigger_write_stop_stats}; use crate::hummock::model::CompactionGroup; use crate::hummock::HummockManager; +use crate::model::{ValTransaction, VarTransaction}; +use crate::storage::Transaction; /// `HummockVersionSafePoint` prevents hummock versions GE than it from being GC. 
/// It's used by meta node itself to temporarily pin versions. @@ -277,6 +281,16 @@ impl HummockManager { let guard = read_lock!(self, versioning).await; guard.branched_ssts.clone() } + + #[named] + pub async fn rebuild_table_stats(&self) -> Result<()> { + let mut versioning = write_lock!(self, versioning).await; + let new_stats = rebuild_table_stats(&versioning.current_version); + let mut version_stats = VarTransaction::new(&mut versioning.version_stats); + *version_stats = new_stats; + commit_multi_var!(self, None, Transaction::default(), version_stats)?; + Ok(()) + } } /// Calculates write limits for `target_groups`. @@ -338,6 +352,47 @@ pub(super) fn create_init_version(default_compaction_config: CompactionConfig) - init_version } +/// Rebuilds table stats from the given version. +/// Note that the result is approximate value. See `estimate_table_stats`. +fn rebuild_table_stats(version: &HummockVersion) -> HummockVersionStats { + let mut stats = HummockVersionStats { + hummock_version_id: version.id, + table_stats: Default::default(), + }; + for level in version.get_combined_levels() { + for sst in &level.table_infos { + let changes = estimate_table_stats(sst); + add_prost_table_stats_map(&mut stats.table_stats, &changes); + } + } + stats +} + +/// Estimates table stats change from the given file. +/// - The file stats is evenly distributed among multiple tables within the file. +/// - The total key size and total value size are estimated based on key range and file size. +/// - Branched files may lead to an overestimation. +fn estimate_table_stats(sst: &SstableInfo) -> HashMap { + let mut changes: HashMap = HashMap::default(); + let weighted_value = + |value: i64| -> i64 { (value as f64 / sst.table_ids.len() as f64).ceil() as i64 }; + let key_range = sst.key_range.as_ref().unwrap(); + let estimated_key_size: u64 = (key_range.left.len() + key_range.right.len()) as u64 / 2; + let mut estimated_total_key_size = estimated_key_size * sst.total_key_count; + if estimated_total_key_size > sst.uncompressed_file_size { + estimated_total_key_size = sst.uncompressed_file_size / 2; + tracing::warn!(sst.sst_id, "Calculated estimated_total_key_size {} > uncompressed_file_size {}. 
Use uncompressed_file_size/2 as estimated_total_key_size instead.", estimated_total_key_size, sst.uncompressed_file_size); + } + let estimated_total_value_size = sst.uncompressed_file_size - estimated_total_key_size; + for table_id in &sst.table_ids { + let e = changes.entry(*table_id).or_default(); + e.total_key_count += weighted_value(sst.total_key_count as i64); + e.total_key_size += weighted_value(estimated_total_key_size as i64); + e.total_value_size += weighted_value(estimated_total_value_size as i64); + } + changes +} + #[cfg(test)] mod tests { use std::collections::HashMap; @@ -346,10 +401,15 @@ mod tests { use risingwave_hummock_sdk::{CompactionGroupId, HummockVersionId}; use risingwave_pb::hummock::hummock_version::Levels; use risingwave_pb::hummock::write_limits::WriteLimit; - use risingwave_pb::hummock::{HummockPinnedVersion, HummockVersion, Level, OverlappingLevel}; + use risingwave_pb::hummock::{ + HummockPinnedVersion, HummockVersion, HummockVersionStats, KeyRange, Level, + OverlappingLevel, SstableInfo, + }; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; - use crate::hummock::manager::versioning::{calc_new_write_limits, Versioning}; + use crate::hummock::manager::versioning::{ + calc_new_write_limits, estimate_table_stats, rebuild_table_stats, Versioning, + }; use crate::hummock::model::CompactionGroup; #[test] @@ -470,4 +530,92 @@ mod tests { "too many L0 sub levels: 11 > 5" ); } + + #[test] + fn test_estimate_table_stats() { + let sst = SstableInfo { + key_range: Some(KeyRange { + left: vec![1; 10], + right: vec![1; 20], + ..Default::default() + }), + table_ids: vec![1, 2, 3], + total_key_count: 6000, + uncompressed_file_size: 6_000_000, + ..Default::default() + }; + let changes = estimate_table_stats(&sst); + assert_eq!(changes.len(), 3); + for stats in changes.values() { + assert_eq!(stats.total_key_count, 6000 / 3); + assert_eq!(stats.total_key_size, (10 + 20) / 2 * 6000 / 3); + assert_eq!( + stats.total_value_size, + (6_000_000 - (10 + 20) / 2 * 6000) / 3 + ); + } + + let mut version = HummockVersion { + id: 123, + levels: Default::default(), + max_committed_epoch: 0, + safe_epoch: 0, + }; + for cg in 1..3 { + version.levels.insert( + cg, + Levels { + levels: vec![Level { + table_infos: vec![sst.clone()], + ..Default::default() + }], + l0: Some(Default::default()), + ..Default::default() + }, + ); + } + let HummockVersionStats { + hummock_version_id, + table_stats, + } = rebuild_table_stats(&version); + assert_eq!(hummock_version_id, version.id); + assert_eq!(table_stats.len(), 3); + for (tid, stats) in table_stats { + assert_eq!( + stats.total_key_count, + changes.get(&tid).unwrap().total_key_count * 2 + ); + assert_eq!( + stats.total_key_size, + changes.get(&tid).unwrap().total_key_size * 2 + ); + assert_eq!( + stats.total_value_size, + changes.get(&tid).unwrap().total_value_size * 2 + ); + } + } + + #[test] + fn test_estimate_table_stats_large_key_range() { + let sst = SstableInfo { + key_range: Some(KeyRange { + left: vec![1; 1000], + right: vec![1; 2000], + ..Default::default() + }), + table_ids: vec![1, 2, 3], + total_key_count: 6000, + uncompressed_file_size: 60_000, + ..Default::default() + }; + let changes = estimate_table_stats(&sst); + assert_eq!(changes.len(), 3); + for t in &sst.table_ids { + let stats = changes.get(t).unwrap(); + assert_eq!(stats.total_key_count, 6000 / 3); + assert_eq!(stats.total_key_size, 60_000 / 2 / 3); + assert_eq!(stats.total_value_size, (60_000 - 60_000 / 2) / 3); + } + } } diff --git 
a/src/meta/src/hummock/manager/worker.rs b/src/meta/src/hummock/manager/worker.rs index 8a43ddc87247b..bc2103635b59f 100644 --- a/src/meta/src/hummock/manager/worker.rs +++ b/src/meta/src/hummock/manager/worker.rs @@ -34,7 +34,7 @@ pub enum HummockManagerEvent { } impl HummockManager { - pub(crate) async fn start_worker( + pub async fn start_worker( self: &HummockManagerRef, mut receiver: HummockManagerEventReceiver, ) -> JoinHandle<()> { @@ -98,58 +98,29 @@ impl HummockManager { let retry_strategy = ExponentialBackoff::from_millis(10) .max_delay(Duration::from_secs(60)) .map(jitter); - match notification { - LocalNotification::WorkerNodeDeleted(worker_node) => { - if worker_node.get_type().unwrap() == WorkerType::Compactor { - self.compactor_manager.remove_compactor(worker_node.id); - } - tokio_retry::RetryIf::spawn( - retry_strategy.clone(), - || async { - if let Err(err) = self.release_contexts(vec![worker_node.id]).await { - tracing::warn!( - "Failed to release hummock context {}. {}. Will retry.", - worker_node.id, - err - ); - return Err(err); - } - Ok(()) - }, - RetryableError::default(), - ) - .await - .expect("retry until success"); - tracing::info!("Released hummock context {}", worker_node.id); - sync_point!("AFTER_RELEASE_HUMMOCK_CONTEXTS_ASYNC"); + if let LocalNotification::WorkerNodeDeleted(worker_node) = notification { + if worker_node.get_type().unwrap() == WorkerType::Compactor { + self.compactor_manager.remove_compactor(worker_node.id); } - // TODO move `CompactionTaskNeedCancel` to `handle_hummock_manager_event` - // TODO extract retry boilerplate code - LocalNotification::CompactionTaskNeedCancel(compact_task) => { - let task_id = compact_task.task_id; - tokio_retry::RetryIf::spawn( - retry_strategy.clone(), - || async { - let mut compact_task_mut = compact_task.clone(); - if let Err(err) = self.cancel_compact_task_impl(&mut compact_task_mut).await - { - tracing::warn!( - "Failed to cancel compaction task {}. {}. Will retry.", - compact_task.task_id, - err - ); - return Err(err); - } - Ok(()) - }, - RetryableError::default(), - ) - .await - .expect("retry until success"); - tracing::info!("Cancelled compaction task {}", task_id); - sync_point!("AFTER_CANCEL_COMPACTION_TASK_ASYNC"); - } - _ => {} + tokio_retry::RetryIf::spawn( + retry_strategy.clone(), + || async { + if let Err(err) = self.release_contexts(vec![worker_node.id]).await { + tracing::warn!( + "Failed to release hummock context {}. {}. 
Will retry.", + worker_node.id, + err + ); + return Err(err); + } + Ok(()) + }, + RetryableError::default(), + ) + .await + .expect("retry until success"); + tracing::info!("Released hummock context {}", worker_node.id); + sync_point!("AFTER_RELEASE_HUMMOCK_CONTEXTS_ASYNC"); } } } diff --git a/src/meta/src/hummock/metrics_utils.rs b/src/meta/src/hummock/metrics_utils.rs index 8bcc1f1d2c1c6..6818b7f68570e 100644 --- a/src/meta/src/hummock/metrics_utils.rs +++ b/src/meta/src/hummock/metrics_utils.rs @@ -32,7 +32,8 @@ use risingwave_pb::hummock::{ HummockVersionCheckpoint, HummockVersionStats, LevelType, }; -use super::compaction::{get_compression_algorithm, DynamicLevelSelectorCore}; +use super::compaction::get_compression_algorithm; +use super::compaction::selector::DynamicLevelSelectorCore; use crate::hummock::compaction::CompactStatus; use crate::rpc::metrics::MetaMetrics; @@ -483,15 +484,15 @@ pub fn trigger_split_stat( .with_label_values(&[&group_label]) .set(member_table_id_len as _); - let branched_sst_count = branched_ssts + let branched_sst_count: usize = branched_ssts .values() - .map(|branched_map| branched_map.iter()) - .flat_map(|branched_map| { + .map(|branched_map| { branched_map - .filter(|(group_id, _sst_id)| **group_id == compaction_group_id) - .map(|(_, v)| v) + .keys() + .filter(|group_id| **group_id == compaction_group_id) + .count() }) - .sum::(); + .sum(); metrics .branched_sst_count diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index 915beee8e3a3f..678c701ca2891 100644 --- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -27,6 +27,8 @@ use risingwave_hummock_sdk::{ SstObjectIdRange, }; use risingwave_pb::common::{HostAddress, WorkerType}; +use risingwave_pb::hummock::compact_task::TaskStatus; +use risingwave_pb::hummock::subscribe_compaction_event_request::{Event, ReportTask}; use risingwave_pb::hummock::subscribe_compaction_event_response::Event as ResponseEvent; use risingwave_pb::hummock::{ compact_task, CompactTask, HummockSnapshot, HummockVersion, SubscribeCompactionEventRequest, @@ -38,8 +40,8 @@ use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::task::JoinHandle; use tokio_stream::wrappers::UnboundedReceiverStream; -use crate::hummock::compaction::{ - default_level_selector, LevelSelector, SpaceReclaimCompactionSelector, +use crate::hummock::compaction::selector::{ + default_compaction_selector, CompactionSelector, SpaceReclaimCompactionSelector, }; use crate::hummock::HummockManager; @@ -81,7 +83,7 @@ impl MockHummockMetaClient { self.hummock_manager .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap_or(None) @@ -224,7 +226,7 @@ impl HummockMetaClient for MockHummockMetaClient { .compactor_manager_ref_for_test() .add_compactor(context_id); - let (request_sender, _request_receiver) = + let (request_sender, mut request_receiver) = unbounded_channel::(); self.compact_context_id.store(context_id, Ordering::Release); @@ -232,6 +234,8 @@ impl HummockMetaClient for MockHummockMetaClient { let (task_tx, task_rx) = tokio::sync::mpsc::unbounded_channel(); let hummock_manager_compact = self.hummock_manager.clone(); + let mut join_handle_vec = vec![]; + let handle = tokio::spawn(async move { loop { let group_and_type = hummock_manager_compact @@ -244,8 +248,8 @@ impl HummockMetaClient for MockHummockMetaClient { let (group, 
task_type) = group_and_type.unwrap(); - let mut selector: Box = match task_type { - compact_task::TaskType::Dynamic => default_level_selector(), + let mut selector: Box = match task_type { + compact_task::TaskType::Dynamic => default_compaction_selector(), compact_task::TaskType::SpaceReclaim => { Box::::default() } @@ -270,11 +274,44 @@ impl HummockMetaClient for MockHummockMetaClient { } }); + join_handle_vec.push(handle); + + let hummock_manager_compact = self.hummock_manager.clone(); + let report_handle = tokio::spawn(async move { + tracing::info!("report_handle start"); + + loop { + if let Some(item) = request_receiver.recv().await { + if let Event::ReportTask(ReportTask { + task_id, + task_status, + sorted_output_ssts, + table_stats_change, + }) = item.event.unwrap() + { + if let Err(e) = hummock_manager_compact + .report_compact_task( + task_id, + TaskStatus::try_from(task_status).unwrap(), + sorted_output_ssts, + Some(table_stats_change), + ) + .await + { + tracing::error!("report compact_tack fail {e:?}"); + } + } + } + } + }); + + join_handle_vec.push(report_handle); + Ok(( request_sender, Box::pin(CompactionEventItemStream { inner: UnboundedReceiverStream::new(task_rx), - _handle: handle, + _handle: join_handle_vec, }), )) } @@ -288,7 +325,7 @@ impl MockHummockMetaClient { pub struct CompactionEventItemStream { inner: UnboundedReceiverStream, - _handle: JoinHandle<()>, + _handle: Vec>, } impl Drop for CompactionEventItemStream { diff --git a/src/meta/src/hummock/model/compaction_group_config.rs b/src/meta/src/hummock/model/compaction_group_config.rs index 8331abac62017..fa1bd1f88b3bd 100644 --- a/src/meta/src/hummock/model/compaction_group_config.rs +++ b/src/meta/src/hummock/model/compaction_group_config.rs @@ -23,8 +23,8 @@ use crate::model::{MetadataModel, MetadataModelResult}; #[derive(Debug, Clone, PartialEq)] pub struct CompactionGroup { - pub(crate) group_id: CompactionGroupId, - pub(crate) compaction_config: Arc, + pub group_id: CompactionGroupId, + pub compaction_config: Arc, } impl CompactionGroup { diff --git a/src/meta/src/hummock/model/mod.rs b/src/meta/src/hummock/model/mod.rs index a2e5d1748f351..66c12d90836b9 100644 --- a/src/meta/src/hummock/model/mod.rs +++ b/src/meta/src/hummock/model/mod.rs @@ -17,7 +17,6 @@ mod compaction_group_config; mod compaction_status; mod pinned_snapshot; mod pinned_version; -mod version; mod version_delta; mod version_stats; @@ -25,12 +24,10 @@ pub use compaction_group_config::CompactionGroup; pub use compaction_status::*; pub use pinned_snapshot::*; pub use pinned_version::*; -pub use version::*; pub use version_delta::*; /// Column family names for hummock. /// Deprecated `cf_name` should be reserved for backward compatibility. -const HUMMOCK_VERSION_CF_NAME: &str = "cf/hummock_0"; const HUMMOCK_VERSION_DELTA_CF_NAME: &str = "cf/hummock_1"; const HUMMOCK_PINNED_VERSION_CF_NAME: &str = "cf/hummock_2"; const HUMMOCK_PINNED_SNAPSHOT_CF_NAME: &str = "cf/hummock_3"; diff --git a/src/meta/src/hummock/model/version.rs b/src/meta/src/hummock/model/version.rs deleted file mode 100644 index d6a85ae745c64..0000000000000 --- a/src/meta/src/hummock/model/version.rs +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2023 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use prost::Message; -use risingwave_hummock_sdk::HummockVersionId; -use risingwave_pb::hummock::HummockVersion; - -use crate::hummock::model::HUMMOCK_VERSION_CF_NAME; -use crate::model::{MetadataModel, MetadataModelResult}; - -/// `HummockVersion` tracks `Sstables` in given version. -impl MetadataModel for HummockVersion { - type KeyType = HummockVersionId; - type PbType = HummockVersion; - - fn cf_name() -> String { - String::from(HUMMOCK_VERSION_CF_NAME) - } - - fn to_protobuf(&self) -> Self::PbType { - self.clone() - } - - fn to_protobuf_encoded_vec(&self) -> Vec { - self.encode_to_vec() - } - - fn from_protobuf(prost: Self::PbType) -> Self { - prost - } - - fn key(&self) -> MetadataModelResult { - Ok(0) - } -} diff --git a/src/meta/src/hummock/test_utils.rs b/src/meta/src/hummock/test_utils.rs index 632d56ca2c400..3d42442ae7c67 100644 --- a/src/meta/src/hummock/test_utils.rs +++ b/src/meta/src/hummock/test_utils.rs @@ -31,7 +31,7 @@ use risingwave_pb::meta::add_worker_node_request::Property; use crate::hummock::compaction::compaction_config::CompactionConfigBuilder; #[cfg(test)] -use crate::hummock::compaction::default_level_selector; +use crate::hummock::compaction::selector::default_compaction_selector; use crate::hummock::{CompactorManager, HummockManager, HummockManagerRef}; use crate::manager::{ ClusterManager, ClusterManagerRef, FragmentManager, MetaSrvEnv, META_NODE_ID, @@ -92,7 +92,7 @@ pub async fn add_test_tables( StaticCompactionGroupId::StateDefault.into(), ) .await; - let mut selector = default_level_selector(); + let mut selector = default_compaction_selector(); let mut compact_task = hummock_manager .get_compact_task(StaticCompactionGroupId::StateDefault.into(), &mut selector) .await @@ -114,10 +114,15 @@ pub async fn add_test_tables( .unwrap(); assert_eq!(compactor.context_id(), context_id); } - compact_task.sorted_output_ssts = test_tables_2.clone(); - compact_task.set_task_status(TaskStatus::Success); + let ret = hummock_manager - .report_compact_task(&mut compact_task, None) + .report_compact_task_for_test( + compact_task.task_id, + Some(compact_task), + TaskStatus::Success, + test_tables_2.clone(), + None, + ) .await .unwrap(); assert!(ret); diff --git a/src/meta/src/lib.rs b/src/meta/src/lib.rs index 92d3c571f57c5..afe66d27ad8e8 100644 --- a/src/meta/src/lib.rs +++ b/src/meta/src/lib.rs @@ -14,12 +14,10 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] -#![feature(binary_heap_drain_sorted)] #![feature(type_alias_impl_trait)] -#![feature(extract_if)] -#![feature(custom_test_frameworks)] #![feature(lint_reasons)] #![feature(map_try_insert)] +#![feature(extract_if)] #![feature(hash_extract_if)] #![feature(btree_extract_if)] #![feature(result_option_inspect)] @@ -29,322 +27,39 @@ #![feature(assert_matches)] #![feature(try_blocks)] #![cfg_attr(coverage, feature(no_coverage))] +#![feature(custom_test_frameworks)] #![test_runner(risingwave_test_runner::test_runner::run_failpont_tests)] #![feature(is_sorted)] #![feature(impl_trait_in_assoc_type)] #![feature(type_name_of_val)] +#![feature(async_fn_in_trait)] pub mod 
backup_restore; -mod barrier; +pub mod barrier; +pub mod controller; #[cfg(not(madsim))] // no need in simulation test -mod dashboard; -mod error; +pub mod dashboard; +pub mod error; pub mod hummock; pub mod manager; pub mod model; -mod rpc; -pub(crate) mod serving; +pub mod model_v2; +pub mod rpc; +pub mod serving; pub mod storage; -mod stream; -pub(crate) mod telemetry; -use std::time::Duration; +pub mod stream; +pub mod telemetry; -use clap::Parser; pub use error::{MetaError, MetaResult}; -use risingwave_common::config::OverrideConfig; -use risingwave_common::{GIT_SHA, RW_VERSION}; pub use rpc::{ElectionClient, ElectionMember, EtcdElectionClient}; use crate::manager::MetaOpts; -use crate::rpc::server::{rpc_serve, AddressInfo, MetaStoreBackend}; - -#[derive(Debug, Clone, Parser, OverrideConfig)] -#[command(version, about = "The central metadata management service")] -pub struct MetaNodeOpts { - #[clap(long, env = "RW_VPC_ID")] - vpc_id: Option, - - #[clap(long, env = "RW_VPC_SECURITY_GROUP_ID")] - security_group_id: Option, - - #[clap(long, env = "RW_LISTEN_ADDR", default_value = "127.0.0.1:5690")] - listen_addr: String, - - /// The address for contacting this instance of the service. - /// This would be synonymous with the service's "public address" - /// or "identifying address". - /// It will serve as a unique identifier in cluster - /// membership and leader election. Must be specified for etcd backend. - #[clap(long, env = "RW_ADVERTISE_ADDR")] - advertise_addr: String, - - #[clap(long, env = "RW_DASHBOARD_HOST")] - dashboard_host: Option, - - #[clap(long, env = "RW_PROMETHEUS_HOST")] - prometheus_host: Option, - - #[clap(long, env = "RW_ETCD_ENDPOINTS", default_value_t = String::from(""))] - etcd_endpoints: String, - - /// Enable authentication with etcd. By default disabled. - #[clap(long, env = "RW_ETCD_AUTH")] - etcd_auth: bool, - - /// Username of etcd, required when --etcd-auth is enabled. - #[clap(long, env = "RW_ETCD_USERNAME", default_value = "")] - etcd_username: String, - - /// Password of etcd, required when --etcd-auth is enabled. - #[clap(long, env = "RW_ETCD_PASSWORD", default_value = "")] - etcd_password: String, - - #[clap(long, env = "RW_DASHBOARD_UI_PATH")] - dashboard_ui_path: Option, - - /// For dashboard service to fetch cluster info. - #[clap(long, env = "RW_PROMETHEUS_ENDPOINT")] - prometheus_endpoint: Option, - - /// Endpoint of the connector node, there will be a sidecar connector node - /// colocated with Meta node in the cloud environment - #[clap(long, env = "RW_CONNECTOR_RPC_ENDPOINT")] - pub connector_rpc_endpoint: Option, - - /// Default tag for the endpoint created when creating a privatelink connection. - /// Will be appended to the tags specified in the `tags` field in with clause in `create - /// connection`. - #[clap(long, env = "RW_PRIVATELINK_ENDPOINT_DEFAULT_TAGS")] - pub privatelink_endpoint_default_tags: Option, - - /// The path of `risingwave.toml` configuration file. - /// - /// If empty, default configuration values will be used. - #[clap(long, env = "RW_CONFIG_PATH", default_value = "")] - pub config_path: String, - - #[clap(long, env = "RW_BACKEND", value_enum)] - #[override_opts(path = meta.backend)] - backend: Option, - - /// The interval of periodic barrier. - #[clap(long, env = "RW_BARRIER_INTERVAL_MS")] - #[override_opts(path = system.barrier_interval_ms)] - barrier_interval_ms: Option, - - /// Target size of the Sstable. 
- #[clap(long, env = "RW_SSTABLE_SIZE_MB")] - #[override_opts(path = system.sstable_size_mb)] - sstable_size_mb: Option, - - /// Size of each block in bytes in SST. - #[clap(long, env = "RW_BLOCK_SIZE_KB")] - #[override_opts(path = system.block_size_kb)] - block_size_kb: Option, - - /// False positive probability of bloom filter. - #[clap(long, env = "RW_BLOOM_FALSE_POSITIVE")] - #[override_opts(path = system.bloom_false_positive)] - bloom_false_positive: Option, - - /// State store url - #[clap(long, env = "RW_STATE_STORE")] - #[override_opts(path = system.state_store)] - state_store: Option, - - /// Remote directory for storing data and metadata objects. - #[clap(long, env = "RW_DATA_DIRECTORY")] - #[override_opts(path = system.data_directory)] - data_directory: Option, - - /// Whether config object storage bucket lifecycle to purge stale data. - #[clap(long, env = "RW_DO_NOT_CONFIG_BUCKET_LIFECYCLE")] - #[override_opts(path = meta.do_not_config_object_storage_lifecycle)] - do_not_config_object_storage_lifecycle: Option, - - /// Remote storage url for storing snapshots. - #[clap(long, env = "RW_BACKUP_STORAGE_URL")] - #[override_opts(path = system.backup_storage_url)] - backup_storage_url: Option, - - /// Remote directory for storing snapshots. - #[clap(long, env = "RW_BACKUP_STORAGE_DIRECTORY")] - #[override_opts(path = system.backup_storage_directory)] - backup_storage_directory: Option, - - #[clap(long, env = "RW_OBJECT_STORE_STREAMING_READ_TIMEOUT_MS", value_enum)] - #[override_opts(path = storage.object_store_streaming_read_timeout_ms)] - pub object_store_streaming_read_timeout_ms: Option, - #[clap(long, env = "RW_OBJECT_STORE_STREAMING_UPLOAD_TIMEOUT_MS", value_enum)] - #[override_opts(path = storage.object_store_streaming_upload_timeout_ms)] - pub object_store_streaming_upload_timeout_ms: Option, - #[clap(long, env = "RW_OBJECT_STORE_UPLOAD_TIMEOUT_MS", value_enum)] - #[override_opts(path = storage.object_store_upload_timeout_ms)] - pub object_store_upload_timeout_ms: Option, - #[clap(long, env = "RW_OBJECT_STORE_READ_TIMEOUT_MS", value_enum)] - #[override_opts(path = storage.object_store_read_timeout_ms)] - pub object_store_read_timeout_ms: Option, -} - -use std::future::Future; -use std::pin::Pin; - -use risingwave_common::config::{load_config, MetaBackend, RwConfig}; -use tracing::info; - -/// Start meta node -pub fn start(opts: MetaNodeOpts) -> Pin + Send>> { - // WARNING: don't change the function signature. Making it `async fn` will cause - // slow compile in release mode. 
- Box::pin(async move { - info!("Starting meta node"); - info!("> options: {:?}", opts); - let config = load_config(&opts.config_path, &opts); - info!("> config: {:?}", config); - info!("> version: {} ({})", RW_VERSION, GIT_SHA); - let listen_addr = opts.listen_addr.parse().unwrap(); - let dashboard_addr = opts.dashboard_host.map(|x| x.parse().unwrap()); - let prometheus_addr = opts.prometheus_host.map(|x| x.parse().unwrap()); - let backend = match config.meta.backend { - MetaBackend::Etcd => MetaStoreBackend::Etcd { - endpoints: opts - .etcd_endpoints - .split(',') - .map(|x| x.to_string()) - .collect(), - credentials: match opts.etcd_auth { - true => Some((opts.etcd_username, opts.etcd_password)), - false => None, - }, - }, - MetaBackend::Mem => MetaStoreBackend::Mem, - }; - - validate_config(&config); - - let max_heartbeat_interval = - Duration::from_secs(config.meta.max_heartbeat_interval_secs as u64); - let max_idle_ms = config.meta.dangerous_max_idle_secs.unwrap_or(0) * 1000; - let in_flight_barrier_nums = config.streaming.in_flight_barrier_nums; - let privatelink_endpoint_default_tags = - opts.privatelink_endpoint_default_tags.map(|tags| { - tags.split(',') - .map(|s| { - let key_val = s.split_once('=').unwrap(); - (key_val.0.to_string(), key_val.1.to_string()) - }) - .collect() - }); - - info!("Meta server listening at {}", listen_addr); - let add_info = AddressInfo { - advertise_addr: opts.advertise_addr, - listen_addr, - prometheus_addr, - dashboard_addr, - ui_path: opts.dashboard_ui_path, - }; - - let (mut join_handle, leader_lost_handle, shutdown_send) = rpc_serve( - add_info, - backend, - max_heartbeat_interval, - config.meta.meta_leader_lease_secs, - MetaOpts { - enable_recovery: !config.meta.disable_recovery, - in_flight_barrier_nums, - max_idle_ms, - compaction_deterministic_test: config.meta.enable_compaction_deterministic, - default_parallelism: config.meta.default_parallelism, - vacuum_interval_sec: config.meta.vacuum_interval_sec, - vacuum_spin_interval_ms: config.meta.vacuum_spin_interval_ms, - hummock_version_checkpoint_interval_sec: config - .meta - .hummock_version_checkpoint_interval_sec, - min_delta_log_num_for_hummock_version_checkpoint: config - .meta - .min_delta_log_num_for_hummock_version_checkpoint, - min_sst_retention_time_sec: config.meta.min_sst_retention_time_sec, - full_gc_interval_sec: config.meta.full_gc_interval_sec, - collect_gc_watermark_spin_interval_sec: config - .meta - .collect_gc_watermark_spin_interval_sec, - enable_committed_sst_sanity_check: config.meta.enable_committed_sst_sanity_check, - periodic_compaction_interval_sec: config.meta.periodic_compaction_interval_sec, - node_num_monitor_interval_sec: config.meta.node_num_monitor_interval_sec, - prometheus_endpoint: opts.prometheus_endpoint, - vpc_id: opts.vpc_id, - security_group_id: opts.security_group_id, - connector_rpc_endpoint: opts.connector_rpc_endpoint, - privatelink_endpoint_default_tags, - periodic_space_reclaim_compaction_interval_sec: config - .meta - .periodic_space_reclaim_compaction_interval_sec, - telemetry_enabled: config.server.telemetry_enabled, - periodic_ttl_reclaim_compaction_interval_sec: config - .meta - .periodic_ttl_reclaim_compaction_interval_sec, - periodic_tombstone_reclaim_compaction_interval_sec: config - .meta - .periodic_tombstone_reclaim_compaction_interval_sec, - periodic_split_compact_group_interval_sec: config - .meta - .periodic_split_compact_group_interval_sec, - split_group_size_limit: config.meta.split_group_size_limit, - min_table_split_size: 
config.meta.move_table_size_limit, - table_write_throughput_threshold: config.meta.table_write_throughput_threshold, - min_table_split_write_throughput: config.meta.min_table_split_write_throughput, - partition_vnode_count: config.meta.partition_vnode_count, - do_not_config_object_storage_lifecycle: config - .meta - .do_not_config_object_storage_lifecycle, - compaction_task_max_heartbeat_interval_secs: config - .meta - .compaction_task_max_heartbeat_interval_secs, - compaction_config: Some(config.meta.compaction_config), - }, - config.system.into_init_system_params(), - ) - .await - .unwrap(); - - match leader_lost_handle { - None => { - tokio::select! { - _ = tokio::signal::ctrl_c() => { - tracing::info!("receive ctrl+c"); - shutdown_send.send(()).unwrap(); - join_handle.await.unwrap() - } - res = &mut join_handle => res.unwrap(), - }; - } - Some(mut handle) => { - tokio::select! { - _ = &mut handle => { - tracing::info!("receive leader lost signal"); - // When we lose leadership, we will exit as soon as possible. - } - _ = tokio::signal::ctrl_c() => { - tracing::info!("receive ctrl+c"); - shutdown_send.send(()).unwrap(); - join_handle.await.unwrap(); - handle.abort(); - } - res = &mut join_handle => { - res.unwrap(); - handle.abort(); - }, - }; - } - }; - }) -} -fn validate_config(config: &RwConfig) { - if config.meta.meta_leader_lease_secs <= 2 { - let error_msg = "meta leader lease secs should be larger than 2"; - tracing::error!(error_msg); - panic!("{}", error_msg); - } +#[derive(Debug)] +pub enum MetaStoreBackend { + Etcd { + endpoints: Vec, + credentials: Option<(String, String)>, + }, + Mem, } diff --git a/src/meta/src/manager/catalog/database.rs b/src/meta/src/manager/catalog/database.rs index ad1928e0bdd50..62b5692ce82ba 100644 --- a/src/meta/src/manager/catalog/database.rs +++ b/src/meta/src/manager/catalog/database.rs @@ -16,10 +16,12 @@ use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, HashSet}; use itertools::Itertools; +use risingwave_common::bail; use risingwave_common::catalog::TableOption; use risingwave_pb::catalog::table::TableType; use risingwave_pb::catalog::{ - Connection, Database, Function, Index, PbStreamJobStatus, Schema, Sink, Source, Table, View, + Connection, CreateType, Database, Function, Index, PbStreamJobStatus, Schema, Sink, Source, + StreamJobStatus, Table, View, }; use super::{ConnectionId, DatabaseId, FunctionId, RelationId, SchemaId, SinkId, SourceId, ViewId}; @@ -194,12 +196,16 @@ impl DatabaseManager { } pub fn check_relation_name_duplicated(&self, relation_key: &RelationKey) -> MetaResult<()> { - if self.tables.values().any(|x| { + if let Some(t) = self.tables.values().find(|x| { x.database_id == relation_key.0 && x.schema_id == relation_key.1 && x.name.eq(&relation_key.2) }) { - Err(MetaError::catalog_duplicated("table", &relation_key.2)) + if t.stream_job_status == StreamJobStatus::Creating as i32 { + bail!("table is in creating procedure: {}", t.id); + } else { + Err(MetaError::catalog_duplicated("table", &relation_key.2)) + } } else if self.sources.values().any(|x| { x.database_id == relation_key.0 && x.schema_id == relation_key.1 @@ -258,9 +264,22 @@ impl DatabaseManager { self.databases.values().cloned().collect_vec() } - pub fn list_creating_tables(&self) -> Vec
<Table> { - self.in_progress_creating_tables + pub fn list_creating_background_mvs(&self) -> Vec<Table>
{ + self.tables + .values() + .filter(|&t| { + t.stream_job_status == PbStreamJobStatus::Creating as i32 + && t.table_type == TableType::MaterializedView as i32 + && t.create_type == CreateType::Background as i32 + }) + .cloned() + .collect_vec() + } + + pub fn list_persisted_creating_tables(&self) -> Vec<Table>
{ + self.tables .values() + .filter(|&t| t.stream_job_status == PbStreamJobStatus::Creating as i32) .cloned() .collect_vec() } @@ -389,10 +408,12 @@ impl DatabaseManager { .contains(&relation.clone()) } + /// For all types of DDL pub fn mark_creating(&mut self, relation: &RelationKey) { self.in_progress_creation_tracker.insert(relation.clone()); } + /// Only for streaming DDL pub fn mark_creating_streaming_job(&mut self, table_id: TableId, key: RelationKey) { self.in_progress_creation_streaming_job .insert(table_id, key); @@ -417,6 +438,11 @@ impl DatabaseManager { self.in_progress_creation_streaming_job.keys().cloned() } + pub fn clear_creating_stream_jobs(&mut self) { + self.in_progress_creation_tracker.clear(); + self.in_progress_creation_streaming_job.clear(); + } + pub fn mark_creating_tables(&mut self, tables: &[Table]) { self.in_progress_creating_tables .extend(tables.iter().map(|t| (t.id, t.clone()))); diff --git a/src/meta/src/manager/catalog/fragment.rs b/src/meta/src/manager/catalog/fragment.rs index 1a74608c848a1..8b26b8afa11d9 100644 --- a/src/meta/src/manager/catalog/fragment.rs +++ b/src/meta/src/manager/catalog/fragment.rs @@ -43,7 +43,7 @@ use crate::model::{ }; use crate::storage::Transaction; use crate::stream::{SplitAssignment, TableRevision}; -use crate::MetaResult; +use crate::{MetaError, MetaResult}; pub struct FragmentManagerCore { table_fragments: BTreeMap, @@ -163,6 +163,56 @@ impl FragmentManager { map.values().cloned().collect() } + /// The `table_ids` here should correspond to stream jobs. + /// We get their corresponding table fragment, and from there, + /// we get the actors that are in the table fragment. + pub async fn get_table_id_actor_mapping( + &self, + table_ids: &[TableId], + ) -> HashMap> { + let map = &self.core.read().await.table_fragments; + let mut table_map = HashMap::new(); + for table_id in table_ids { + if let Some(table_fragment) = map.get(table_id) { + let mut actors = vec![]; + for fragment in table_fragment.fragments.values() { + for actor in &fragment.actors { + actors.push(actor.actor_id) + } + } + table_map.insert(*table_id, actors); + } + } + table_map + } + + /// Gets the counts for each upstream relation that each stream job + /// indicated by `table_ids` depends on. + /// For example in the following query: + /// ```sql + /// CREATE MATERIALIZED VIEW m1 AS + /// SELECT * FROM t1 JOIN t2 ON t1.a = t2.a JOIN t3 ON t2.b = t3.b + /// ``` + /// + /// We have t1 occurring once, and t2 occurring once. + pub async fn get_upstream_relation_counts( + &self, + table_ids: &[TableId], + ) -> HashMap> { + let map = &self.core.read().await.table_fragments; + let mut upstream_relation_counts = HashMap::new(); + for table_id in table_ids { + if let Some(table_fragments) = map.get(table_id) { + let dependent_ids = table_fragments.dependent_table_ids(); + let r = upstream_relation_counts.insert(*table_id, dependent_ids); + assert!(r.is_none(), "Each table_id should be unique!") + } else { + upstream_relation_counts.insert(*table_id, HashMap::new()); + } + } + upstream_relation_counts + } + pub fn get_mv_id_to_internal_table_ids_mapping(&self) -> Option)>> { match self.core.try_read() { Ok(core) => Some( @@ -231,10 +281,11 @@ impl FragmentManager { table_id: &TableId, ) -> MetaResult { let map = &self.core.read().await.table_fragments; - Ok(map - .get(table_id) - .cloned() - .with_context(|| format!("table_fragment not exist: id={}", table_id))?) 
+ if let Some(table_fragment) = map.get(table_id) { + Ok(table_fragment.clone()) + } else { + Err(MetaError::fragment_not_found(table_id.table_id)) + } } pub async fn select_table_fragments_by_ids( @@ -244,15 +295,32 @@ impl FragmentManager { let map = &self.core.read().await.table_fragments; let mut table_fragments = Vec::with_capacity(table_ids.len()); for table_id in table_ids { - table_fragments.push( - map.get(table_id) - .cloned() - .with_context(|| format!("table_fragment not exist: id={}", table_id))?, - ); + table_fragments.push(if let Some(table_fragment) = map.get(table_id) { + table_fragment.clone() + } else { + return Err(MetaError::fragment_not_found(table_id.table_id)); + }); } Ok(table_fragments) } + pub async fn get_table_id_table_fragment_map( + &self, + table_ids: &[TableId], + ) -> MetaResult> { + let map = &self.core.read().await.table_fragments; + let mut id_to_fragment = HashMap::new(); + for table_id in table_ids { + let table_fragment = if let Some(table_fragment) = map.get(table_id) { + table_fragment.clone() + } else { + return Err(MetaError::fragment_not_found(table_id.table_id)); + }; + id_to_fragment.insert(*table_id, table_fragment); + } + Ok(id_to_fragment) + } + /// Start create a new `TableFragments` and insert it into meta store, currently the actors' /// state is `ActorState::Inactive` and the table fragments' state is `State::Initial`. pub async fn start_create_table_fragments( @@ -499,6 +567,8 @@ impl FragmentManager { /// Drop table fragments info and remove downstream actor infos in fragments from its dependent /// tables. + /// If table fragments already deleted, this should just be noop, + /// the delete function (`table_fragments.remove`) will not return an error. pub async fn drop_table_fragments_vec(&self, table_ids: &HashSet) -> MetaResult<()> { let mut guard = self.core.write().await; let current_revision = guard.table_revision; @@ -514,7 +584,7 @@ impl FragmentManager { table_fragments.remove(table_fragment.table_id()); let chain_actor_ids = table_fragment.chain_actor_ids(); let dependent_table_ids = table_fragment.dependent_table_ids(); - for dependent_table_id in dependent_table_ids { + for (dependent_table_id, _) in dependent_table_ids { if table_ids.contains(&dependent_table_id) { continue; } diff --git a/src/meta/src/manager/catalog/mod.rs b/src/meta/src/manager/catalog/mod.rs index 1c8f0c2f397c0..bcac32922d180 100644 --- a/src/meta/src/manager/catalog/mod.rs +++ b/src/meta/src/manager/catalog/mod.rs @@ -17,7 +17,7 @@ mod fragment; mod user; mod utils; -use std::collections::{HashMap, HashSet, VecDeque}; +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::iter; use std::option::Option::Some; use std::sync::Arc; @@ -32,9 +32,10 @@ use risingwave_common::catalog::{ DEFAULT_SUPER_USER_FOR_PG_ID, DEFAULT_SUPER_USER_ID, SYSTEM_SCHEMAS, }; use risingwave_common::{bail, ensure}; -use risingwave_pb::catalog::table::OptionalAssociatedSourceId; +use risingwave_pb::catalog::table::{OptionalAssociatedSourceId, TableType}; use risingwave_pb::catalog::{ - Connection, Database, Function, Index, PbStreamJobStatus, Schema, Sink, Source, Table, View, + Connection, CreateType, Database, Function, Index, PbStreamJobStatus, Schema, Sink, Source, + StreamJobStatus, Table, View, }; use risingwave_pb::meta::subscribe_response::{Info, Operation}; use risingwave_pb::user::grant_privilege::{ActionWithGrantOption, Object}; @@ -44,7 +45,7 @@ use tokio::sync::{Mutex, MutexGuard}; use user::*; use crate::manager::{IdCategory, MetaSrvEnv, 
NotificationVersion, StreamingJob}; -use crate::model::{BTreeMapTransaction, MetadataModel, ValTransaction}; +use crate::model::{BTreeMapTransaction, MetadataModel, TableFragments, ValTransaction}; use crate::storage::Transaction; use crate::{MetaError, MetaResult}; @@ -79,7 +80,7 @@ macro_rules! commit_meta_with_trx { async { // Apply the change in `ValTransaction` to trx $( - $val_txn.apply_to_txn(&mut $trx)?; + $val_txn.apply_to_txn(&mut $trx).await?; )* // Commit to meta store $manager.env.meta_store().txn($trx).await?; @@ -115,6 +116,7 @@ use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_common::util::epoch::Epoch; use risingwave_pb::meta::cancel_creating_jobs_request::CreatingJobInfo; use risingwave_pb::meta::relation::RelationInfo; +use risingwave_pb::meta::table_fragments::State; use risingwave_pb::meta::{Relation, RelationGroup}; pub(crate) use {commit_meta, commit_meta_with_trx}; @@ -629,9 +631,13 @@ impl CatalogManager { pub async fn start_create_stream_job_procedure( &self, stream_job: &StreamingJob, + internal_tables: Vec
, ) -> MetaResult<()> { match stream_job { - StreamingJob::MaterializedView(table) => self.start_create_table_procedure(table).await, + StreamingJob::MaterializedView(table) => { + self.start_create_table_procedure(table, internal_tables) + .await + } StreamingJob::Sink(sink) => self.start_create_sink_procedure(sink).await, StreamingJob::Index(index, index_table) => { self.start_create_index_procedure(index, index_table).await @@ -641,7 +647,7 @@ impl CatalogManager { self.start_create_table_procedure_with_source(source, table) .await } else { - self.start_create_table_procedure(table).await + self.start_create_table_procedure(table, vec![]).await } } } @@ -694,7 +700,11 @@ impl CatalogManager { } /// This is used for both `CREATE TABLE` and `CREATE MATERIALIZED VIEW`. - pub async fn start_create_table_procedure(&self, table: &Table) -> MetaResult<()> { + pub async fn start_create_table_procedure( + &self, + table: &Table, + internal_tables: Vec
, + ) -> MetaResult<()> { let core = &mut *self.core.lock().await; let database_core = &mut core.database; let user_core = &mut core.user; @@ -707,19 +717,151 @@ impl CatalogManager { #[cfg(not(test))] user_core.ensure_user_id(table.owner)?; let key = (table.database_id, table.schema_id, table.name.clone()); + database_core.check_relation_name_duplicated(&key)?; - if database_core.has_in_progress_creation(&key) { - bail!("table is in creating procedure"); - } else { - database_core.mark_creating(&key); - database_core.mark_creating_streaming_job(table.id, key); - for &dependent_relation_id in &table.dependent_relations { - database_core.increase_ref_count(dependent_relation_id); + let mut tables = BTreeMapTransaction::new(&mut database_core.tables); + assert!( + !tables.contains_key(&table.id), + "table must not already exist in meta" + ); + for table in internal_tables { + tables.insert(table.id, table); + } + tables.insert(table.id, table.clone()); + commit_meta!(self, tables)?; + + for &dependent_relation_id in &table.dependent_relations { + database_core.increase_ref_count(dependent_relation_id); + } + user_core.increase_ref(table.owner); + Ok(()) + } + + fn assert_table_creating(tables: &BTreeMap, table: &Table) { + if let Some(t) = tables.get(&table.id) + && let Ok(StreamJobStatus::Creating) = t.get_stream_job_status() + {} else { + panic!("Table must be in creating procedure: {table:#?}") + } + } + + pub async fn assert_tables_deleted(&self, table_ids: Vec) { + let core = self.core.lock().await; + let tables = &core.database.tables; + for id in table_ids { + assert_eq!(tables.get(&id), None,) + } + } + + /// We clean the following tables: + /// 1. Those which belonged to incomplete Foreground jobs. + /// 2. Those which did not persist their table fragments, we can't recover these. + /// 3. Those which were only initialized, but not actually running yet. + /// 4. From 2, since we don't have internal table ids from the fragments, + /// we can detect hanging table ids by just finding all internal ids + /// with: + /// 1. `stream_job_status` = CREATING + /// 2. Not belonging to a background stream job. + /// Clean up these hanging tables by the id. + pub async fn clean_dirty_tables(&self, fragment_manager: FragmentManagerRef) -> MetaResult<()> { + let creating_tables: Vec
= self.list_persisted_creating_tables().await; + tracing::debug!( + "creating_tables ids: {:#?}", + creating_tables.iter().map(|t| t.id).collect_vec() + ); + let mut reserved_internal_tables = HashSet::new(); + let mut tables_to_clean = vec![]; + let mut internal_tables_to_clean = vec![]; + for table in creating_tables { + tracing::trace!( + "checking table {} definition: {}, create_type: {:#?}, table_type: {:#?}", + table.id, + table.definition, + table.get_create_type().unwrap_or(CreateType::Foreground), + table.get_table_type().unwrap(), + ); + // 1. Incomplete Foreground jobs + if table.create_type == CreateType::Foreground as i32 + && table.table_type != TableType::Internal as i32 + // || table.create_type == CreateType::Unspecified as i32 + { + tracing::debug!("cleaning table_id for foreground: {:#?}", table.id); + tables_to_clean.push(table); + continue; } - user_core.increase_ref(table.owner); - Ok(()) + if table.table_type == TableType::Internal as i32 { + internal_tables_to_clean.push(table); + continue; + } + + // 2. No table fragments + assert_ne!(table.table_type, TableType::Internal as i32); + match fragment_manager + .select_table_fragments_by_table_id(&table.id.into()) + .await + { + Err(e) => { + if e.is_fragment_not_found() { + tracing::debug!("cleaning table_id for no fragments: {:#?}", table.id); + tables_to_clean.push(table); + continue; + } else { + return Err(e); + } + } + Ok(fragment) => { + let fragment: TableFragments = fragment; + // 3. For those in initial state (i.e. not running / created), + // we should purge them. + if fragment.state() == State::Initial { + tracing::debug!("cleaning table_id no initial state: {:#?}", table.id); + tables_to_clean.push(table); + continue; + } else { + assert_eq!(table.create_type, CreateType::Background as i32); + // 4. Get all the corresponding internal tables, the rest we can purge. + for id in fragment.internal_table_ids() { + reserved_internal_tables.insert(id); + } + continue; + } + } + } + } + for t in internal_tables_to_clean { + if !reserved_internal_tables.contains(&t.id) { + tracing::debug!( + "cleaning table_id for internal tables not reserved: {:#?}", + t.id + ); + tables_to_clean.push(t); + } + } + + let core = &mut *self.core.lock().await; + let database_core = &mut core.database; + let tables = &mut database_core.tables; + let mut tables = BTreeMapTransaction::new(tables); + for table in &tables_to_clean { + tracing::debug!("cleaning table_id: {}", table.id); + let table = tables.remove(table.id); + assert!(table.is_some()) + } + commit_meta!(self, tables)?; + + database_core.clear_creating_stream_jobs(); + let user_core = &mut core.user; + for table in &tables_to_clean { + // Recovered when init database manager. + for relation_id in &table.dependent_relations { + database_core.decrease_ref_count(*relation_id); + } + // Recovered when init user manager. + user_core.decrease_ref(table.owner); } + + Ok(()) } /// This is used for both `CREATE TABLE` and `CREATE MATERIALIZED VIEW`. 
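To make the cleanup policy above easier to follow, here is a hedged, self-contained sketch of the per-table decision that `clean_dirty_tables` makes. All type names are simplified stand-ins for the real catalog and fragment types, and the fragment lookup is abstracted into a pre-computed `FragmentState`.

```rust
// Simplified stand-ins for the catalog / fragment types used above.
#[derive(PartialEq)]
enum CreateType { Foreground, Background }
#[derive(PartialEq)]
enum TableType { Internal, Other }
enum FragmentState { Missing, Initial, Running }

/// What to do with a table whose `stream_job_status` is still CREATING.
#[derive(Debug, PartialEq)]
enum Cleanup {
    Drop,          // remove the catalog entry
    DeferInternal, // internal table: kept only if its owning job is reserved
    Reserve,       // background job that is already running: keep it
}

fn classify(create_type: CreateType, table_type: TableType, fragments: FragmentState) -> Cleanup {
    // 1. Incomplete foreground jobs are always dropped.
    if create_type == CreateType::Foreground && table_type != TableType::Internal {
        return Cleanup::Drop;
    }
    // Internal tables are resolved after their owning job is classified.
    if table_type == TableType::Internal {
        return Cleanup::DeferInternal;
    }
    match fragments {
        // 2. No persisted table fragments: unrecoverable, drop.
        FragmentState::Missing => Cleanup::Drop,
        // 3. Only initialized, never actually running: drop.
        FragmentState::Initial => Cleanup::Drop,
        // 4. A running background job: reserve it and, later, its internal tables.
        FragmentState::Running => Cleanup::Reserve,
    }
}

fn main() {
    assert_eq!(
        classify(CreateType::Foreground, TableType::Other, FragmentState::Initial),
        Cleanup::Drop
    );
    assert_eq!(
        classify(CreateType::Background, TableType::Other, FragmentState::Running),
        Cleanup::Reserve
    );
    assert_eq!(
        classify(CreateType::Foreground, TableType::Internal, FragmentState::Missing),
        Cleanup::DeferInternal
    );
}
```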
@@ -730,17 +872,11 @@ impl CatalogManager { ) -> MetaResult { let core = &mut *self.core.lock().await; let database_core = &mut core.database; - let mut tables = BTreeMapTransaction::new(&mut database_core.tables); - let key = (table.database_id, table.schema_id, table.name.clone()); - assert!( - !tables.contains_key(&table.id) - && database_core.in_progress_creation_tracker.contains(&key), - "table must be in creating procedure" - ); - database_core.in_progress_creation_tracker.remove(&key); - database_core - .in_progress_creation_streaming_job - .remove(&table.id); + let tables = &mut database_core.tables; + if cfg!(not(test)) { + Self::assert_table_creating(tables, &table); + } + let mut tables = BTreeMapTransaction::new(tables); table.stream_job_status = PbStreamJobStatus::Created.into(); tables.insert(table.id, table.clone()); @@ -769,23 +905,60 @@ impl CatalogManager { Ok(version) } - pub async fn cancel_create_table_procedure(&self, table: &Table) { - let core = &mut *self.core.lock().await; - let database_core = &mut core.database; - let user_core = &mut core.user; - let key = (table.database_id, table.schema_id, table.name.clone()); - assert!( - !database_core.tables.contains_key(&table.id) - && database_core.has_in_progress_creation(&key), - "table must be in creating procedure" - ); + /// Used to cleanup states in stream manager. + /// It is required because failure may not necessarily happen in barrier, + /// e.g. when cordon nodes. + /// and we still need some way to cleanup the state. + pub async fn cancel_create_table_procedure( + &self, + table_id: TableId, + internal_table_ids: Vec, + ) -> MetaResult<()> { + let table = { + let core = &mut self.core.lock().await; + let database_core = &mut core.database; + let tables = &mut database_core.tables; + let Some(table) = tables.get(&table_id).cloned() else { + bail!( + "table_id {} missing when attempting to cancel job", + table_id + ) + }; + table + }; - database_core.unmark_creating(&key); - database_core.unmark_creating_streaming_job(table.id); - for &dependent_relation_id in &table.dependent_relations { - database_core.decrease_ref_count(dependent_relation_id); + tracing::trace!("cleanup tables for {}", table.id); + { + let core = &mut self.core.lock().await; + let database_core = &mut core.database; + + let mut table_ids = vec![table.id]; + table_ids.extend(internal_table_ids); + + let tables = &mut database_core.tables; + let mut tables = BTreeMapTransaction::new(tables); + for table_id in table_ids { + tables.remove(table_id); + } + commit_meta!(self, tables)?; + } + + { + let core = &mut self.core.lock().await; + { + let user_core = &mut core.user; + user_core.decrease_ref(table.owner); + } + + { + let database_core = &mut core.database; + for &dependent_relation_id in &table.dependent_relations { + database_core.decrease_ref_count(dependent_relation_id); + } + } } - user_core.decrease_ref(table.owner); + + Ok(()) } /// return id of streaming jobs in the database which need to be dropped by stream manager. @@ -975,7 +1148,7 @@ impl CatalogManager { match drop_mode { DropMode::Restrict => { return Err(MetaError::permission_denied(format!( - "Fail to delete table `{}` because {} other relation(s) depend on it", + "Fail to delete index table `{}` because {} other relation(s) depend on it", index_table.name, ref_count ))); } @@ -1591,6 +1764,7 @@ impl CatalogManager { // 2. rename index name. 
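The rewritten `start_create_table_procedure`, `finish_create_table_procedure` and `cancel_create_table_procedure` above all assume that a stream job's table (and its internal tables) are persisted in the catalog with `Creating` status up front. The following in-memory sketch, with plain `u32` ids standing in for the real catalog types, shows that lifecycle and why cancellation can be a no-op for ids that are already gone.

```rust
use std::collections::BTreeMap;

#[derive(Clone, Copy, Debug, PartialEq)]
enum StreamJobStatus { Creating, Created }

#[derive(Default)]
struct Catalog {
    tables: BTreeMap<u32, StreamJobStatus>,
}

impl Catalog {
    /// Persist the job's table and internal tables with `Creating` status.
    fn start_create(&mut self, id: u32, internal_ids: &[u32]) {
        assert!(!self.tables.contains_key(&id), "table must not already exist");
        self.tables.insert(id, StreamJobStatus::Creating);
        for &internal in internal_ids {
            self.tables.insert(internal, StreamJobStatus::Creating);
        }
    }

    /// Flip the status to `Created` once the streaming job is up.
    fn finish_create(&mut self, id: u32) {
        let status = self
            .tables
            .get_mut(&id)
            .expect("table must be in creating procedure");
        assert_eq!(*status, StreamJobStatus::Creating);
        *status = StreamJobStatus::Created;
    }

    /// Remove the table and its internal tables; removing an id that is already
    /// gone is a no-op, matching the idempotent cleanup semantics above.
    fn cancel_create(&mut self, id: u32, internal_ids: &[u32]) {
        self.tables.remove(&id);
        for internal in internal_ids {
            self.tables.remove(internal);
        }
    }
}

fn main() {
    let mut catalog = Catalog::default();
    catalog.start_create(1, &[2, 3]);
    catalog.cancel_create(1, &[2, 3]);
    assert!(catalog.tables.is_empty());

    catalog.start_create(4, &[]);
    catalog.finish_create(4);
    assert_eq!(catalog.tables[&4], StreamJobStatus::Created);
}
```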
index.name = index_name.to_string(); index_table.name = index_name.to_string(); + index_table.definition = alter_relation_rename(&index_table.definition, index_name); let mut indexes = BTreeMapTransaction::new(&mut database_core.indexes); let mut tables = BTreeMapTransaction::new(&mut database_core.tables); indexes.insert(index_id, index.clone()); @@ -2204,6 +2378,24 @@ impl CatalogManager { self.core.lock().await.database.list_tables() } + /// Lists table catalogs for mviews, without their internal tables. + pub async fn list_creating_background_mvs(&self) -> Vec
{ + self.core + .lock() + .await + .database + .list_creating_background_mvs() + } + + /// Lists table catalogs for all tables with `stream_job_status=CREATING`. + pub async fn list_persisted_creating_tables(&self) -> Vec
{ + self.core + .lock() + .await + .database + .list_persisted_creating_tables() + } + pub async fn get_all_table_options(&self) -> HashMap { self.core.lock().await.database.get_all_table_options() } diff --git a/src/meta/src/manager/catalog/utils.rs b/src/meta/src/manager/catalog/utils.rs index 7e26e32ee62eb..ea579867fc320 100644 --- a/src/meta/src/manager/catalog/utils.rs +++ b/src/meta/src/manager/catalog/utils.rs @@ -401,7 +401,7 @@ impl ReplaceTableExprRewriter { #[cfg(test)] mod tests { - use crate::manager::catalog::utils::{alter_relation_rename, alter_relation_rename_refs}; + use super::*; #[test] fn test_alter_table_rename() { diff --git a/src/meta/src/manager/cluster.rs b/src/meta/src/manager/cluster.rs index da5b4fce20711..a31979c8871b0 100644 --- a/src/meta/src/manager/cluster.rs +++ b/src/meta/src/manager/cluster.rs @@ -261,7 +261,7 @@ impl ClusterManager { .unwrap() .is_unschedulable = target; - var_txn.apply_to_txn(&mut txn)?; + var_txn.apply_to_txn(&mut txn).await?; var_txns.push(var_txn); } } @@ -314,7 +314,7 @@ impl ClusterManager { worker_id: WorkerId, info: Vec, ) -> MetaResult<()> { - tracing::trace!(target: "events::meta::server_heartbeat", worker_id = worker_id, "receive heartbeat"); + tracing::debug!(target: "events::meta::server_heartbeat", worker_id, "receive heartbeat"); let mut core = self.core.write().await; for worker in core.workers.values_mut() { if worker.worker_id() == worker_id { @@ -557,7 +557,7 @@ impl ClusterManagerCore { worker_id ); - var_txn.apply_to_txn(&mut txn)?; + var_txn.apply_to_txn(&mut txn).await?; var_txns.push(var_txn); } } @@ -728,11 +728,8 @@ mod tests { async fn test_cluster_manager() -> MetaResult<()> { let env = MetaSrvEnv::for_test().await; - let cluster_manager = Arc::new( - ClusterManager::new(env.clone(), Duration::new(0, 0)) - .await - .unwrap(), - ); + let cluster_manager = + Arc::new(ClusterManager::new(env, Duration::new(0, 0)).await.unwrap()); let mut worker_nodes = Vec::new(); let worker_count = 5usize; @@ -839,11 +836,8 @@ mod tests { async fn test_cluster_manager_schedulability() -> MetaResult<()> { let env = MetaSrvEnv::for_test().await; - let cluster_manager = Arc::new( - ClusterManager::new(env.clone(), Duration::new(0, 0)) - .await - .unwrap(), - ); + let cluster_manager = + Arc::new(ClusterManager::new(env, Duration::new(0, 0)).await.unwrap()); let worker_node = cluster_manager .add_worker_node( WorkerType::ComputeNode, diff --git a/src/meta/src/manager/env.rs b/src/meta/src/manager/env.rs index 40f81dbfd7a64..16a4bcb248b23 100644 --- a/src/meta/src/manager/env.rs +++ b/src/meta/src/manager/env.rs @@ -18,13 +18,17 @@ use std::sync::Arc; use risingwave_common::config::{CompactionConfig, DefaultParallelism}; use risingwave_pb::meta::SystemParams; use risingwave_rpc_client::{ConnectorClient, StreamClientPool, StreamClientPoolRef}; +use sea_orm::EntityTrait; use super::{SystemParamsManager, SystemParamsManagerRef}; +use crate::controller::system_param::{SystemParamsController, SystemParamsControllerRef}; +use crate::controller::SqlMetaStore; use crate::manager::{ IdGeneratorManager, IdGeneratorManagerRef, IdleManager, IdleManagerRef, NotificationManager, NotificationManagerRef, }; use crate::model::ClusterId; +use crate::model_v2::prelude::Cluster; use crate::storage::MetaStoreRef; #[cfg(any(test, feature = "test"))] use crate::storage::{MemStore, MetaStoreBoxExt}; @@ -40,6 +44,9 @@ pub struct MetaSrvEnv { /// meta store. meta_store: MetaStoreRef, + /// sql meta store. + meta_store_sql: Option, + /// notification manager. 
notification_manager: NotificationManagerRef, @@ -52,6 +59,9 @@ pub struct MetaSrvEnv { /// system param manager. system_params_manager: SystemParamsManagerRef, + /// system param controller. + system_params_controller: Option, + /// Unique identifier of the cluster. cluster_id: ClusterId, @@ -205,13 +215,14 @@ impl MetaSrvEnv { opts: MetaOpts, init_system_params: SystemParams, meta_store: MetaStoreRef, + meta_store_sql: Option, ) -> MetaResult { // change to sync after refactor `IdGeneratorManager::new` sync. let id_gen_manager = Arc::new(IdGeneratorManager::new(meta_store.clone()).await); let stream_client_pool = Arc::new(StreamClientPool::default()); let notification_manager = Arc::new(NotificationManager::new(meta_store.clone()).await); let idle_manager = Arc::new(IdleManager::new(opts.max_idle_ms)); - let (cluster_id, cluster_first_launch) = + let (mut cluster_id, cluster_first_launch) = if let Some(id) = ClusterId::from_meta_store(&meta_store).await? { (id, false) } else { @@ -221,21 +232,43 @@ impl MetaSrvEnv { SystemParamsManager::new( meta_store.clone(), notification_manager.clone(), - init_system_params, + init_system_params.clone(), cluster_first_launch, ) .await?, ); + // TODO: remove `cluster_first_launch` and check equality of cluster id stored in hummock to + // make sure the data dir of hummock is not used by another cluster. + let system_params_controller = match &meta_store_sql { + Some(store) => { + cluster_id = Cluster::find() + .one(&store.conn) + .await? + .map(|c| c.cluster_id.to_string().into()) + .unwrap(); + Some(Arc::new( + SystemParamsController::new( + store.clone(), + notification_manager.clone(), + init_system_params, + ) + .await?, + )) + } + None => None, + }; let connector_client = ConnectorClient::try_new(opts.connector_rpc_endpoint.as_ref()).await; Ok(Self { id_gen_manager, meta_store, + meta_store_sql, notification_manager, stream_client_pool, idle_manager, system_params_manager, + system_params_controller, cluster_id, cluster_first_launch, connector_client, @@ -251,6 +284,10 @@ impl MetaSrvEnv { &self.meta_store } + pub fn sql_meta_store(&self) -> Option { + self.meta_store_sql.clone() + } + pub fn id_gen_manager_ref(&self) -> IdGeneratorManagerRef { self.id_gen_manager.clone() } @@ -283,6 +320,14 @@ impl MetaSrvEnv { self.system_params_manager.deref() } + pub fn system_params_controller_ref(&self) -> Option { + self.system_params_controller.clone() + } + + pub fn system_params_controller(&self) -> Option<&SystemParamsControllerRef> { + self.system_params_controller.as_ref() + } + pub fn stream_client_pool_ref(&self) -> StreamClientPoolRef { self.stream_client_pool.clone() } @@ -314,6 +359,11 @@ impl MetaSrvEnv { pub async fn for_test_opts(opts: Arc) -> Self { // change to sync after refactor `IdGeneratorManager::new` sync. 
let meta_store = MemStore::default().into_ref(); + #[cfg(madsim)] + let meta_store_sql: Option = None; + #[cfg(not(madsim))] + let meta_store_sql = Some(SqlMetaStore::for_test().await); + let id_gen_manager = Arc::new(IdGeneratorManager::new(meta_store.clone()).await); let notification_manager = Arc::new(NotificationManager::new(meta_store.clone()).await); let stream_client_pool = Arc::new(StreamClientPool::default()); @@ -329,14 +379,29 @@ impl MetaSrvEnv { .await .unwrap(), ); + let system_params_controller = if let Some(store) = &meta_store_sql { + Some(Arc::new( + SystemParamsController::new( + store.clone(), + notification_manager.clone(), + risingwave_common::system_param::system_params_for_test(), + ) + .await + .unwrap(), + )) + } else { + None + }; Self { id_gen_manager, meta_store, + meta_store_sql, notification_manager, stream_client_pool, idle_manager, system_params_manager, + system_params_controller, cluster_id, cluster_first_launch, connector_client: None, diff --git a/src/meta/src/manager/mod.rs b/src/meta/src/manager/mod.rs index 6f787dba23d09..35642ed0ec143 100644 --- a/src/meta/src/manager/mod.rs +++ b/src/meta/src/manager/mod.rs @@ -18,18 +18,17 @@ mod env; mod id; mod idle; mod notification; -pub(crate) mod sink_coordination; +pub mod sink_coordination; mod streaming_job; mod system_param; -pub(crate) use catalog::*; -pub use cluster::WorkerKey; -pub(crate) use cluster::*; -pub use env::MetaSrvEnv; -pub(crate) use env::*; -pub(crate) use id::*; -pub(crate) use idle::*; -pub(crate) use notification::*; -pub use notification::{LocalNotification, MessageStatus, NotificationManagerRef}; -pub(crate) use streaming_job::*; -pub(crate) use system_param::*; +pub use catalog::*; +pub use cluster::{WorkerKey, *}; +pub use env::{MetaSrvEnv, *}; +pub use id::*; +pub use idle::*; +pub use notification::{LocalNotification, MessageStatus, NotificationManagerRef, *}; +pub use streaming_job::*; +pub use system_param::*; + +pub use super::model_v2::prelude; diff --git a/src/meta/src/manager/notification.rs b/src/meta/src/manager/notification.rs index 2e6272a79c68f..96c3c17ba59cd 100644 --- a/src/meta/src/manager/notification.rs +++ b/src/meta/src/manager/notification.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use risingwave_common::system_param::reader::SystemParamsReader; use risingwave_pb::common::{WorkerNode, WorkerType}; -use risingwave_pb::hummock::CompactTask; use risingwave_pb::meta::relation::RelationInfo; use risingwave_pb::meta::subscribe_response::{Info, Operation}; use risingwave_pb::meta::{ @@ -43,7 +42,6 @@ pub const IGNORED_NOTIFICATION_VERSION: u64 = 0; pub enum LocalNotification { WorkerNodeDeleted(WorkerNode), WorkerNodeActivated(WorkerNode), - CompactionTaskNeedCancel(CompactTask), SystemParamsChange(SystemParamsReader), FragmentMappingsUpsert(Vec), FragmentMappingsDelete(Vec), diff --git a/src/meta/src/manager/sink_coordination/coordinator_worker.rs b/src/meta/src/manager/sink_coordination/coordinator_worker.rs index 1a7c42c108661..79f4f5b753aa2 100644 --- a/src/meta/src/manager/sink_coordination/coordinator_worker.rs +++ b/src/meta/src/manager/sink_coordination/coordinator_worker.rs @@ -30,7 +30,6 @@ use risingwave_pb::connector_service::coordinate_response::{ use risingwave_pb::connector_service::{ coordinate_request, coordinate_response, CoordinateRequest, CoordinateResponse, SinkMetadata, }; -use risingwave_rpc_client::ConnectorClient; use tokio::sync::mpsc::UnboundedReceiver; use tonic::Status; use tracing::{error, warn}; @@ -47,7 +46,7 @@ macro_rules! 
send_await_with_err_check { }; } -pub(crate) struct CoordinatorWorker { +pub struct CoordinatorWorker { param: SinkParam, request_streams: Vec, response_senders: Vec, @@ -55,10 +54,9 @@ pub(crate) struct CoordinatorWorker { } impl CoordinatorWorker { - pub(crate) async fn run( + pub async fn run( first_writer_request: NewSinkWriterRequest, request_rx: UnboundedReceiver, - connector_client: Option, ) { let sink = match build_sink(first_writer_request.param.clone()) { Ok(sink) => sink, @@ -75,7 +73,7 @@ impl CoordinatorWorker { } }; dispatch_sink!(sink, sink, { - let coordinator = match sink.new_coordinator(connector_client).await { + let coordinator = match sink.new_coordinator().await { Ok(coordinator) => coordinator, Err(e) => { error!( @@ -93,7 +91,7 @@ impl CoordinatorWorker { }); } - pub(crate) async fn execute_coordinator( + pub async fn execute_coordinator( first_writer_request: NewSinkWriterRequest, request_rx: UnboundedReceiver, coordinator: impl SinkCommitCoordinator, @@ -168,7 +166,7 @@ impl CoordinatorWorker { registered_vnode.insert(vnode); } - loop { + while remaining_count > 0 { let new_writer_request = self.next_new_writer().await?; if self.param != new_writer_request.param { // TODO: may return error. @@ -191,10 +189,6 @@ impl CoordinatorWorker { registered_vnode.insert(vnode); remaining_count -= 1; } - - if remaining_count == 0 { - break; - } } self.send_to_all_sink_writers(|| { diff --git a/src/meta/src/manager/sink_coordination/manager.rs b/src/meta/src/manager/sink_coordination/manager.rs index 73d96895b608e..720a698fa8e72 100644 --- a/src/meta/src/manager/sink_coordination/manager.rs +++ b/src/meta/src/manager/sink_coordination/manager.rs @@ -25,7 +25,6 @@ use risingwave_connector::sink::catalog::SinkId; use risingwave_connector::sink::SinkParam; use risingwave_pb::connector_service::coordinate_request::Msg; use risingwave_pb::connector_service::{coordinate_request, CoordinateRequest, CoordinateResponse}; -use risingwave_rpc_client::ConnectorClient; use tokio::sync::mpsc; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::oneshot::{channel, Receiver, Sender}; @@ -70,28 +69,21 @@ pub struct SinkCoordinatorManager { } impl SinkCoordinatorManager { - pub(crate) fn start_worker( - connector_client: Option, - ) -> (Self, (JoinHandle<()>, Sender<()>)) { - Self::start_worker_with_spawn_worker( - connector_client, - |writer_request, manager_request_stream, connector_client| { - tokio::spawn(CoordinatorWorker::run( - writer_request, - manager_request_stream, - connector_client, - )) - }, - ) + pub fn start_worker() -> (Self, (JoinHandle<()>, Sender<()>)) { + Self::start_worker_with_spawn_worker(|writer_request, manager_request_stream| { + tokio::spawn(CoordinatorWorker::run( + writer_request, + manager_request_stream, + )) + }) } fn start_worker_with_spawn_worker( - connector_client: Option, spawn_coordinator_worker: impl SpawnCoordinatorFn, ) -> (Self, (JoinHandle<()>, Sender<()>)) { let (request_tx, request_rx) = mpsc::channel(BOUNDED_CHANNEL_SIZE); let (shutdown_tx, shutdown_rx) = channel(); - let worker = ManagerWorker::new(request_rx, shutdown_rx, connector_client); + let worker = ManagerWorker::new(request_rx, shutdown_rx); let join_handle = tokio::spawn(worker.execute(spawn_coordinator_worker)); ( SinkCoordinatorManager { request_tx }, @@ -99,7 +91,7 @@ impl SinkCoordinatorManager { ) } - pub(crate) async fn handle_new_request( + pub async fn handle_new_request( &self, mut request_stream: SinkWriterRequestStream, ) -> 
Result>, Status> { @@ -151,11 +143,11 @@ impl SinkCoordinatorManager { info!("successfully stop coordinator: {:?}", sink_id); } - pub(crate) async fn reset(&self) { + pub async fn reset(&self) { self.stop_coordinator(None).await; } - pub(crate) async fn stop_sink_coordinator(&self, sink_id: SinkId) { + pub async fn stop_sink_coordinator(&self, sink_id: SinkId) { self.stop_coordinator(Some(sink_id)).await; } } @@ -168,7 +160,6 @@ struct CoordinatorWorkerHandle { } struct ManagerWorker { - connector_client: Option, request_rx: mpsc::Receiver, // Make it option so that it can be polled with &mut SinkManagerWorker shutdown_rx: Option>, @@ -186,26 +177,17 @@ enum ManagerEvent { }, } -trait SpawnCoordinatorFn = FnMut( - NewSinkWriterRequest, - UnboundedReceiver, - Option, - ) -> JoinHandle<()> +trait SpawnCoordinatorFn = FnMut(NewSinkWriterRequest, UnboundedReceiver) -> JoinHandle<()> + Send + 'static; impl ManagerWorker { - fn new( - request_rx: mpsc::Receiver, - shutdown_rx: Receiver<()>, - connector_client: Option, - ) -> Self { + fn new(request_rx: mpsc::Receiver, shutdown_rx: Receiver<()>) -> Self { ManagerWorker { request_rx, shutdown_rx: Some(shutdown_rx), running_coordinator_worker_join_handles: Default::default(), running_coordinator_worker: Default::default(), - connector_client, } } @@ -346,8 +328,7 @@ impl ManagerWorker { } Entry::Vacant(entry) => { let (request_tx, request_rx) = unbounded_channel(); - let connector_client = self.connector_client.clone(); - let join_handle = spawn_coordinator_worker(request, request_rx, connector_client); + let join_handle = spawn_coordinator_worker(request, request_rx); self.running_coordinator_worker_join_handles.push( join_handle .map(move |join_result| (sink_id, join_result)) @@ -422,6 +403,7 @@ mod tests { columns: vec![], downstream_pk: vec![], sink_type: SinkType::AppendOnly, + format_desc: None, db_name: "test".into(), sink_from_name: "test".into(), }; @@ -448,10 +430,10 @@ mod tests { ]; let (manager, (_join_handle, _stop_tx)) = - SinkCoordinatorManager::start_worker_with_spawn_worker(None, { + SinkCoordinatorManager::start_worker_with_spawn_worker({ let param = param.clone(); let metadata = metadata.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx, _| { + move |first_request: NewSinkWriterRequest, new_writer_rx| { let param = param.clone(); let metadata = metadata.clone(); tokio::spawn(async move { @@ -586,6 +568,129 @@ mod tests { .await; } + #[tokio::test] + async fn test_single_writer() { + let sink_id = SinkId::from(1); + let param = SinkParam { + sink_id, + properties: Default::default(), + columns: vec![], + downstream_pk: vec![], + sink_type: SinkType::AppendOnly, + format_desc: None, + db_name: "test".into(), + sink_from_name: "test".into(), + }; + + let epoch1 = 233; + let epoch2 = 234; + + let all_vnode = (0..VirtualNode::COUNT).collect_vec(); + let build_bitmap = |indexes: &[usize]| { + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); + for i in indexes { + builder.set(*i, true); + } + builder.finish() + }; + let vnode = build_bitmap(&all_vnode); + + let metadata = [vec![1u8, 2u8], vec![3u8, 4u8]]; + + let (manager, (_join_handle, _stop_tx)) = + SinkCoordinatorManager::start_worker_with_spawn_worker({ + let param = param.clone(); + let metadata = metadata.clone(); + move |first_request: NewSinkWriterRequest, new_writer_rx| { + let param = param.clone(); + let metadata = metadata.clone(); + tokio::spawn(async move { + // validate the start request + assert_eq!(first_request.param, param); + 
CoordinatorWorker::execute_coordinator( + first_request, + new_writer_rx, + MockCoordinator::new(0, |epoch, metadata_list, count: &mut usize| { + *count += 1; + let mut metadata_list = metadata_list + .into_iter() + .map(|metadata| match metadata { + SinkMetadata { + metadata: + Some(Metadata::Serialized(SerializedMetadata { + metadata, + })), + } => metadata, + _ => unreachable!(), + }) + .collect_vec(); + metadata_list.sort(); + match *count { + 1 => { + assert_eq!(epoch, epoch1); + assert_eq!(1, metadata_list.len()); + assert_eq!(metadata[0], metadata_list[0]); + } + 2 => { + assert_eq!(epoch, epoch2); + assert_eq!(1, metadata_list.len()); + assert_eq!(metadata[1], metadata_list[0]); + } + _ => unreachable!(), + } + Ok(()) + }), + ) + .await; + }) + } + }); + + let build_client = |vnode| async { + CoordinatorStreamHandle::new_with_init_stream( + param.to_proto(), + vnode, + |stream_req| async { + Ok(tonic::Response::new( + manager + .handle_new_request(stream_req.into_inner().map(Ok).boxed()) + .await + .unwrap() + .boxed(), + )) + }, + ) + .await + .unwrap() + }; + + let mut client = build_client(vnode).await; + + client + .commit( + epoch1, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[0].clone(), + })), + }, + ) + .await + .unwrap(); + + client + .commit( + epoch2, + SinkMetadata { + metadata: Some(Metadata::Serialized(SerializedMetadata { + metadata: metadata[1].clone(), + })), + }, + ) + .await + .unwrap(); + } + #[tokio::test] async fn test_drop_sink_while_init() { let sink_id = SinkId::from(1); @@ -595,11 +700,12 @@ mod tests { columns: vec![], downstream_pk: vec![], sink_type: SinkType::AppendOnly, + format_desc: None, db_name: "test".into(), sink_from_name: "test".into(), }; - let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker(None); + let (manager, (_join_handle, _stop_tx)) = SinkCoordinatorManager::start_worker(); let mut build_client_future1 = pin!(CoordinatorStreamHandle::new_with_init_stream( param.to_proto(), @@ -633,6 +739,7 @@ mod tests { columns: vec![], downstream_pk: vec![], sink_type: SinkType::AppendOnly, + format_desc: None, db_name: "test".into(), sink_from_name: "test".into(), }; @@ -653,9 +760,9 @@ mod tests { let vnode2 = build_bitmap(second); let (manager, (_join_handle, _stop_tx)) = - SinkCoordinatorManager::start_worker_with_spawn_worker(None, { + SinkCoordinatorManager::start_worker_with_spawn_worker({ let param = param.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx, _| { + move |first_request: NewSinkWriterRequest, new_writer_rx| { let param = param.clone(); tokio::spawn(async move { // validate the start request @@ -715,6 +822,7 @@ mod tests { columns: vec![], downstream_pk: vec![], sink_type: SinkType::AppendOnly, + format_desc: None, db_name: "test".into(), sink_from_name: "test".into(), }; @@ -735,9 +843,9 @@ mod tests { let vnode2 = build_bitmap(second); let (manager, (_join_handle, _stop_tx)) = - SinkCoordinatorManager::start_worker_with_spawn_worker(None, { + SinkCoordinatorManager::start_worker_with_spawn_worker({ let param = param.clone(); - move |first_request: NewSinkWriterRequest, new_writer_rx, _| { + move |first_request: NewSinkWriterRequest, new_writer_rx| { let param = param.clone(); tokio::spawn(async move { // validate the start request diff --git a/src/meta/src/manager/sink_coordination/mod.rs b/src/meta/src/manager/sink_coordination/mod.rs index 30786c8721e97..fe861e2175343 100644 --- a/src/meta/src/manager/sink_coordination/mod.rs +++ 
b/src/meta/src/manager/sink_coordination/mod.rs @@ -16,19 +16,19 @@ mod coordinator_worker; mod manager; use futures::stream::BoxStream; -pub(crate) use manager::SinkCoordinatorManager; +pub use manager::SinkCoordinatorManager; use risingwave_common::buffer::Bitmap; use risingwave_connector::sink::SinkParam; use risingwave_pb::connector_service::{CoordinateRequest, CoordinateResponse}; use tokio::sync::mpsc::Sender; use tonic::Status; -pub(crate) type SinkWriterRequestStream = BoxStream<'static, Result>; -pub(crate) type SinkCoordinatorResponseSender = Sender>; +pub type SinkWriterRequestStream = BoxStream<'static, Result>; +pub type SinkCoordinatorResponseSender = Sender>; -pub(crate) struct NewSinkWriterRequest { - pub(crate) request_stream: SinkWriterRequestStream, - pub(crate) response_tx: SinkCoordinatorResponseSender, - pub(crate) param: SinkParam, - pub(crate) vnode_bitmap: Bitmap, +pub struct NewSinkWriterRequest { + pub request_stream: SinkWriterRequestStream, + pub response_tx: SinkCoordinatorResponseSender, + pub param: SinkParam, + pub vnode_bitmap: Bitmap, } diff --git a/src/meta/src/manager/streaming_job.rs b/src/meta/src/manager/streaming_job.rs index 6b3e71fe20092..e02388eba4f3d 100644 --- a/src/meta/src/manager/streaming_job.rs +++ b/src/meta/src/manager/streaming_job.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use risingwave_common::catalog::TableVersionId; use risingwave_common::util::epoch::Epoch; -use risingwave_pb::catalog::{Index, Sink, Source, Table}; +use risingwave_pb::catalog::{CreateType, Index, Sink, Source, Table}; use crate::model::FragmentId; @@ -31,7 +31,7 @@ pub enum StreamingJob { } impl StreamingJob { - pub(crate) fn mark_created(&mut self) { + pub fn mark_created(&mut self) { let created_at_epoch = Some(Epoch::now().0); match self { StreamingJob::MaterializedView(table) => table.created_at_epoch = created_at_epoch, @@ -48,7 +48,7 @@ impl StreamingJob { } } - pub(crate) fn mark_initialized(&mut self) { + pub fn mark_initialized(&mut self) { let initialized_at_epoch = Some(Epoch::now().0); match self { StreamingJob::MaterializedView(table) => { @@ -197,4 +197,13 @@ impl StreamingJob { None } } + + pub fn create_type(&self) -> CreateType { + match self { + Self::MaterializedView(table) => { + table.get_create_type().unwrap_or(CreateType::Foreground) + } + _ => CreateType::Foreground, + } + } } diff --git a/src/meta/src/manager/system_param/mod.rs b/src/meta/src/manager/system_param/mod.rs index cdedad61d8d71..eb24e0db0f340 100644 --- a/src/meta/src/manager/system_param/mod.rs +++ b/src/meta/src/manager/system_param/mod.rs @@ -89,7 +89,7 @@ impl SystemParamsManager { set_system_param(mem_txn.deref_mut(), name, value).map_err(MetaError::system_param)?; let mut store_txn = Transaction::default(); - mem_txn.apply_to_txn(&mut store_txn)?; + mem_txn.apply_to_txn(&mut store_txn).await?; self.meta_store.txn(store_txn).await?; mem_txn.commit(); diff --git a/src/meta/src/manager/system_param/model.rs b/src/meta/src/manager/system_param/model.rs index bed4f3d86e8a4..d486d6a5d74c6 100644 --- a/src/meta/src/manager/system_param/model.rs +++ b/src/meta/src/manager/system_param/model.rs @@ -67,20 +67,21 @@ impl SystemParamsModel for SystemParams { S: MetaStore, { let mut txn = Transaction::default(); - self.upsert_in_transaction(&mut txn)?; + self.upsert_in_transaction(&mut txn).await?; Ok(store.txn(txn).await?) 
} } -impl Transactional for SystemParams { - fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { +#[async_trait] +impl Transactional for SystemParams { + async fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { for (k, v) in system_params_to_kv(self).map_err(MetadataModelError::internal)? { trx.put(Self::cf_name(), k.into_bytes(), v.into_bytes()); } Ok(()) } - fn delete_in_transaction(&self, _trx: &mut Transaction) -> MetadataModelResult<()> { + async fn delete_in_transaction(&self, _trx: &mut Transaction) -> MetadataModelResult<()> { unreachable!() } } diff --git a/src/meta/src/model/cluster.rs b/src/meta/src/model/cluster.rs index 882f48b6dc8c4..3d654a1d6b8c9 100644 --- a/src/meta/src/model/cluster.rs +++ b/src/meta/src/model/cluster.rs @@ -128,8 +128,14 @@ const CLUSTER_ID_KEY: &[u8] = "cluster_id".as_bytes(); #[derive(Clone, Debug)] pub struct ClusterId(String); +impl Default for ClusterId { + fn default() -> Self { + Self::new() + } +} + impl ClusterId { - pub(crate) fn new() -> Self { + pub fn new() -> Self { Self(Uuid::new_v4().to_string()) } @@ -139,15 +145,13 @@ impl ClusterId { )) } - pub(crate) async fn from_meta_store( + pub async fn from_meta_store( meta_store: &S, ) -> MetadataModelResult> { Self::from_snapshot::(&meta_store.snapshot().await).await } - pub(crate) async fn from_snapshot( - s: &S::Snapshot, - ) -> MetadataModelResult> { + pub async fn from_snapshot(s: &S::Snapshot) -> MetadataModelResult> { match s.get_cf(CLUSTER_ID_CF_NAME, CLUSTER_ID_KEY).await { Ok(bytes) => Ok(Some(Self::from_bytes(bytes)?)), Err(e) => match e { @@ -157,10 +161,7 @@ impl ClusterId { } } - pub(crate) async fn put_at_meta_store( - &self, - meta_store: &S, - ) -> MetadataModelResult<()> { + pub async fn put_at_meta_store(&self, meta_store: &S) -> MetadataModelResult<()> { Ok(meta_store .put_cf( CLUSTER_ID_CF_NAME, diff --git a/src/meta/src/model/mod.rs b/src/meta/src/model/mod.rs index bb07e7e7b6cf1..f1fe0285d9ae8 100644 --- a/src/meta/src/model/mod.rs +++ b/src/meta/src/model/mod.rs @@ -24,6 +24,7 @@ mod user; use std::collections::btree_map::{Entry, VacantEntry}; use std::collections::BTreeMap; use std::fmt::Debug; +use std::marker::PhantomData; use std::ops::{Deref, DerefMut}; use async_trait::async_trait; @@ -48,9 +49,10 @@ pub type DispatcherId = u64; /// A global, unique identifier of a fragment pub type FragmentId = u32; -pub trait Transactional { - fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()>; - fn delete_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()>; +#[async_trait] +pub trait Transactional { + async fn upsert_in_transaction(&self, trx: &mut TXN) -> MetadataModelResult<()>; + async fn delete_in_transaction(&self, trx: &mut TXN) -> MetadataModelResult<()>; } mod private { @@ -203,11 +205,12 @@ for_all_metadata_models!(impl_metadata_model_marker); /// `Transactional` defines operations supported in a transaction. /// Read operations can be supported if necessary. 
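The `Transactional` refactor above makes the trait generic over the transaction type and async (via `async_trait`), so a SQL-backed meta store can implement it alongside the existing KV `Transaction`. Below is a minimal sketch of that shape; `SqlTxn`, its `put` method and the simplified `Result` alias are hypothetical stand-ins rather than the real meta-store API, and `tokio` is assumed only to drive the example.

```rust
use async_trait::async_trait;

// Illustrative result alias; the real code uses `MetadataModelResult`.
type Result<T> = std::result::Result<T, ()>;

// Hypothetical stand-in for a non-KV (e.g. SQL) transaction type.
#[derive(Default)]
struct SqlTxn {
    statements: Vec<String>,
}

impl SqlTxn {
    async fn put(&mut self, table: &str, key: String, value: String) {
        self.statements.push(format!("UPSERT {table}: {key} = {value}"));
    }
}

// The refactored trait: generic over the transaction type and async, so
// backends that need `.await` inside upsert/delete can implement it too.
#[async_trait]
trait Transactional<TXN> {
    async fn upsert_in_transaction(&self, trx: &mut TXN) -> Result<()>;
    async fn delete_in_transaction(&self, trx: &mut TXN) -> Result<()>;
}

struct KvPair {
    key: String,
    value: String,
}

#[async_trait]
impl Transactional<SqlTxn> for KvPair {
    async fn upsert_in_transaction(&self, trx: &mut SqlTxn) -> Result<()> {
        trx.put("test-cf", self.key.clone(), self.value.clone()).await;
        Ok(())
    }

    async fn delete_in_transaction(&self, _trx: &mut SqlTxn) -> Result<()> {
        Ok(())
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let mut txn = SqlTxn::default();
    let kv = KvPair { key: "key".into(), value: "value".into() };
    kv.upsert_in_transaction(&mut txn).await?;
    assert_eq!(txn.statements.len(), 1);
    Ok(())
}
```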
-impl Transactional for T +#[async_trait] +impl Transactional for T where - T: MetadataModel, + T: MetadataModel + Sync, { - fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { + async fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { trx.put( Self::cf_name(), self.key()?.encode_to_vec(), @@ -216,7 +219,7 @@ where Ok(()) } - fn delete_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { + async fn delete_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { trx.delete(Self::cf_name(), self.key()?.encode_to_vec()); Ok(()) } @@ -225,11 +228,12 @@ where /// Trait that wraps a local memory value and applies the change to the local memory value on /// `commit` or leaves the local memory value untouched on `abort`. pub trait ValTransaction: Sized { + type TXN; /// Commit the change to local memory value fn commit(self); /// Apply the change (upsert or delete) to `txn` - fn apply_to_txn(&self, txn: &mut Transaction) -> MetadataModelResult<()>; + async fn apply_to_txn(&self, txn: &mut Self::TXN) -> MetadataModelResult<()>; /// Abort the `VarTransaction` and leave the local memory value untouched fn abort(self) { @@ -243,26 +247,28 @@ pub trait ValTransaction: Sized { /// When `commit` is called, the change to `new_value` will be applied to the `orig_value_ref` /// When `abort` is called, the `VarTransaction` is dropped and the local memory value is /// untouched. -pub struct VarTransaction<'a, T: Transactional> { +pub struct VarTransaction<'a, TXN, T: Transactional> { orig_value_ref: &'a mut T, new_value: Option, + _phantom: PhantomData, } -impl<'a, T> VarTransaction<'a, T> +impl<'a, TXN, T> VarTransaction<'a, TXN, T> where - T: Transactional, + T: Transactional, { /// Create a `VarTransaction` that wraps a raw variable - pub fn new(val_ref: &'a mut T) -> VarTransaction<'a, T> { + pub fn new(val_ref: &'a mut T) -> VarTransaction<'a, TXN, T> { VarTransaction { // lazy initialization new_value: None, orig_value_ref: val_ref, + _phantom: PhantomData, } } } -impl<'a, T: Transactional> Deref for VarTransaction<'a, T> { +impl<'a, TXN, T: Transactional> Deref for VarTransaction<'a, TXN, T> { type Target = T; fn deref(&self) -> &Self::Target { @@ -273,9 +279,9 @@ impl<'a, T: Transactional> Deref for VarTransaction<'a, T> { } } -impl<'a, T> DerefMut for VarTransaction<'a, T> +impl<'a, TXN, T> DerefMut for VarTransaction<'a, TXN, T> where - T: Clone + Transactional, + T: Clone + Transactional, { fn deref_mut(&mut self) -> &mut Self::Target { if self.new_value.is_none() { @@ -285,21 +291,23 @@ where } } -impl<'a, T> ValTransaction for VarTransaction<'a, T> +impl<'a, TXN, T> ValTransaction for VarTransaction<'a, TXN, T> where - T: Transactional + PartialEq, + T: Transactional + PartialEq, { + type TXN = TXN; + fn commit(self) { if let Some(new_value) = self.new_value { *self.orig_value_ref = new_value; } } - fn apply_to_txn(&self, txn: &mut Transaction) -> MetadataModelResult<()> { + async fn apply_to_txn(&self, txn: &mut Self::TXN) -> MetadataModelResult<()> { if let Some(new_value) = &self.new_value { // Apply the change to `txn` only when the value is modified if *self.orig_value_ref != *new_value { - new_value.upsert_in_transaction(txn) + new_value.upsert_in_transaction(txn).await } else { Ok(()) } @@ -418,25 +426,27 @@ enum BTreeMapOp { /// are stored in `staging`. On `commit`, it will apply the changes stored in `staging` to the in /// memory btree map. 
When serve `get` and `get_mut`, it merges the value stored in `staging` and /// `tree_ref`. -pub struct BTreeMapTransaction<'a, K: Ord, V> { +pub struct BTreeMapTransaction<'a, K: Ord, V, TXN = Transaction> { /// A reference to the original `BTreeMap`. All access to this field should be immutable, /// except when we commit the staging changes to the original map. tree_ref: &'a mut BTreeMap, /// Store all the staging changes that will be applied to the original map on commit staging: BTreeMap>, + _phantom: PhantomData, } -impl<'a, K: Ord + Debug, V: Clone> BTreeMapTransaction<'a, K, V> { - pub fn new(tree_ref: &'a mut BTreeMap) -> BTreeMapTransaction<'a, K, V> { +impl<'a, K: Ord + Debug, V: Clone, TXN> BTreeMapTransaction<'a, K, V, TXN> { + pub fn new(tree_ref: &'a mut BTreeMap) -> BTreeMapTransaction<'a, K, V, TXN> { Self { tree_ref, staging: BTreeMap::default(), + _phantom: PhantomData, } } /// Start a `BTreeMapEntryTransaction` when the `key` exists #[allow(dead_code)] - pub fn new_entry_txn(&mut self, key: K) -> Option> { + pub fn new_entry_txn(&mut self, key: K) -> Option> { BTreeMapEntryTransaction::new(self.tree_ref, key, None) } @@ -447,13 +457,17 @@ impl<'a, K: Ord + Debug, V: Clone> BTreeMapTransaction<'a, K, V> { &mut self, key: K, default_val: V, - ) -> BTreeMapEntryTransaction<'_, K, V> { + ) -> BTreeMapEntryTransaction<'_, K, V, TXN> { BTreeMapEntryTransaction::new(self.tree_ref, key, Some(default_val)) .expect("default value is provided and should return `Some`") } /// Start a `BTreeMapEntryTransaction` that inserts the `val` into `key`. - pub fn new_entry_insert_txn(&mut self, key: K, val: V) -> BTreeMapEntryTransaction<'_, K, V> { + pub fn new_entry_insert_txn( + &mut self, + key: K, + val: V, + ) -> BTreeMapEntryTransaction<'_, K, V, TXN> { BTreeMapEntryTransaction::new_insert(self.tree_ref, key, val) } @@ -549,21 +563,23 @@ impl<'a, K: Ord + Debug, V: Clone> BTreeMapTransaction<'a, K, V> { } } -impl<'a, K: Ord + Debug, V: Transactional + Clone> ValTransaction - for BTreeMapTransaction<'a, K, V> +impl<'a, K: Ord + Debug, V: Transactional + Clone, TXN> ValTransaction + for BTreeMapTransaction<'a, K, V, TXN> { + type TXN = TXN; + fn commit(self) { self.commit_memory(); } - fn apply_to_txn(&self, txn: &mut Transaction) -> MetadataModelResult<()> { + async fn apply_to_txn(&self, txn: &mut Self::TXN) -> MetadataModelResult<()> { // Add the staging operation to txn for (k, op) in &self.staging { match op { - BTreeMapOp::Insert(v) => v.upsert_in_transaction(txn)?, + BTreeMapOp::Insert(v) => v.upsert_in_transaction(txn).await?, BTreeMapOp::Delete => { if let Some(v) = self.tree_ref.get(k) { - v.delete_in_transaction(txn)?; + v.delete_in_transaction(txn).await?; } } } @@ -573,24 +589,26 @@ impl<'a, K: Ord + Debug, V: Transactional + Clone> ValTransaction } /// Transaction wrapper for a `BTreeMap` entry value of given `key` -pub struct BTreeMapEntryTransaction<'a, K, V> { +pub struct BTreeMapEntryTransaction<'a, K, V, TXN> { tree_ref: &'a mut BTreeMap, pub key: K, pub new_value: V, + _phantom: PhantomData, } -impl<'a, K: Ord + Debug, V: Clone> BTreeMapEntryTransaction<'a, K, V> { +impl<'a, K: Ord + Debug, V: Clone, TXN> BTreeMapEntryTransaction<'a, K, V, TXN> { /// Create a `ValTransaction` that wraps a `BTreeMap` entry of the given `key`. 
/// If the tree does not contain `key`, the `default_val` will be used as the initial value pub fn new_insert( tree_ref: &'a mut BTreeMap, key: K, value: V, - ) -> BTreeMapEntryTransaction<'a, K, V> { + ) -> BTreeMapEntryTransaction<'a, K, V, TXN> { BTreeMapEntryTransaction { new_value: value, tree_ref, key, + _phantom: PhantomData, } } @@ -604,7 +622,7 @@ impl<'a, K: Ord + Debug, V: Clone> BTreeMapEntryTransaction<'a, K, V> { tree_ref: &'a mut BTreeMap, key: K, default_val: Option, - ) -> Option> { + ) -> Option> { tree_ref .get(&key) .cloned() @@ -613,11 +631,12 @@ impl<'a, K: Ord + Debug, V: Clone> BTreeMapEntryTransaction<'a, K, V> { new_value: orig_value, tree_ref, key, + _phantom: PhantomData, }) } } -impl<'a, K, V> Deref for BTreeMapEntryTransaction<'a, K, V> { +impl<'a, K, V, TXN> Deref for BTreeMapEntryTransaction<'a, K, V, TXN> { type Target = V; fn deref(&self) -> &Self::Target { @@ -625,24 +644,26 @@ impl<'a, K, V> Deref for BTreeMapEntryTransaction<'a, K, V> { } } -impl<'a, K, V> DerefMut for BTreeMapEntryTransaction<'a, K, V> { +impl<'a, K, V, TXN> DerefMut for BTreeMapEntryTransaction<'a, K, V, TXN> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.new_value } } -impl<'a, K: Ord, V: PartialEq + Transactional> ValTransaction - for BTreeMapEntryTransaction<'a, K, V> +impl<'a, K: Ord, V: PartialEq + Transactional, TXN> ValTransaction + for BTreeMapEntryTransaction<'a, K, V, TXN> { + type TXN = TXN; + fn commit(self) { self.tree_ref.insert(self.key, self.new_value); } - fn apply_to_txn(&self, txn: &mut Transaction) -> MetadataModelResult<()> { + async fn apply_to_txn(&self, txn: &mut Self::TXN) -> MetadataModelResult<()> { if !self.tree_ref.contains_key(&self.key) || *self.tree_ref.get(&self.key).unwrap() != self.new_value { - self.new_value.upsert_in_transaction(txn)? + self.new_value.upsert_in_transaction(txn).await? 
} Ok(()) } @@ -661,8 +682,9 @@ mod tests { const TEST_CF: &str = "test-cf"; - impl Transactional for TestTransactional { - fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { + #[async_trait] + impl Transactional for TestTransactional { + async fn upsert_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { trx.put( TEST_CF.to_string(), self.key.as_bytes().into(), @@ -671,14 +693,14 @@ mod tests { Ok(()) } - fn delete_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { + async fn delete_in_transaction(&self, trx: &mut Transaction) -> MetadataModelResult<()> { trx.delete(TEST_CF.to_string(), self.key.as_bytes().into()); Ok(()) } } - #[test] - fn test_simple_var_transaction_commit() { + #[tokio::test] + async fn test_simple_var_transaction_commit() { let mut kv = TestTransactional { key: "key", value: "original", @@ -687,7 +709,7 @@ mod tests { num_txn.value = "modified"; assert_eq!(num_txn.value, "modified"); let mut txn = Transaction::default(); - num_txn.apply_to_txn(&mut txn).unwrap(); + num_txn.apply_to_txn(&mut txn).await.unwrap(); let txn_op = txn.get_operations(); assert_eq!(1, txn_op.len()); assert!(matches!( @@ -717,8 +739,8 @@ mod tests { assert_eq!("original", kv.value); } - #[test] - fn test_tree_map_transaction_commit() { + #[tokio::test] + async fn test_tree_map_transaction_commit() { let mut map: BTreeMap = BTreeMap::new(); map.insert( "to-remove".to_string(), @@ -800,7 +822,7 @@ mod tests { ); let mut txn = Transaction::default(); - map_txn.apply_to_txn(&mut txn).unwrap(); + map_txn.apply_to_txn(&mut txn).await.unwrap(); let txn_ops = txn.get_operations(); assert_eq!(5, txn_ops.len()); for op in txn_ops { @@ -860,8 +882,8 @@ mod tests { assert_eq!(map_copy, map); } - #[test] - fn test_tree_map_entry_update_transaction_commit() { + #[tokio::test] + async fn test_tree_map_entry_update_transaction_commit() { let mut map: BTreeMap = BTreeMap::new(); map.insert( "first".to_string(), @@ -875,7 +897,7 @@ mod tests { let mut first_entry_txn = map_txn.new_entry_txn("first".to_string()).unwrap(); first_entry_txn.value = "first-value"; let mut txn = Transaction::default(); - first_entry_txn.apply_to_txn(&mut txn).unwrap(); + first_entry_txn.apply_to_txn(&mut txn).await.unwrap(); let txn_ops = txn.get_operations(); assert_eq!(1, txn_ops.len()); assert!( @@ -885,8 +907,8 @@ mod tests { assert_eq!("first-value", map.get("first").unwrap().value); } - #[test] - fn test_tree_map_entry_insert_transaction_commit() { + #[tokio::test] + async fn test_tree_map_entry_insert_transaction_commit() { let mut map: BTreeMap = BTreeMap::new(); let mut map_txn = BTreeMapTransaction::new(&mut map); @@ -898,7 +920,7 @@ mod tests { }, ); let mut txn = Transaction::default(); - first_entry_txn.apply_to_txn(&mut txn).unwrap(); + first_entry_txn.apply_to_txn(&mut txn).await.unwrap(); let txn_ops = txn.get_operations(); assert_eq!(1, txn_ops.len()); assert!( diff --git a/src/meta/src/model/stream.rs b/src/meta/src/model/stream.rs index 5dd8f53e249b0..726bd7fcd8e73 100644 --- a/src/meta/src/model/stream.rs +++ b/src/meta/src/model/stream.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::ops::AddAssign; use itertools::Itertools; use risingwave_common::catalog::TableId; @@ -48,22 +49,22 @@ pub struct TableFragments { state: State, /// The table fragments. 
- pub(crate) fragments: BTreeMap, + pub fragments: BTreeMap, /// The status of actors - pub(crate) actor_status: BTreeMap, + pub actor_status: BTreeMap, /// The splits of actors - pub(crate) actor_splits: HashMap>, + pub actor_splits: HashMap>, /// The environment associated with this stream plan and its fragments - pub(crate) env: StreamEnvironment, + pub env: StreamEnvironment, } #[derive(Debug, Clone, Default)] pub struct StreamEnvironment { /// The timezone used to interpret timestamps and dates for conversion - pub(crate) timezone: Option, + pub timezone: Option, } impl StreamEnvironment { @@ -353,9 +354,12 @@ impl TableFragments { } /// Resolve dependent table - fn resolve_dependent_table(stream_node: &StreamNode, table_ids: &mut HashSet) { + fn resolve_dependent_table(stream_node: &StreamNode, table_ids: &mut HashMap) { if let Some(NodeBody::Chain(chain)) = stream_node.node_body.as_ref() { - table_ids.insert(TableId::new(chain.table_id)); + table_ids + .entry(TableId::new(chain.table_id)) + .or_default() + .add_assign(1); } for child in &stream_node.input { @@ -363,9 +367,10 @@ impl TableFragments { } } - /// Returns dependent table ids. - pub fn dependent_table_ids(&self) -> HashSet { - let mut table_ids = HashSet::new(); + /// Returns a mapping of dependent table ids of the `TableFragments` + /// to their corresponding count. + pub fn dependent_table_ids(&self) -> HashMap { + let mut table_ids = HashMap::new(); self.fragments.values().for_each(|fragment| { let actor = &fragment.actors[0]; Self::resolve_dependent_table(actor.nodes.as_ref().unwrap(), &mut table_ids); diff --git a/src/meta/src/model_v2/README.md b/src/meta/src/model_v2/README.md new file mode 100644 index 0000000000000..25c22a4f566e1 --- /dev/null +++ b/src/meta/src/model_v2/README.md @@ -0,0 +1,50 @@ +# How to define changes between versions and generate migration and model files + +- Generate a new migration file and apply it to the database, check [migration](./migration/README.md) for more details. Let's take a local PG database as an example(`postgres://postgres:@localhost:5432/postgres`): + ```sh + export DATABASE_URL=postgres://postgres:@localhost:5432/postgres; + cargo run -- generate MIGRATION_NAME + cargo run -- up + ``` + - Define tables, indexes, foreign keys in the file. The new generated file will include a sample migration script, + you can replace it with your own migration scripts, like defining or changing tables, indexes, foreign keys and other + dml operation to do data correctness etc. Check [writing-migration](https://www.sea-ql.org/SeaORM/docs/migration/writing-migration/) + for more details. + ```rust + #[async_trait::async_trait] + impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + // Replace the sample below with your own migration scripts + todo!(); + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + // Replace the sample below with your own migration scripts + todo!(); + } + } + ``` +- Apply migration, and generate model files for new tables and indexes from the database, so you don't need to write them manually, + ```sh + cargo run -- up + sea-orm-cli generate entity -u postgres://postgres:@localhost:5432/postgres -s public -o {target_dir} + cp {target_dir}/xxx.rs src/meta/src/model_v2/ + ``` +- Defines enum and array types in the model files, since they're basically only supported in PG, and we need to + define them in the model files manually. 
For example: + ```rust + // We define integer array typed fields as json and derive it using the follow one. + #[derive(Clone, Debug, PartialEq, FromJsonQueryResult, Eq, Serialize, Deserialize, Default)] + pub struct I32Array(pub Vec); + + // We define enum typed fields as string and derive it using the follow one. + #[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] + #[sea_orm(rs_type = "String", db_type = "String(None)")] + pub enum WorkerStatus { + #[sea_orm(string_value = "STARTING")] + Starting, + #[sea_orm(string_value = "RUNNING")] + Running, + } + ``` +- Define other helper functions in the model files if necessary. \ No newline at end of file diff --git a/src/meta/src/model_v2/actor.rs b/src/meta/src/model_v2/actor.rs new file mode 100644 index 0000000000000..8fecb3046b1bc --- /dev/null +++ b/src/meta/src/model_v2/actor.rs @@ -0,0 +1,51 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::I32Array; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "actor")] +pub struct Model { + #[sea_orm(primary_key)] + pub actor_id: i32, + pub fragment_id: i32, + pub status: Option, + pub splits: Option, + pub parallel_unit_id: i32, + pub upstream_actor_ids: Option, + pub dispatchers: Option, + pub vnode_bitmap: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::fragment::Entity", + from = "Column::FragmentId", + to = "super::fragment::Column::FragmentId", + on_update = "NoAction", + on_delete = "Cascade" + )] + Fragment, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Fragment.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/cluster.rs b/src/meta/src/model_v2/cluster.rs new file mode 100644 index 0000000000000..36cdb449046bf --- /dev/null +++ b/src/meta/src/model_v2/cluster.rs @@ -0,0 +1,28 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "cluster")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub cluster_id: Uuid, + pub created_at: DateTime, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/compaction_config.rs b/src/meta/src/model_v2/compaction_config.rs new file mode 100644 index 0000000000000..6f8345734586e --- /dev/null +++ b/src/meta/src/model_v2/compaction_config.rs @@ -0,0 +1,29 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "compaction_config")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub compaction_group_id: i64, + #[sea_orm(column_type = "JsonBinary", nullable)] + pub config: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/compaction_status.rs b/src/meta/src/model_v2/compaction_status.rs new file mode 100644 index 0000000000000..5872463395066 --- /dev/null +++ b/src/meta/src/model_v2/compaction_status.rs @@ -0,0 +1,29 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "compaction_status")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub compaction_group_id: i64, + #[sea_orm(column_type = "JsonBinary", nullable)] + pub status: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/compaction_task.rs b/src/meta/src/model_v2/compaction_task.rs new file mode 100644 index 0000000000000..d3211b96d9a65 --- /dev/null +++ b/src/meta/src/model_v2/compaction_task.rs @@ -0,0 +1,30 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "compaction_task")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub id: i64, + #[sea_orm(column_type = "JsonBinary")] + pub task: Json, + pub context_id: i32, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/connection.rs b/src/meta/src/model_v2/connection.rs new file mode 100644 index 0000000000000..0096603c843a3 --- /dev/null +++ b/src/meta/src/model_v2/connection.rs @@ -0,0 +1,79 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use risingwave_pb::catalog::connection::PbInfo; +use risingwave_pb::catalog::PbConnection; +use sea_orm::entity::prelude::*; +use sea_orm::ActiveValue; + +use crate::model_v2::{ConnectionId, PrivateLinkService}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "connection")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub connection_id: ConnectionId, + pub name: String, + pub info: PrivateLinkService, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::ConnectionId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, + #[sea_orm(has_many = "super::sink::Entity")] + Sink, + #[sea_orm(has_many = "super::source::Entity")] + Source, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Sink.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Source.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} + +impl From for ActiveModel { + fn from(conn: PbConnection) -> Self { + let Some(PbInfo::PrivateLinkService(private_link_srv)) = conn.info else { + unreachable!("private link not provided.") + }; + + Self { + connection_id: ActiveValue::Set(conn.id as _), + name: ActiveValue::Set(conn.name), + info: ActiveValue::Set(PrivateLinkService(private_link_srv)), + } + } +} diff --git a/src/meta/src/model_v2/database.rs b/src/meta/src/model_v2/database.rs new file mode 100644 index 0000000000000..909c12eceac5a --- /dev/null +++ b/src/meta/src/model_v2/database.rs @@ -0,0 +1,46 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not 
use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::DatabaseId; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "database")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub database_id: DatabaseId, + #[sea_orm(unique)] + pub name: String, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::DatabaseId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/ext/hummock.rs b/src/meta/src/model_v2/ext/hummock.rs new file mode 100644 index 0000000000000..77111e2e7d202 --- /dev/null +++ b/src/meta/src/model_v2/ext/hummock.rs @@ -0,0 +1,61 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use risingwave_pb::hummock::HummockPinnedVersion; +use sea_orm::sea_query::OnConflict; +use sea_orm::ActiveValue::{Set, Unchanged}; +use sea_orm::EntityTrait; + +use crate::model::{MetadataModelResult, Transactional}; +use crate::model_v2::hummock_pinned_version; +use crate::model_v2::trx::Transaction; + +#[async_trait::async_trait] +impl Transactional for HummockPinnedVersion { + async fn upsert_in_transaction( + &self, + trx: &mut crate::model_v2::trx::Transaction, + ) -> MetadataModelResult<()> { + // TODO: error type conversion + // TODO: integer type conversion + let m = hummock_pinned_version::ActiveModel { + context_id: Unchanged(self.context_id.try_into().unwrap()), + min_pinned_id: Set(self.min_pinned_id.try_into().unwrap()), + }; + hummock_pinned_version::Entity::insert(m) + .on_conflict( + OnConflict::column(hummock_pinned_version::Column::ContextId) + .update_columns([hummock_pinned_version::Column::MinPinnedId]) + .to_owned(), + ) + .exec(trx) + .await + .unwrap(); + Ok(()) + } + + async fn delete_in_transaction( + &self, + trx: &mut crate::model_v2::trx::Transaction, + ) -> MetadataModelResult<()> { + // TODO: error type conversion + // TODO: integer type conversion + let id: i32 = self.context_id.try_into().unwrap(); + hummock_pinned_version::Entity::delete_by_id(id) + .exec(trx) + .await + .unwrap(); + Ok(()) + } +} diff --git a/src/meta/src/model_v2/ext/mod.rs b/src/meta/src/model_v2/ext/mod.rs new file mode 100644 index 0000000000000..47a5ce8623dc4 --- /dev/null +++ b/src/meta/src/model_v2/ext/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod hummock; +pub use hummock::*; diff --git a/src/meta/src/model_v2/fragment.rs b/src/meta/src/model_v2/fragment.rs new file mode 100644 index 0000000000000..9263dd99eabb8 --- /dev/null +++ b/src/meta/src/model_v2/fragment.rs @@ -0,0 +1,62 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
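The `Transactional` implementation above turns a `HummockPinnedVersion` update into an insert-or-update on `hummock_pinned_version` via `ON CONFLICT (context_id)`. A rough usage sketch, not part of this diff; the function name and literal values are invented, and it assumes an already-opened `model_v2::trx::Transaction`:

```rust
use risingwave_pb::hummock::HummockPinnedVersion;

use crate::model::{MetadataModelResult, Transactional};

// Illustration only: pin a version for a context inside an existing transaction.
async fn pin_version_example(
    txn: &mut crate::model_v2::trx::Transaction,
) -> MetadataModelResult<()> {
    let pinned = HummockPinnedVersion {
        context_id: 1,
        min_pinned_id: 42,
    };
    // Inserts the row, or updates `min_pinned_id` if `context_id` already exists.
    pinned.upsert_in_transaction(txn).await
}
```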
+ +use sea_orm::entity::prelude::*; + +use crate::model_v2::I32Array; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "fragment")] +pub struct Model { + #[sea_orm(primary_key)] + pub fragment_id: i32, + pub table_id: i32, + pub fragment_type_mask: i32, + pub distribution_type: String, + pub stream_node: Json, + pub vnode_mapping: Option, + pub state_table_ids: Option, + pub upstream_fragment_id: Option, + pub dispatcher_type: Option, + pub dist_key_indices: Option, + pub output_indices: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm(has_many = "super::actor::Entity")] + Actor, + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::TableId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Actor.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/function.rs b/src/meta/src/model_v2/function.rs new file mode 100644 index 0000000000000..663f8e2284fd7 --- /dev/null +++ b/src/meta/src/model_v2/function.rs @@ -0,0 +1,90 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
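Because `fragment.rs` above defines a `Related` implementation towards the `actor` entity, the relation can be traversed with sea-orm's `find_related`. A small sketch under the same assumptions as before (established connection, invented helper name), not part of this diff:

```rust
use sea_orm::{DatabaseConnection, DbErr, EntityTrait, ModelTrait};

use crate::model_v2::{actor, fragment};

// Illustration only: load a fragment and then its actors through the relation.
async fn fragment_with_actors(
    db: &DatabaseConnection,
    fragment_id: i32,
) -> Result<Option<(fragment::Model, Vec<actor::Model>)>, DbErr> {
    let Some(fragment) = fragment::Entity::find_by_id(fragment_id).one(db).await? else {
        return Ok(None);
    };
    let actors = fragment.find_related(actor::Entity).all(db).await?;
    Ok(Some((fragment, actors)))
}
```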
+ +use risingwave_pb::catalog::function::Kind; +use risingwave_pb::catalog::PbFunction; +use sea_orm::entity::prelude::*; +use sea_orm::ActiveValue; + +use crate::model_v2::{DataType, DataTypeArray, FunctionId}; + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum FunctionKind { + #[sea_orm(string_value = "Scalar")] + Scalar, + #[sea_orm(string_value = "Table")] + Table, + #[sea_orm(string_value = "Aggregate")] + Aggregate, +} + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "function")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub function_id: FunctionId, + pub name: String, + pub arg_types: DataTypeArray, + pub return_type: DataType, + pub language: String, + pub link: String, + pub identifier: String, + pub kind: FunctionKind, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::FunctionId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} + +impl From for FunctionKind { + fn from(kind: Kind) -> Self { + match kind { + Kind::Scalar(_) => Self::Scalar, + Kind::Table(_) => Self::Table, + Kind::Aggregate(_) => Self::Aggregate, + } + } +} + +impl From for ActiveModel { + fn from(function: PbFunction) -> Self { + Self { + function_id: ActiveValue::Set(function.id as _), + name: ActiveValue::Set(function.name), + arg_types: ActiveValue::Set(DataTypeArray(function.arg_types)), + return_type: ActiveValue::Set(DataType(function.return_type.unwrap())), + language: ActiveValue::Set(function.language), + link: ActiveValue::Set(function.link), + identifier: ActiveValue::Set(function.identifier), + kind: ActiveValue::Set(function.kind.unwrap().into()), + } + } +} diff --git a/src/meta/src/model_v2/hummock_pinned_snapshot.rs b/src/meta/src/model_v2/hummock_pinned_snapshot.rs new file mode 100644 index 0000000000000..170f35dd5d358 --- /dev/null +++ b/src/meta/src/model_v2/hummock_pinned_snapshot.rs @@ -0,0 +1,28 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
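The `From<PbFunction>` conversion above maps a catalog protobuf directly into an `ActiveModel`, so persisting a function definition becomes a one-liner. A sketch, not part of this diff, that assumes the corresponding row in the `object` table (the foreign-key target of `function_id`) has already been created:

```rust
use risingwave_pb::catalog::PbFunction;
use sea_orm::{ActiveModelTrait, DatabaseConnection, DbErr};

use crate::model_v2::function;

// Illustration only: persist a catalog function via the conversion above.
async fn insert_function_example(db: &DatabaseConnection, pb: PbFunction) -> Result<(), DbErr> {
    let model: function::ActiveModel = pb.into();
    model.insert(db).await?;
    Ok(())
}
```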
+ +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "hummock_pinned_snapshot")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub context_id: i32, + pub min_pinned_snapshot: i64, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/hummock_pinned_version.rs b/src/meta/src/model_v2/hummock_pinned_version.rs new file mode 100644 index 0000000000000..6e2f34a5f735e --- /dev/null +++ b/src/meta/src/model_v2/hummock_pinned_version.rs @@ -0,0 +1,28 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "hummock_pinned_version")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub context_id: i32, + pub min_pinned_id: i64, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/hummock_version_delta.rs b/src/meta/src/model_v2/hummock_version_delta.rs new file mode 100644 index 0000000000000..100dd82eafe94 --- /dev/null +++ b/src/meta/src/model_v2/hummock_version_delta.rs @@ -0,0 +1,35 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "hummock_version_delta")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub id: i64, + pub prev_id: i64, + #[sea_orm(column_type = "JsonBinary", nullable)] + pub group_deltas: Option, + pub max_committed_epoch: i64, + pub safe_epoch: i64, + pub trivial_move: bool, + #[sea_orm(column_type = "JsonBinary", nullable)] + pub gc_object_ids: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/hummock_version_stats.rs b/src/meta/src/model_v2/hummock_version_stats.rs new file mode 100644 index 0000000000000..1a7e990df405a --- /dev/null +++ b/src/meta/src/model_v2/hummock_version_stats.rs @@ -0,0 +1,29 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "hummock_version_stats")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub id: i64, + #[sea_orm(column_type = "JsonBinary")] + pub stats: Json, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/index.rs b/src/meta/src/model_v2/index.rs new file mode 100644 index 0000000000000..3b80632e2cfc3 --- /dev/null +++ b/src/meta/src/model_v2/index.rs @@ -0,0 +1,66 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::{ExprNodeArray, I32Array, IndexId, JobStatus, TableId}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "index")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub index_id: IndexId, + pub name: String, + pub index_table_id: TableId, + pub primary_table_id: TableId, + pub index_items: ExprNodeArray, + pub original_columns: I32Array, + pub job_status: JobStatus, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::IndexId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, + #[sea_orm( + belongs_to = "super::table::Entity", + from = "Column::IndexTableId", + to = "super::table::Column::TableId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Table2, + #[sea_orm( + belongs_to = "super::table::Entity", + from = "Column::PrimaryTableId", + to = "super::table::Column::TableId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Table1, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/migration/Cargo.toml b/src/meta/src/model_v2/migration/Cargo.toml new file mode 100644 index 0000000000000..d5d51d77da909 --- /dev/null +++ b/src/meta/src/model_v2/migration/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "model_migration" +version = "0.1.0" +edition = "2021" +publish = false + +[lib] +name = "model_migration" +path = "src/lib.rs" + +[dependencies] +async-std = { version = "1", features = ["attributes", "tokio1"] } +uuid = { version = "1", features = ["v4"] } + +[dependencies.sea-orm-migration] +version = "0.12.0" +features = ["sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", 
"runtime-tokio-native-tls", "with-uuid"] diff --git a/src/meta/src/model_v2/migration/README.md b/src/meta/src/model_v2/migration/README.md new file mode 100644 index 0000000000000..3b438d89e31c5 --- /dev/null +++ b/src/meta/src/model_v2/migration/README.md @@ -0,0 +1,41 @@ +# Running Migrator CLI + +- Generate a new migration file + ```sh + cargo run -- generate MIGRATION_NAME + ``` +- Apply all pending migrations + ```sh + cargo run + ``` + ```sh + cargo run -- up + ``` +- Apply first 10 pending migrations + ```sh + cargo run -- up -n 10 + ``` +- Rollback last applied migrations + ```sh + cargo run -- down + ``` +- Rollback last 10 applied migrations + ```sh + cargo run -- down -n 10 + ``` +- Drop all tables from the database, then reapply all migrations + ```sh + cargo run -- fresh + ``` +- Rollback all applied migrations, then reapply all migrations + ```sh + cargo run -- refresh + ``` +- Rollback all applied migrations + ```sh + cargo run -- reset + ``` +- Check the status of all migrations + ```sh + cargo run -- status + ``` diff --git a/src/meta/src/model_v2/migration/src/lib.rs b/src/meta/src/model_v2/migration/src/lib.rs new file mode 100644 index 0000000000000..570bc75d08e99 --- /dev/null +++ b/src/meta/src/model_v2/migration/src/lib.rs @@ -0,0 +1,16 @@ +pub use sea_orm_migration::prelude::*; + +mod m20230908_072257_init; +mod m20231008_020431_hummock; + +pub struct Migrator; + +#[async_trait::async_trait] +impl MigratorTrait for Migrator { + fn migrations() -> Vec> { + vec![ + Box::new(m20230908_072257_init::Migration), + Box::new(m20231008_020431_hummock::Migration), + ] + } +} diff --git a/src/meta/src/model_v2/migration/src/m20230908_072257_init.rs b/src/meta/src/model_v2/migration/src/m20230908_072257_init.rs new file mode 100644 index 0000000000000..c9559bd6feda2 --- /dev/null +++ b/src/meta/src/model_v2/migration/src/m20230908_072257_init.rs @@ -0,0 +1,999 @@ +use sea_orm_migration::prelude::{Index as MigrationIndex, Table as MigrationTable, *}; + +#[derive(DeriveMigrationName)] +pub struct Migration; + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + // 1. check if the table exists. + assert!(!manager.has_table(Cluster::Table.to_string()).await?); + assert!(!manager.has_table(Worker::Table.to_string()).await?); + assert!(!manager.has_table(WorkerProperty::Table.to_string()).await?); + assert!(!manager.has_table(User::Table.to_string()).await?); + assert!(!manager.has_table(UserPrivilege::Table.to_string()).await?); + assert!(!manager.has_table(Database::Table.to_string()).await?); + assert!(!manager.has_table(Schema::Table.to_string()).await?); + assert!(!manager.has_table(Fragment::Table.to_string()).await?); + assert!(!manager.has_table(Actor::Table.to_string()).await?); + assert!(!manager.has_table(Table::Table.to_string()).await?); + assert!(!manager.has_table(Source::Table.to_string()).await?); + assert!(!manager.has_table(Sink::Table.to_string()).await?); + assert!(!manager.has_table(Connection::Table.to_string()).await?); + assert!(!manager.has_table(View::Table.to_string()).await?); + assert!(!manager.has_table(Index::Table.to_string()).await?); + assert!(!manager.has_table(Function::Table.to_string()).await?); + assert!(!manager.has_table(Object::Table.to_string()).await?); + assert!( + !manager + .has_table(ObjectDependency::Table.to_string()) + .await? + ); + assert!( + !manager + .has_table(SystemParameter::Table.to_string()) + .await? + ); + + // 2. create tables. 
+ manager + .create_table( + MigrationTable::create() + .table(Cluster::Table) + .col( + ColumnDef::new(Cluster::ClusterId) + .uuid() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(Cluster::CreatedAt) + .timestamp() + .default(Expr::current_timestamp()) + .not_null(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Worker::Table) + .col( + ColumnDef::new(Worker::WorkerId) + .integer() + .not_null() + .auto_increment() + .primary_key(), + ) + .col(ColumnDef::new(Worker::WorkerType).string().not_null()) + .col(ColumnDef::new(Worker::Host).string().not_null()) + .col(ColumnDef::new(Worker::Port).integer().not_null()) + .col(ColumnDef::new(Worker::Status).string().not_null()) + .col(ColumnDef::new(Worker::TransactionId).integer()) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(WorkerProperty::Table) + .col( + ColumnDef::new(WorkerProperty::WorkerId) + .integer() + .primary_key(), + ) + .col( + ColumnDef::new(WorkerProperty::ParallelUnitIds) + .json() + .not_null(), + ) + .col( + ColumnDef::new(WorkerProperty::IsStreaming) + .boolean() + .not_null(), + ) + .col( + ColumnDef::new(WorkerProperty::IsServing) + .boolean() + .not_null(), + ) + .col( + ColumnDef::new(WorkerProperty::IsUnschedulable) + .boolean() + .not_null(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_worker_property_worker_id") + .from(WorkerProperty::Table, WorkerProperty::WorkerId) + .to(Worker::Table, Worker::WorkerId) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(User::Table) + .col( + ColumnDef::new(User::UserId) + .integer() + .primary_key() + .auto_increment(), + ) + .col(ColumnDef::new(User::Name).string().not_null()) + .col(ColumnDef::new(User::IsSuper).boolean().not_null()) + .col(ColumnDef::new(User::CanCreateDb).boolean().not_null()) + .col(ColumnDef::new(User::CanCreateUser).boolean().not_null()) + .col(ColumnDef::new(User::CanLogin).boolean().not_null()) + .col(ColumnDef::new(User::AuthType).string()) + .col(ColumnDef::new(User::AuthValue).string()) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Object::Table) + .col( + ColumnDef::new(Object::Oid) + .integer() + .auto_increment() + .primary_key(), + ) + .col(ColumnDef::new(Object::ObjType).string().not_null()) + .col(ColumnDef::new(Object::OwnerId).integer().not_null()) + .col(ColumnDef::new(Object::SchemaId).integer()) + .col(ColumnDef::new(Object::DatabaseId).integer()) + .col( + ColumnDef::new(Object::InitializedAt) + .timestamp() + .default(Expr::current_timestamp()) + .not_null(), + ) + .col( + ColumnDef::new(Object::CreatedAt) + .timestamp() + .default(Expr::current_timestamp()) + .not_null(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_object_owner_id") + .from(Object::Table, Object::OwnerId) + .to(User::Table, User::UserId) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_object_database_id") + .from(Object::Table, Object::DatabaseId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_object_schema_id") + .from(Object::Table, Object::SchemaId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + 
MigrationTable::create() + .table(UserPrivilege::Table) + .col( + ColumnDef::new(UserPrivilege::Id) + .integer() + .primary_key() + .auto_increment(), + ) + .col(ColumnDef::new(UserPrivilege::UserId).integer().not_null()) + .col(ColumnDef::new(UserPrivilege::Oid).integer().not_null()) + .col( + ColumnDef::new(UserPrivilege::GrantedBy) + .integer() + .not_null(), + ) + .col(ColumnDef::new(UserPrivilege::Actions).string().not_null()) + .col( + ColumnDef::new(UserPrivilege::WithGrantOption) + .boolean() + .not_null(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_user_privilege_user_id") + .from(UserPrivilege::Table, UserPrivilege::UserId) + .to(User::Table, User::UserId) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_user_privilege_granted_by") + .from(UserPrivilege::Table, UserPrivilege::GrantedBy) + .to(User::Table, User::UserId) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_user_privilege_oid") + .from(UserPrivilege::Table, UserPrivilege::Oid) + .to(Object::Table, Object::Oid) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(ObjectDependency::Table) + .col( + ColumnDef::new(ObjectDependency::Id) + .integer() + .auto_increment() + .primary_key(), + ) + .col(ColumnDef::new(ObjectDependency::Oid).integer().not_null()) + .col( + ColumnDef::new(ObjectDependency::UsedBy) + .integer() + .not_null(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_object_dependency_oid") + .from(ObjectDependency::Table, ObjectDependency::Oid) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_object_dependency_used_by") + .from(ObjectDependency::Table, ObjectDependency::UsedBy) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Database::Table) + .col(ColumnDef::new(Database::DatabaseId).integer().primary_key()) + .col( + ColumnDef::new(Database::Name) + .string() + .unique_key() + .not_null(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_database_object_id") + .from(Database::Table, Database::DatabaseId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Schema::Table) + .col(ColumnDef::new(Schema::SchemaId).integer().primary_key()) + .col(ColumnDef::new(Schema::Name).string().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_schema_object_id") + .from(Schema::Table, Schema::SchemaId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Fragment::Table) + .col( + ColumnDef::new(Fragment::FragmentId) + .integer() + .primary_key() + .auto_increment(), + ) + .col(ColumnDef::new(Fragment::TableId).integer().not_null()) + .col( + ColumnDef::new(Fragment::FragmentTypeMask) + .integer() + .not_null(), + ) + .col( + ColumnDef::new(Fragment::DistributionType) + .string() + .not_null(), + ) + .col(ColumnDef::new(Fragment::StreamNode).json().not_null()) + .col(ColumnDef::new(Fragment::VnodeMapping).json()) + .col(ColumnDef::new(Fragment::StateTableIds).json()) + 
.col(ColumnDef::new(Fragment::UpstreamFragmentId).json()) + .col(ColumnDef::new(Fragment::DispatcherType).string()) + .col(ColumnDef::new(Fragment::DistKeyIndices).json()) + .col(ColumnDef::new(Fragment::OutputIndices).json()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_fragment_table_id") + .from(Fragment::Table, Fragment::TableId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Actor::Table) + .col( + ColumnDef::new(Actor::ActorId) + .integer() + .primary_key() + .auto_increment(), + ) + .col(ColumnDef::new(Actor::FragmentId).integer().not_null()) + .col(ColumnDef::new(Actor::Status).string()) + .col(ColumnDef::new(Actor::Splits).json()) + .col(ColumnDef::new(Actor::ParallelUnitId).integer().not_null()) + .col(ColumnDef::new(Actor::UpstreamActorIds).json()) + .col(ColumnDef::new(Actor::Dispatchers).json()) + .col(ColumnDef::new(Actor::VnodeBitmap).string()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_actor_fragment_id") + .from(Actor::Table, Actor::FragmentId) + .to(Fragment::Table, Fragment::FragmentId) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Connection::Table) + .col( + ColumnDef::new(Connection::ConnectionId) + .integer() + .primary_key(), + ) + .col(ColumnDef::new(Connection::Name).string().not_null()) + .col(ColumnDef::new(Connection::Info).json().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_connection_object_id") + .from(Connection::Table, Connection::ConnectionId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Source::Table) + .col(ColumnDef::new(Source::SourceId).integer().primary_key()) + .col(ColumnDef::new(Source::Name).string().not_null()) + .col(ColumnDef::new(Source::RowIdIndex).integer()) + .col(ColumnDef::new(Source::Columns).json().not_null()) + .col(ColumnDef::new(Source::PkColumnIds).json().not_null()) + .col(ColumnDef::new(Source::Properties).json().not_null()) + .col(ColumnDef::new(Source::Definition).string().not_null()) + .col(ColumnDef::new(Source::SourceInfo).json()) + .col(ColumnDef::new(Source::WatermarkDescs).json().not_null()) + .col(ColumnDef::new(Source::OptionalAssociatedTableId).integer()) + .col(ColumnDef::new(Source::ConnectionId).integer()) + .col(ColumnDef::new(Source::Version).big_integer().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_source_object_id") + .from(Source::Table, Source::SourceId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_source_connection_id") + .from(Source::Table, Source::ConnectionId) + .to(Connection::Table, Connection::ConnectionId) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Table::Table) + .col(ColumnDef::new(Table::TableId).integer().primary_key()) + .col(ColumnDef::new(Table::Name).string().not_null()) + .col(ColumnDef::new(Table::OptionalAssociatedSourceId).integer()) + .col(ColumnDef::new(Table::TableType).string().not_null()) + .col(ColumnDef::new(Table::Columns).json().not_null()) + .col(ColumnDef::new(Table::Pk).json().not_null()) + .col(ColumnDef::new(Table::DistributionKey).json().not_null()) + 
.col(ColumnDef::new(Table::StreamKey).json().not_null()) + .col(ColumnDef::new(Table::AppendOnly).boolean().not_null()) + .col(ColumnDef::new(Table::Properties).json().not_null()) + .col(ColumnDef::new(Table::FragmentId).integer().not_null()) + .col(ColumnDef::new(Table::VnodeColIndex).integer()) + .col(ColumnDef::new(Table::RowIdIndex).integer()) + .col(ColumnDef::new(Table::ValueIndices).json().not_null()) + .col(ColumnDef::new(Table::Definition).string().not_null()) + .col( + ColumnDef::new(Table::HandlePkConflictBehavior) + .string() + .not_null(), + ) + .col( + ColumnDef::new(Table::ReadPrefixLenHint) + .integer() + .not_null(), + ) + .col(ColumnDef::new(Table::WatermarkIndices).json().not_null()) + .col(ColumnDef::new(Table::DistKeyInPk).json().not_null()) + .col(ColumnDef::new(Table::DmlFragmentId).integer()) + .col(ColumnDef::new(Table::Cardinality).json()) + .col( + ColumnDef::new(Table::CleanedByWatermark) + .boolean() + .not_null(), + ) + .col(ColumnDef::new(Table::JobStatus).string().not_null()) + .col(ColumnDef::new(Table::CreateType).string().not_null()) + .col(ColumnDef::new(Table::Version).json().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_table_object_id") + .from(Table::Table, Table::TableId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_table_fragment_id") + .from(Table::Table, Table::FragmentId) + .to(Fragment::Table, Fragment::FragmentId) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_table_dml_fragment_id") + .from(Table::Table, Table::DmlFragmentId) + .to(Fragment::Table, Fragment::FragmentId) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_table_optional_associated_source_id") + .from(Table::Table, Table::OptionalAssociatedSourceId) + .to(Source::Table, Source::SourceId) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Sink::Table) + .col(ColumnDef::new(Sink::SinkId).integer().primary_key()) + .col(ColumnDef::new(Sink::Name).string().not_null()) + .col(ColumnDef::new(Sink::Columns).json().not_null()) + .col(ColumnDef::new(Sink::PlanPk).json().not_null()) + .col(ColumnDef::new(Sink::DistributionKey).json().not_null()) + .col(ColumnDef::new(Sink::DownstreamPk).json().not_null()) + .col(ColumnDef::new(Sink::SinkType).string().not_null()) + .col(ColumnDef::new(Sink::Properties).json().not_null()) + .col(ColumnDef::new(Sink::Definition).string().not_null()) + .col(ColumnDef::new(Sink::ConnectionId).integer()) + .col(ColumnDef::new(Sink::DbName).string().not_null()) + .col(ColumnDef::new(Sink::SinkFromName).string().not_null()) + .col(ColumnDef::new(Sink::SinkFormatDesc).json()) + .col(ColumnDef::new(Sink::JobStatus).string().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_sink_object_id") + .from(Sink::Table, Sink::SinkId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_sink_connection_id") + .from(Sink::Table, Sink::ConnectionId) + .to(Connection::Table, Connection::ConnectionId) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(View::Table) + .col(ColumnDef::new(View::ViewId).integer().primary_key()) + .col(ColumnDef::new(View::Name).string().not_null()) + .col(ColumnDef::new(View::Properties).json().not_null()) + 
.col(ColumnDef::new(View::Definition).string().not_null()) + .col(ColumnDef::new(View::Columns).json().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_view_object_id") + .from(View::Table, View::ViewId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Index::Table) + .col(ColumnDef::new(Index::IndexId).integer().primary_key()) + .col(ColumnDef::new(Index::Name).string().not_null()) + .col(ColumnDef::new(Index::IndexTableId).integer().not_null()) + .col(ColumnDef::new(Index::PrimaryTableId).integer().not_null()) + .col(ColumnDef::new(Index::IndexItems).json().not_null()) + .col(ColumnDef::new(Index::OriginalColumns).json().not_null()) + .col(ColumnDef::new(Index::JobStatus).string().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_index_object_id") + .from(Index::Table, Index::IndexId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_index_index_table_id") + .from(Index::Table, Index::IndexTableId) + .to(Table::Table, Table::TableId) + .to_owned(), + ) + .foreign_key( + &mut ForeignKey::create() + .name("FK_index_primary_table_id") + .from(Index::Table, Index::PrimaryTableId) + .to(Table::Table, Table::TableId) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(Function::Table) + .col(ColumnDef::new(Function::FunctionId).integer().primary_key()) + .col(ColumnDef::new(Function::Name).string().not_null()) + .col(ColumnDef::new(Function::ArgTypes).json().not_null()) + .col(ColumnDef::new(Function::ReturnType).json().not_null()) + .col(ColumnDef::new(Function::Language).string().not_null()) + .col(ColumnDef::new(Function::Link).string().not_null()) + .col(ColumnDef::new(Function::Identifier).string().not_null()) + .col(ColumnDef::new(Function::Kind).string().not_null()) + .foreign_key( + &mut ForeignKey::create() + .name("FK_function_object_id") + .from(Function::Table, Function::FunctionId) + .to(Object::Table, Object::Oid) + .on_delete(ForeignKeyAction::Cascade) + .to_owned(), + ) + .to_owned(), + ) + .await?; + manager + .create_table( + MigrationTable::create() + .table(SystemParameter::Table) + .col( + ColumnDef::new(SystemParameter::Name) + .string() + .primary_key() + .not_null(), + ) + .col(ColumnDef::new(SystemParameter::Value).string().not_null()) + .col( + ColumnDef::new(SystemParameter::IsMutable) + .boolean() + .not_null(), + ) + .col(ColumnDef::new(SystemParameter::Description).string()) + .to_owned(), + ) + .await?; + + // 3. create indexes. + manager + .create_index( + MigrationIndex::create() + .table(Worker::Table) + .name("idx_worker_host_port") + .unique() + .col(Worker::Host) + .col(Worker::Port) + .to_owned(), + ) + .await?; + + // 4. initialize data. 
+ let insert_cluster_id = Query::insert() + .into_table(Cluster::Table) + .columns([Cluster::ClusterId]) + .values_panic([uuid::Uuid::new_v4().into()]) + .to_owned(); + let insert_sys_users = Query::insert() + .into_table(User::Table) + .columns([ + User::Name, + User::IsSuper, + User::CanCreateUser, + User::CanCreateDb, + User::CanLogin, + ]) + .values_panic([ + "root".into(), + true.into(), + true.into(), + true.into(), + true.into(), + ]) + .values_panic([ + "postgres".into(), + true.into(), + true.into(), + true.into(), + true.into(), + ]) + .to_owned(); + + // Since User table is newly created, we assume that the initial user id of `root` is 1 and `postgres` is 2. + let insert_objects = Query::insert() + .into_table(Object::Table) + .columns([Object::ObjType, Object::OwnerId, Object::DatabaseId]) + .values_panic(["DATABASE".into(), 1.into(), None::.into()]) + .values_panic(["SCHEMA".into(), 1.into(), 1.into()]) // public + .values_panic(["SCHEMA".into(), 1.into(), 1.into()]) // pg_catalog + .values_panic(["SCHEMA".into(), 1.into(), 1.into()]) // information_schema + .values_panic(["SCHEMA".into(), 1.into(), 1.into()]) // rw_catalog + .to_owned(); + + // Since all tables are newly created, we assume that the initial object id of `dev` is 1 and the schemas' ids are 2, 3, 4, 5. + let insert_sys_database = Query::insert() + .into_table(Database::Table) + .columns([Database::DatabaseId, Database::Name]) + .values_panic([1.into(), "dev".into()]) + .to_owned(); + let insert_sys_schemas = Query::insert() + .into_table(Schema::Table) + .columns([Schema::SchemaId, Schema::Name]) + .values_panic([2.into(), "public".into()]) + .values_panic([3.into(), "pg_catalog".into()]) + .values_panic([4.into(), "information_schema".into()]) + .values_panic([5.into(), "rw_catalog".into()]) + .to_owned(); + + manager.exec_stmt(insert_cluster_id).await?; + manager.exec_stmt(insert_sys_users).await?; + manager.exec_stmt(insert_objects).await?; + manager.exec_stmt(insert_sys_database).await?; + manager.exec_stmt(insert_sys_schemas).await?; + + Ok(()) + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + macro_rules! drop_tables { + ($manager:expr, $( $table:ident ),+) => { + $( + $manager + .drop_table( + MigrationTable::drop() + .table($table::Table) + .if_exists() + .cascade() + .to_owned(), + ) + .await?; + )+ + }; + } + + // drop tables cascade. 
+ drop_tables!( + manager, + Cluster, + Worker, + WorkerProperty, + User, + UserPrivilege, + Database, + Schema, + Fragment, + Actor, + Table, + Source, + Sink, + Connection, + View, + Index, + Function, + Object, + ObjectDependency, + SystemParameter + ); + Ok(()) + } +} + +#[derive(DeriveIden)] +enum Cluster { + Table, + ClusterId, + CreatedAt, +} + +#[derive(DeriveIden)] +enum Worker { + Table, + WorkerId, + WorkerType, + Host, + Port, + TransactionId, + Status, +} + +#[derive(DeriveIden)] +enum WorkerProperty { + Table, + WorkerId, + ParallelUnitIds, + IsStreaming, + IsServing, + IsUnschedulable, +} + +#[derive(DeriveIden)] +enum User { + Table, + UserId, + Name, + IsSuper, + CanCreateDb, + CanCreateUser, + CanLogin, + AuthType, + AuthValue, +} + +#[derive(DeriveIden)] +enum UserPrivilege { + Table, + Id, + UserId, + Oid, + GrantedBy, + Actions, + WithGrantOption, +} + +#[derive(DeriveIden)] +enum Database { + Table, + DatabaseId, + Name, +} + +#[derive(DeriveIden)] +enum Schema { + Table, + SchemaId, + Name, +} + +#[derive(DeriveIden)] +enum Fragment { + Table, + FragmentId, + TableId, + FragmentTypeMask, + DistributionType, + StreamNode, + VnodeMapping, + StateTableIds, + UpstreamFragmentId, + DispatcherType, + DistKeyIndices, + OutputIndices, +} + +#[derive(DeriveIden)] +enum Actor { + Table, + ActorId, + FragmentId, + Status, + Splits, + ParallelUnitId, + UpstreamActorIds, + Dispatchers, + VnodeBitmap, +} + +#[derive(DeriveIden)] +#[allow(clippy::enum_variant_names)] +enum Table { + Table, + TableId, + Name, + OptionalAssociatedSourceId, + TableType, + Columns, + Pk, + DistributionKey, + StreamKey, + AppendOnly, + Properties, + FragmentId, + VnodeColIndex, + RowIdIndex, + ValueIndices, + Definition, + HandlePkConflictBehavior, + ReadPrefixLenHint, + WatermarkIndices, + DistKeyInPk, + DmlFragmentId, + Cardinality, + CleanedByWatermark, + JobStatus, + CreateType, + Version, +} + +#[derive(DeriveIden)] +enum Source { + Table, + SourceId, + Name, + RowIdIndex, + Columns, + PkColumnIds, + Properties, + Definition, + SourceInfo, + WatermarkDescs, + OptionalAssociatedTableId, + ConnectionId, + Version, +} + +#[derive(DeriveIden)] +enum Sink { + Table, + SinkId, + Name, + Columns, + PlanPk, + DistributionKey, + DownstreamPk, + SinkType, + Properties, + Definition, + ConnectionId, + DbName, + SinkFromName, + SinkFormatDesc, + JobStatus, +} + +#[derive(DeriveIden)] +enum Connection { + Table, + ConnectionId, + Name, + Info, +} + +#[derive(DeriveIden)] +enum View { + Table, + ViewId, + Name, + Properties, + Definition, + Columns, +} + +#[derive(DeriveIden)] +enum Index { + Table, + IndexId, + Name, + IndexTableId, + PrimaryTableId, + IndexItems, + OriginalColumns, + JobStatus, +} + +#[derive(DeriveIden)] +enum Function { + Table, + FunctionId, + Name, + ArgTypes, + ReturnType, + Language, + Link, + Identifier, + Kind, +} + +#[derive(DeriveIden)] +enum Object { + Table, + Oid, + ObjType, + OwnerId, + SchemaId, + DatabaseId, + InitializedAt, + CreatedAt, +} + +#[derive(DeriveIden)] +enum ObjectDependency { + Table, + Id, + Oid, + UsedBy, +} + +#[derive(DeriveIden)] +enum SystemParameter { + Table, + Name, + Value, + IsMutable, + Description, +} diff --git a/src/meta/src/model_v2/migration/src/m20231008_020431_hummock.rs b/src/meta/src/model_v2/migration/src/m20231008_020431_hummock.rs new file mode 100644 index 0000000000000..ab01980990f34 --- /dev/null +++ b/src/meta/src/model_v2/migration/src/m20231008_020431_hummock.rs @@ -0,0 +1,264 @@ +use sea_orm_migration::prelude::*; + 
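The init migration above creates the whole baseline schema; per the model_v2 README earlier in this diff, later schema changes (like the hummock tables below) go into separate migration files rather than edits to this one. As a rough illustration only, not part of this PR, a follow-up migration that adds a column could look like the sketch below; the `worker.labels` column is invented purely for the example:

```rust
use sea_orm_migration::prelude::*;

#[derive(DeriveMigrationName)]
pub struct AddWorkerLabels;

#[async_trait::async_trait]
impl MigrationTrait for AddWorkerLabels {
    async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
        // Add a nullable json column to the existing `worker` table.
        manager
            .alter_table(
                Table::alter()
                    .table(Alias::new("worker"))
                    .add_column(ColumnDef::new(Alias::new("labels")).json())
                    .to_owned(),
            )
            .await
    }

    async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> {
        // Revert by dropping the column again.
        manager
            .alter_table(
                Table::alter()
                    .table(Alias::new("worker"))
                    .drop_column(Alias::new("labels"))
                    .to_owned(),
            )
            .await
    }
}
```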
+#[derive(DeriveMigrationName)] +pub struct Migration; + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + macro_rules! assert_not_has_tables { + ($manager:expr, $( $table:ident ),+) => { + $( + assert!( + !$manager + .has_table($table::Table.to_string()) + .await? + ); + )+ + }; + } + assert_not_has_tables!( + manager, + CompactionTask, + CompactionConfig, + CompactionStatus, + HummockPinnedVersion, + HummockPinnedSnapshot, + HummockVersionDelta, + HummockVersionStats + ); + + manager + .create_table( + Table::create() + .table(CompactionTask::Table) + .col( + ColumnDef::new(CompactionTask::Id) + .big_integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(CompactionTask::Task) + .json_binary() + .not_null(), + ) + .col( + ColumnDef::new(CompactionTask::ContextId) + .integer() + .not_null(), + ) + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(CompactionConfig::Table) + .col( + ColumnDef::new(CompactionConfig::CompactionGroupId) + .big_integer() + .not_null() + .primary_key(), + ) + .col(ColumnDef::new(CompactionConfig::Config).json_binary()) + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(CompactionStatus::Table) + .col( + ColumnDef::new(CompactionStatus::CompactionGroupId) + .big_integer() + .not_null() + .primary_key(), + ) + .col(ColumnDef::new(CompactionStatus::Status).json_binary()) + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockPinnedVersion::Table) + .col( + ColumnDef::new(HummockPinnedVersion::ContextId) + .integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(HummockPinnedVersion::MinPinnedId) + .big_integer() + .not_null(), + ) + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockPinnedSnapshot::Table) + .col( + ColumnDef::new(HummockPinnedSnapshot::ContextId) + .integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(HummockPinnedSnapshot::MinPinnedSnapshot) + .big_integer() + .not_null(), + ) + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockVersionDelta::Table) + .col( + ColumnDef::new(HummockVersionDelta::Id) + .big_integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(HummockVersionDelta::PrevId) + .big_integer() + .not_null(), + ) + .col(ColumnDef::new(HummockVersionDelta::GroupDeltas).json_binary()) + .col( + ColumnDef::new(HummockVersionDelta::MaxCommittedEpoch) + .big_integer() + .not_null(), + ) + .col( + ColumnDef::new(HummockVersionDelta::SafeEpoch) + .big_integer() + .not_null(), + ) + .col( + ColumnDef::new(HummockVersionDelta::TrivialMove) + .boolean() + .not_null(), + ) + .col(ColumnDef::new(HummockVersionDelta::GcObjectIds).json_binary()) + .to_owned(), + ) + .await?; + + manager + .create_table( + Table::create() + .table(HummockVersionStats::Table) + .col( + ColumnDef::new(HummockVersionStats::Id) + .big_integer() + .not_null() + .primary_key(), + ) + .col( + ColumnDef::new(HummockVersionStats::Stats) + .json_binary() + .not_null(), + ) + .to_owned(), + ) + .await?; + + Ok(()) + } + + async fn down(&self, manager: &SchemaManager) -> Result<(), DbErr> { + macro_rules! 
drop_tables { + ($manager:expr, $( $table:ident ),+) => { + $( + $manager + .drop_table( + Table::drop() + .table($table::Table) + .if_exists() + .cascade() + .to_owned(), + ) + .await?; + )+ + }; + } + drop_tables!( + manager, + CompactionTask, + CompactionConfig, + CompactionStatus, + HummockPinnedVersion, + HummockPinnedSnapshot, + HummockVersionDelta, + HummockVersionStats + ); + Ok(()) + } +} + +#[derive(DeriveIden)] +enum CompactionTask { + Table, + Id, + Task, + ContextId, +} + +#[derive(DeriveIden)] +enum CompactionConfig { + Table, + CompactionGroupId, + Config, +} + +#[derive(DeriveIden)] +enum CompactionStatus { + Table, + CompactionGroupId, + Status, +} + +#[derive(DeriveIden)] +enum HummockPinnedVersion { + Table, + ContextId, + MinPinnedId, +} + +#[derive(DeriveIden)] +enum HummockPinnedSnapshot { + Table, + ContextId, + MinPinnedSnapshot, +} + +#[derive(DeriveIden)] +enum HummockVersionDelta { + Table, + Id, + PrevId, + GroupDeltas, + MaxCommittedEpoch, + SafeEpoch, + TrivialMove, + GcObjectIds, +} + +#[derive(DeriveIden)] +enum HummockVersionStats { + Table, + Id, + Stats, +} diff --git a/src/meta/src/model_v2/migration/src/main.rs b/src/meta/src/model_v2/migration/src/main.rs new file mode 100644 index 0000000000000..9354e45ecd198 --- /dev/null +++ b/src/meta/src/model_v2/migration/src/main.rs @@ -0,0 +1,6 @@ +use sea_orm_migration::prelude::*; + +#[async_std::main] +async fn main() { + cli::run_cli(model_migration::Migrator).await; +} diff --git a/src/meta/src/model_v2/mod.rs b/src/meta/src/model_v2/mod.rs new file mode 100644 index 0000000000000..1c2f928063fff --- /dev/null +++ b/src/meta/src/model_v2/mod.rs @@ -0,0 +1,136 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
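Besides the CLI entry point in `main.rs` above, the same `Migrator` can be driven programmatically through `MigratorTrait`, which is handy if a service wants to apply pending migrations at startup. A minimal sketch, not part of this diff, with a placeholder connection string:

```rust
use model_migration::{Migrator, MigratorTrait};
use sea_orm::{Database, DbErr};

// Illustration only: connect and apply all pending migrations.
async fn migrate_example(url: &str) -> Result<(), DbErr> {
    let db = Database::connect(url).await?;
    Migrator::up(&db, None).await
}
```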
+ +use std::collections::HashMap; + +use risingwave_pb::catalog::{PbCreateType, PbStreamJobStatus}; +use sea_orm::{DeriveActiveEnum, EnumIter, FromJsonQueryResult}; +use serde::{Deserialize, Serialize}; + +pub mod prelude; + +pub mod actor; +pub mod cluster; +pub mod compaction_config; +pub mod compaction_status; +pub mod compaction_task; +pub mod connection; +pub mod database; +pub mod ext; +pub mod fragment; +pub mod function; +pub mod hummock_pinned_snapshot; +pub mod hummock_pinned_version; +pub mod hummock_version_delta; +pub mod hummock_version_stats; +pub mod index; +pub mod object; +pub mod object_dependency; +pub mod schema; +pub mod sink; +pub mod source; +pub mod system_parameter; +pub mod table; +pub mod trx; +pub mod user; +pub mod user_privilege; +pub mod view; +pub mod worker; +pub mod worker_property; + +pub type WorkerId = u32; +pub type TransactionId = u32; + +pub type ObjectId = u32; +pub type DatabaseId = ObjectId; +pub type SchemaId = ObjectId; +pub type TableId = ObjectId; +pub type SourceId = ObjectId; +pub type SinkId = ObjectId; +pub type IndexId = ObjectId; +pub type ViewId = ObjectId; +pub type FunctionId = ObjectId; +pub type ConnectionId = ObjectId; +pub type UserId = u32; + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum JobStatus { + #[sea_orm(string_value = "CREATING")] + Creating, + #[sea_orm(string_value = "CREATED")] + Created, +} + +impl From for PbStreamJobStatus { + fn from(job_status: JobStatus) -> Self { + match job_status { + JobStatus::Creating => Self::Creating, + JobStatus::Created => Self::Created, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum CreateType { + #[sea_orm(string_value = "BACKGROUND")] + Background, + #[sea_orm(string_value = "FOREGROUND")] + Foreground, +} + +impl From for PbCreateType { + fn from(create_type: CreateType) -> Self { + match create_type { + CreateType::Background => Self::Background, + CreateType::Foreground => Self::Foreground, + } + } +} + +/// Defines struct with a single pb field that derives `FromJsonQueryResult`, it will helps to map json value stored in database to Pb struct. +macro_rules! 
derive_from_json_struct { + ($struct_name:ident, $field_type:ty) => { + #[derive(Clone, Debug, PartialEq, FromJsonQueryResult, Serialize, Deserialize, Default)] + pub struct $struct_name(pub $field_type); + impl Eq for $struct_name {} + }; +} + +derive_from_json_struct!(I32Array, Vec); +derive_from_json_struct!(DataType, risingwave_pb::data::DataType); +derive_from_json_struct!(DataTypeArray, Vec); +derive_from_json_struct!(FieldArray, Vec); +derive_from_json_struct!(Property, HashMap); +derive_from_json_struct!(ColumnCatalog, risingwave_pb::plan_common::PbColumnCatalog); +derive_from_json_struct!( + ColumnCatalogArray, + Vec +); +derive_from_json_struct!(StreamSourceInfo, risingwave_pb::catalog::PbStreamSourceInfo); +derive_from_json_struct!(WatermarkDesc, risingwave_pb::catalog::PbWatermarkDesc); +derive_from_json_struct!( + WatermarkDescArray, + Vec +); +derive_from_json_struct!(ExprNodeArray, Vec); +derive_from_json_struct!(ColumnOrderArray, Vec); +derive_from_json_struct!(SinkFormatDesc, risingwave_pb::catalog::PbSinkFormatDesc); +derive_from_json_struct!(Cardinality, risingwave_pb::plan_common::PbCardinality); +derive_from_json_struct!(TableVersion, risingwave_pb::catalog::table::PbTableVersion); +derive_from_json_struct!( + PrivateLinkService, + risingwave_pb::catalog::connection::PbPrivateLinkService +); diff --git a/src/meta/src/model_v2/object.rs b/src/meta/src/model_v2/object.rs new file mode 100644 index 0000000000000..5048f93a483d9 --- /dev/null +++ b/src/meta/src/model_v2/object.rs @@ -0,0 +1,193 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
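The `derive_from_json_struct!` macro above works because the generated newtype round-trips through serde. A small self-contained sketch of that round trip, assuming `I32Array` wraps a `Vec<i32>` as its name suggests:

```rust
// Sketch of the serde round trip that `FromJsonQueryResult` relies on for JSON columns.
use serde::{Deserialize, Serialize};

#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
struct I32Array(pub Vec<i32>);

fn main() {
    let stream_key = I32Array(vec![0, 2, 5]);
    // Writing the JSON column serializes the wrapper; reading it back deserializes it.
    let json = serde_json::to_string(&stream_key).unwrap();
    let back: I32Array = serde_json::from_str(&json).unwrap();
    assert_eq!(stream_key, back);
}
```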
+ +use sea_orm::entity::prelude::*; + +use crate::model_v2::{DatabaseId, ObjectId, SchemaId, UserId}; + +#[derive(Clone, Debug, PartialEq, Eq, Copy, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum ObjectType { + #[sea_orm(string_value = "DATABASE")] + Database, + #[sea_orm(string_value = "SCHEMA")] + Schema, + #[sea_orm(string_value = "TABLE")] + Table, + #[sea_orm(string_value = "SOURCE")] + Source, + #[sea_orm(string_value = "SINK")] + Sink, + #[sea_orm(string_value = "VIEW")] + View, + #[sea_orm(string_value = "INDEX")] + Index, + #[sea_orm(string_value = "FUNCTION")] + Function, + #[sea_orm(string_value = "CONNECTION")] + Connection, +} + +impl ObjectType { + pub fn as_str(&self) -> &'static str { + match self { + ObjectType::Database => "database", + ObjectType::Schema => "schema", + ObjectType::Table => "table", + ObjectType::Source => "source", + ObjectType::Sink => "sink", + ObjectType::View => "view", + ObjectType::Index => "index", + ObjectType::Function => "function", + ObjectType::Connection => "connection", + } + } +} + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "object")] +pub struct Model { + #[sea_orm(primary_key)] + pub oid: ObjectId, + pub obj_type: ObjectType, + pub owner_id: UserId, + pub schema_id: Option, + pub database_id: Option, + pub initialized_at: DateTime, + pub created_at: DateTime, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm(has_many = "super::connection::Entity")] + Connection, + #[sea_orm(has_many = "super::database::Entity")] + Database, + #[sea_orm(has_many = "super::fragment::Entity")] + Fragment, + #[sea_orm(has_many = "super::function::Entity")] + Function, + #[sea_orm(has_many = "super::index::Entity")] + Index, + #[sea_orm( + belongs_to = "Entity", + from = "Column::DatabaseId", + to = "Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + SelfRef2, + #[sea_orm( + belongs_to = "Entity", + from = "Column::SchemaId", + to = "Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + SelfRef1, + #[sea_orm(has_many = "super::schema::Entity")] + Schema, + #[sea_orm(has_many = "super::sink::Entity")] + Sink, + #[sea_orm(has_many = "super::source::Entity")] + Source, + #[sea_orm(has_many = "super::table::Entity")] + Table, + #[sea_orm( + belongs_to = "super::user::Entity", + from = "Column::OwnerId", + to = "super::user::Column::UserId", + on_update = "NoAction", + on_delete = "Cascade" + )] + User, + #[sea_orm(has_many = "super::user_privilege::Entity")] + UserPrivilege, + #[sea_orm(has_many = "super::view::Entity")] + View, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Connection.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Database.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Fragment.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Function.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Index.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Schema.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Sink.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Source.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Table.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::User.def() + 
} +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::UserPrivilege.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::View.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/object_dependency.rs b/src/meta/src/model_v2/object_dependency.rs new file mode 100644 index 0000000000000..53800112a7370 --- /dev/null +++ b/src/meta/src/model_v2/object_dependency.rs @@ -0,0 +1,48 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::{ObjectId, UserId}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "object_dependency")] +pub struct Model { + #[sea_orm(primary_key)] + pub id: i32, + pub oid: ObjectId, + pub used_by: UserId, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::Oid", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object2, + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::UsedBy", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object1, +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/prelude.rs b/src/meta/src/model_v2/prelude.rs new file mode 100644 index 0000000000000..ab9670f712f04 --- /dev/null +++ b/src/meta/src/model_v2/prelude.rs @@ -0,0 +1,40 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
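As a hedged illustration of how the `object` entity above is meant to be queried; the connection handle and filter values are assumptions, not code from this PR.

```rust
// Hypothetical query: list all table objects owned by a given user.
use sea_orm::{ColumnTrait, DatabaseConnection, DbErr, EntityTrait, QueryFilter};

use crate::model_v2::object::{Column, Entity as Object, Model, ObjectType};

async fn tables_owned_by(db: &DatabaseConnection, owner_id: u32) -> Result<Vec<Model>, DbErr> {
    Object::find()
        .filter(Column::ObjType.eq(ObjectType::Table))
        .filter(Column::OwnerId.eq(owner_id))
        .all(db)
        .await
}
```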
+ +pub use super::actor::Entity as Actor; +pub use super::cluster::Entity as Cluster; +pub use super::compaction_config::Entity as CompactionConfig; +pub use super::compaction_status::Entity as CompactionStatus; +pub use super::compaction_task::Entity as CompactionTask; +pub use super::connection::Entity as Connection; +pub use super::database::Entity as Database; +pub use super::fragment::Entity as Fragment; +pub use super::function::Entity as Function; +pub use super::hummock_pinned_snapshot::Entity as HummockPinnedSnapshot; +pub use super::hummock_pinned_version::Entity as HummockPinnedVersion; +pub use super::hummock_version_delta::Entity as HummockVersionDelta; +pub use super::hummock_version_stats::Entity as HummockVersionStats; +pub use super::index::Entity as Index; +pub use super::object::Entity as Object; +pub use super::object_dependency::Entity as ObjectDependency; +pub use super::schema::Entity as Schema; +pub use super::sink::Entity as Sink; +pub use super::source::Entity as Source; +pub use super::system_parameter::Entity as SystemParameter; +pub use super::table::Entity as Table; +pub use super::user::Entity as User; +pub use super::user_privilege::Entity as UserPrivilege; +pub use super::view::Entity as View; +pub use super::worker::Entity as Worker; +pub use super::worker_property::Entity as WorkerProperty; diff --git a/src/meta/src/model_v2/schema.rs b/src/meta/src/model_v2/schema.rs new file mode 100644 index 0000000000000..2c28665fd06f0 --- /dev/null +++ b/src/meta/src/model_v2/schema.rs @@ -0,0 +1,45 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::SchemaId; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "schema")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub schema_id: SchemaId, + pub name: String, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::SchemaId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/sink.rs b/src/meta/src/model_v2/sink.rs new file mode 100644 index 0000000000000..bef46f1d7195f --- /dev/null +++ b/src/meta/src/model_v2/sink.rs @@ -0,0 +1,96 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use risingwave_pb::catalog::PbSinkType; +use sea_orm::entity::prelude::*; + +use crate::model_v2::{ + ColumnCatalogArray, ColumnOrderArray, ConnectionId, I32Array, JobStatus, Property, + SinkFormatDesc, SinkId, +}; + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum SinkType { + #[sea_orm(string_value = "APPEND_ONLY")] + AppendOnly, + #[sea_orm(string_value = "FORCE_APPEND_ONLY")] + ForceAppendOnly, + #[sea_orm(string_value = "UPSERT")] + Upsert, +} + +impl From for PbSinkType { + fn from(sink_type: SinkType) -> Self { + match sink_type { + SinkType::AppendOnly => Self::AppendOnly, + SinkType::ForceAppendOnly => Self::ForceAppendOnly, + SinkType::Upsert => Self::Upsert, + } + } +} + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "sink")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub sink_id: SinkId, + pub name: String, + pub columns: ColumnCatalogArray, + pub plan_pk: ColumnOrderArray, + pub distribution_key: I32Array, + pub downstream_pk: I32Array, + pub sink_type: SinkType, + pub properties: Property, + pub definition: String, + pub connection_id: Option, + pub db_name: String, + pub sink_from_name: String, + pub sink_format_desc: Option, + pub job_status: JobStatus, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::connection::Entity", + from = "Column::ConnectionId", + to = "super::connection::Column::ConnectionId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Connection, + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::SinkId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Connection.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/source.rs b/src/meta/src/model_v2/source.rs new file mode 100644 index 0000000000000..2ad1de7914d96 --- /dev/null +++ b/src/meta/src/model_v2/source.rs @@ -0,0 +1,80 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
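Because the sink entity above declares `Relation::Connection.def()`, sea-orm can join sinks with their optional connections in a single select. A sketch under that assumption (the connection handle is illustrative):

```rust
// Hypothetical join: fetch every sink together with its connection, if any.
use sea_orm::{DatabaseConnection, DbErr, EntityTrait};

use crate::model_v2::prelude::{Connection, Sink};
use crate::model_v2::{connection, sink};

async fn sinks_with_connections(
    db: &DatabaseConnection,
) -> Result<Vec<(sink::Model, Option<connection::Model>)>, DbErr> {
    Sink::find().find_also_related(Connection).all(db).await
}
```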
+ +use sea_orm::entity::prelude::*; + +use crate::model_v2::{ + ColumnCatalogArray, ConnectionId, I32Array, Property, SourceId, StreamSourceInfo, TableId, + WatermarkDescArray, +}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "source")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub source_id: SourceId, + pub name: String, + pub row_id_index: Option, + pub columns: ColumnCatalogArray, + pub pk_column_ids: I32Array, + pub properties: Property, + pub definition: String, + pub source_info: Option, + pub watermark_descs: WatermarkDescArray, + pub optional_associated_table_id: Option, + pub connection_id: Option, + pub version: u64, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::connection::Entity", + from = "Column::ConnectionId", + to = "super::connection::Column::ConnectionId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Connection, + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::SourceId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, + #[sea_orm(has_many = "super::table::Entity")] + Table, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Connection.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Table.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/system_parameter.rs b/src/meta/src/model_v2/system_parameter.rs new file mode 100644 index 0000000000000..366c3f743187b --- /dev/null +++ b/src/meta/src/model_v2/system_parameter.rs @@ -0,0 +1,30 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "system_parameter")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub name: String, + pub value: String, + pub is_mutable: bool, + pub description: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/table.rs b/src/meta/src/model_v2/table.rs new file mode 100644 index 0000000000000..08caee7009f8f --- /dev/null +++ b/src/meta/src/model_v2/table.rs @@ -0,0 +1,148 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use risingwave_pb::catalog::table::PbTableType; +use risingwave_pb::catalog::PbHandleConflictBehavior; +use sea_orm::entity::prelude::*; + +use crate::model_v2::{ + Cardinality, ColumnCatalogArray, ColumnOrderArray, CreateType, I32Array, JobStatus, Property, + SourceId, TableId, TableVersion, +}; + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum TableType { + #[sea_orm(string_value = "TABLE")] + Table, + #[sea_orm(string_value = "MATERIALIZED_VIEW")] + MaterializedView, + #[sea_orm(string_value = "INDEX")] + Index, + #[sea_orm(string_value = "INTERNAL")] + Internal, +} + +impl From for PbTableType { + fn from(table_type: TableType) -> Self { + match table_type { + TableType::Table => Self::Table, + TableType::MaterializedView => Self::MaterializedView, + TableType::Index => Self::Index, + TableType::Internal => Self::Internal, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum HandleConflictBehavior { + #[sea_orm(string_value = "OVERWRITE")] + Overwrite, + #[sea_orm(string_value = "IGNORE")] + Ignore, + #[sea_orm(string_value = "NO_CHECK")] + NoCheck, +} + +impl From for PbHandleConflictBehavior { + fn from(handle_conflict_behavior: HandleConflictBehavior) -> Self { + match handle_conflict_behavior { + HandleConflictBehavior::Overwrite => Self::Overwrite, + HandleConflictBehavior::Ignore => Self::Ignore, + HandleConflictBehavior::NoCheck => Self::NoCheck, + } + } +} + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "table")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub table_id: TableId, + pub name: String, + pub optional_associated_source_id: Option, + pub table_type: TableType, + pub columns: ColumnCatalogArray, + pub pk: ColumnOrderArray, + pub distribution_key: I32Array, + pub stream_key: I32Array, + pub append_only: bool, + pub properties: Property, + pub fragment_id: i32, + pub vnode_col_index: Option, + pub row_id_index: Option, + pub value_indices: I32Array, + pub definition: String, + pub handle_pk_conflict_behavior: HandleConflictBehavior, + pub read_prefix_len_hint: u32, + pub watermark_indices: I32Array, + pub dist_key_in_pk: I32Array, + pub dml_fragment_id: Option, + pub cardinality: Option, + pub cleaned_by_watermark: bool, + pub job_status: JobStatus, + pub create_type: CreateType, + pub version: TableVersion, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::fragment::Entity", + from = "Column::DmlFragmentId", + to = "super::fragment::Column::FragmentId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Fragment2, + #[sea_orm( + belongs_to = "super::fragment::Entity", + from = "Column::FragmentId", + to = "super::fragment::Column::FragmentId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Fragment1, + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::TableId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, + #[sea_orm( + belongs_to = "super::source::Entity", + from = "Column::OptionalAssociatedSourceId", + to = "super::source::Column::SourceId", + on_update = "NoAction", + on_delete = "NoAction" + )] + Source, +} + +impl Related for Entity { + fn to() -> 
RelationDef { + Relation::Object.def() + } +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Source.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/trx.rs b/src/meta/src/model_v2/trx.rs new file mode 100644 index 0000000000000..4bfe6d0261de4 --- /dev/null +++ b/src/meta/src/model_v2/trx.rs @@ -0,0 +1,276 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub type Transaction = sea_orm::DatabaseTransaction; + +#[cfg(not(madsim))] +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use risingwave_pb::hummock::HummockPinnedVersion; + use sea_orm::{EntityTrait, TransactionTrait}; + + use crate::controller::SqlMetaStore; + use crate::model::{BTreeMapTransaction, ValTransaction, VarTransaction}; + use crate::model_v2::hummock_pinned_version::Model as HummockPinnedVersionModel; + use crate::model_v2::prelude::HummockPinnedVersion as HummockPinnedVersionEntity; + use crate::model_v2::trx::Transaction; + + #[tokio::test] + async fn test_simple_var_transaction_commit() { + let store = SqlMetaStore::for_test().await; + let db = &store.conn; + let mut kv = HummockPinnedVersion { + context_id: 1, + min_pinned_id: 2, + }; + let mut num_txn = VarTransaction::<'_, Transaction, _>::new(&mut kv); + num_txn.min_pinned_id = 3; + assert_eq!(num_txn.min_pinned_id, 3); + let mut txn = db.begin().await.unwrap(); + num_txn.apply_to_txn(&mut txn).await.unwrap(); + txn.commit().await.unwrap(); + let db_val = HummockPinnedVersionEntity::find_by_id(1) + .one(db) + .await + .unwrap() + .unwrap(); + assert_eq!(db_val.min_pinned_id, 3); + num_txn.commit(); + assert_eq!(kv.min_pinned_id, 3); + } + + #[test] + fn test_simple_var_transaction_abort() { + let mut kv = HummockPinnedVersion { + context_id: 1, + min_pinned_id: 11, + }; + let mut num_txn = VarTransaction::<'_, Transaction, _>::new(&mut kv); + num_txn.min_pinned_id = 2; + num_txn.abort(); + assert_eq!(11, kv.min_pinned_id); + } + + #[tokio::test] + async fn test_tree_map_transaction_commit() { + let mut map: BTreeMap = BTreeMap::new(); + // to remove + map.insert( + 1, + HummockPinnedVersion { + context_id: 1, + min_pinned_id: 11, + }, + ); + // to-remove-after-modify + map.insert( + 2, + HummockPinnedVersion { + context_id: 2, + min_pinned_id: 22, + }, + ); + // first + map.insert( + 3, + HummockPinnedVersion { + context_id: 3, + min_pinned_id: 33, + }, + ); + + let mut map_copy = map.clone(); + let mut map_txn = BTreeMapTransaction::new(&mut map); + map_txn.remove(1); + map_txn.insert( + 2, + HummockPinnedVersion { + context_id: 2, + min_pinned_id: 0, + }, + ); + map_txn.remove(2); + // first + map_txn.insert( + 3, + HummockPinnedVersion { + context_id: 3, + min_pinned_id: 333, + }, + ); + // second + map_txn.insert( + 4, + HummockPinnedVersion { + context_id: 4, + min_pinned_id: 44, + }, + ); + assert_eq!( + &HummockPinnedVersion { + context_id: 4, + min_pinned_id: 44 + }, + map_txn.get(&4).unwrap() + ); + // third + map_txn.insert( + 
5, + HummockPinnedVersion { + context_id: 5, + min_pinned_id: 55, + }, + ); + assert_eq!( + &HummockPinnedVersion { + context_id: 5, + min_pinned_id: 55 + }, + map_txn.get(&5).unwrap() + ); + + let mut third_entry = map_txn.get_mut(5).unwrap(); + third_entry.min_pinned_id = 555; + assert_eq!( + &HummockPinnedVersion { + context_id: 5, + min_pinned_id: 555 + }, + map_txn.get(&5).unwrap() + ); + + let store = SqlMetaStore::for_test().await; + let db = &store.conn; + let mut txn = db.begin().await.unwrap(); + map_txn.apply_to_txn(&mut txn).await.unwrap(); + txn.commit().await.unwrap(); + + let db_rows: Vec = + HummockPinnedVersionEntity::find().all(db).await.unwrap(); + assert_eq!(db_rows.len(), 3); + assert_eq!( + 1, + db_rows + .iter() + .filter(|m| m.context_id == 3 && m.min_pinned_id == 333) + .count() + ); + assert_eq!( + 1, + db_rows + .iter() + .filter(|m| m.context_id == 4 && m.min_pinned_id == 44) + .count() + ); + assert_eq!( + 1, + db_rows + .iter() + .filter(|m| m.context_id == 5 && m.min_pinned_id == 555) + .count() + ); + map_txn.commit(); + + // replay the change to local copy and compare + map_copy.remove(&1).unwrap(); + map_copy.insert( + 2, + HummockPinnedVersion { + context_id: 2, + min_pinned_id: 22, + }, + ); + map_copy.remove(&2).unwrap(); + map_copy.insert( + 3, + HummockPinnedVersion { + context_id: 3, + min_pinned_id: 333, + }, + ); + map_copy.insert( + 4, + HummockPinnedVersion { + context_id: 4, + min_pinned_id: 44, + }, + ); + map_copy.insert( + 5, + HummockPinnedVersion { + context_id: 5, + min_pinned_id: 555, + }, + ); + assert_eq!(map_copy, map); + } + + #[tokio::test] + async fn test_tree_map_entry_update_transaction_commit() { + let mut map: BTreeMap = BTreeMap::new(); + map.insert( + 1, + HummockPinnedVersion { + context_id: 1, + min_pinned_id: 11, + }, + ); + + let mut map_txn = BTreeMapTransaction::new(&mut map); + let mut first_entry_txn = map_txn.new_entry_txn(1).unwrap(); + first_entry_txn.min_pinned_id = 111; + + let store = SqlMetaStore::for_test().await; + let db = &store.conn; + let mut txn = db.begin().await.unwrap(); + first_entry_txn.apply_to_txn(&mut txn).await.unwrap(); + txn.commit().await.unwrap(); + first_entry_txn.commit(); + + let db_rows: Vec = + HummockPinnedVersionEntity::find().all(db).await.unwrap(); + assert_eq!(db_rows.len(), 1); + assert_eq!( + 1, + db_rows + .iter() + .filter(|m| m.context_id == 1 && m.min_pinned_id == 111) + .count() + ); + assert_eq!(111, map.get(&1).unwrap().min_pinned_id); + } + + #[tokio::test] + async fn test_tree_map_entry_insert_transaction_commit() { + let mut map: BTreeMap = BTreeMap::new(); + + let mut map_txn = BTreeMapTransaction::new(&mut map); + let first_entry_txn = map_txn.new_entry_insert_txn( + 1, + HummockPinnedVersion { + context_id: 1, + min_pinned_id: 11, + }, + ); + let store = SqlMetaStore::for_test().await; + let db = &store.conn; + let mut txn = db.begin().await.unwrap(); + first_entry_txn.apply_to_txn(&mut txn).await.unwrap(); + txn.commit().await.unwrap(); + first_entry_txn.commit(); + assert_eq!(11, map.get(&1).unwrap().min_pinned_id); + } +} diff --git a/src/meta/src/model_v2/user.rs b/src/meta/src/model_v2/user.rs new file mode 100644 index 0000000000000..0e7ab4dd17876 --- /dev/null +++ b/src/meta/src/model_v2/user.rs @@ -0,0 +1,45 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::UserId; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "user")] +pub struct Model { + #[sea_orm(primary_key)] + pub user_id: UserId, + pub name: String, + pub is_super: bool, + pub can_create_db: bool, + pub can_create_user: bool, + pub can_login: bool, + pub auth_type: Option, + pub auth_value: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm(has_many = "super::object::Entity")] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/user_privilege.rs b/src/meta/src/model_v2/user_privilege.rs new file mode 100644 index 0000000000000..335f716cec1c8 --- /dev/null +++ b/src/meta/src/model_v2/user_privilege.rs @@ -0,0 +1,65 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use sea_orm::entity::prelude::*; + +use crate::model_v2::{ObjectId, UserId}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "user_privilege")] +pub struct Model { + #[sea_orm(primary_key)] + pub id: i32, + pub user_id: UserId, + pub oid: ObjectId, + pub granted_by: UserId, + pub actions: String, + pub with_grant_option: bool, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::Oid", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "NoAction" + )] + Object, + #[sea_orm( + belongs_to = "super::user::Entity", + from = "Column::GrantedBy", + to = "super::user::Column::UserId", + on_update = "NoAction", + on_delete = "NoAction" + )] + User2, + #[sea_orm( + belongs_to = "super::user::Entity", + from = "Column::UserId", + to = "super::user::Column::UserId", + on_update = "NoAction", + on_delete = "Cascade" + )] + User1, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/view.rs b/src/meta/src/model_v2/view.rs new file mode 100644 index 0000000000000..8f7d22408d3f2 --- /dev/null +++ b/src/meta/src/model_v2/view.rs @@ -0,0 +1,62 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use risingwave_pb::catalog::PbView; +use sea_orm::entity::prelude::*; +use sea_orm::ActiveValue; + +use crate::model_v2::{FieldArray, Property, ViewId}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "view")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub view_id: ViewId, + pub name: String, + pub properties: Property, + pub definition: String, + pub columns: FieldArray, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::object::Entity", + from = "Column::ViewId", + to = "super::object::Column::Oid", + on_update = "NoAction", + on_delete = "Cascade" + )] + Object, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Object.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} + +impl From for ActiveModel { + fn from(view: PbView) -> Self { + Self { + view_id: ActiveValue::Set(view.id as _), + name: ActiveValue::Set(view.name), + properties: ActiveValue::Set(Property(view.properties)), + definition: ActiveValue::Set(view.sql), + columns: ActiveValue::Set(FieldArray(view.columns)), + } + } +} diff --git a/src/meta/src/model_v2/worker.rs b/src/meta/src/model_v2/worker.rs new file mode 100644 index 0000000000000..08cdb2be34da1 --- /dev/null +++ b/src/meta/src/model_v2/worker.rs @@ -0,0 +1,67 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
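The `From<PbView>` conversion above makes persisting a protobuf view a one-liner. A hedged sketch; it ignores the companion `object` row that the real catalog flow would create first.

```rust
// Hypothetical helper: insert a catalog view via the `From<PbView>` conversion above.
use risingwave_pb::catalog::PbView;
use sea_orm::{ActiveModelTrait, DatabaseConnection, DbErr};

use crate::model_v2::view;

async fn insert_view(db: &DatabaseConnection, pb_view: PbView) -> Result<view::Model, DbErr> {
    let view: view::ActiveModel = pb_view.into();
    view.insert(db).await
}
```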
+ +use sea_orm::entity::prelude::*; + +use crate::model_v2::{TransactionId, WorkerId}; + +#[derive(Clone, Debug, Hash, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum WorkerType { + #[sea_orm(string_value = "FRONTEND")] + Frontend, + #[sea_orm(string_value = "COMPUTE_NODE")] + ComputeNode, + #[sea_orm(string_value = "RISE_CTL")] + RiseCtl, + #[sea_orm(string_value = "COMPACTOR")] + Compactor, + #[sea_orm(string_value = "META")] + Meta, +} + +#[derive(Clone, Debug, PartialEq, Eq, EnumIter, DeriveActiveEnum)] +#[sea_orm(rs_type = "String", db_type = "String(None)")] +pub enum WorkerStatus { + #[sea_orm(string_value = "STARTING")] + Starting, + #[sea_orm(string_value = "RUNNING")] + Running, +} + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "worker")] +pub struct Model { + #[sea_orm(primary_key)] + pub worker_id: WorkerId, + pub worker_type: WorkerType, + pub host: String, + pub port: i32, + pub status: WorkerStatus, + pub transaction_id: Option, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm(has_many = "super::worker_property::Entity")] + WorkerProperty, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::WorkerProperty.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/model_v2/worker_property.rs b/src/meta/src/model_v2/worker_property.rs new file mode 100644 index 0000000000000..8521cbed15ce2 --- /dev/null +++ b/src/meta/src/model_v2/worker_property.rs @@ -0,0 +1,48 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use sea_orm::entity::prelude::*; + +use crate::model_v2::{I32Array, WorkerId}; + +#[derive(Clone, Debug, PartialEq, DeriveEntityModel, Eq)] +#[sea_orm(table_name = "worker_property")] +pub struct Model { + #[sea_orm(primary_key, auto_increment = false)] + pub worker_id: WorkerId, + pub parallel_unit_ids: I32Array, + pub is_streaming: bool, + pub is_serving: bool, + pub is_unschedulable: bool, +} + +#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)] +pub enum Relation { + #[sea_orm( + belongs_to = "super::worker::Entity", + from = "Column::WorkerId", + to = "super::worker::Column::WorkerId", + on_update = "NoAction", + on_delete = "Cascade" + )] + Worker, +} + +impl Related for Entity { + fn to() -> RelationDef { + Relation::Worker.def() + } +} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index d4b324ea6ddc6..04b9729c5a5b8 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -23,10 +23,10 @@ use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_common::util::epoch::Epoch; use risingwave_pb::catalog::connection::private_link_service::PbPrivateLinkProvider; use risingwave_pb::catalog::{ - connection, Connection, Database, Function, Schema, Source, Table, View, + connection, Connection, CreateType, Database, Function, Schema, Source, Table, View, }; use risingwave_pb::ddl_service::alter_relation_name_request::Relation; -use risingwave_pb::ddl_service::{DdlProgress, StreamJobExecutionMode}; +use risingwave_pb::ddl_service::DdlProgress; use risingwave_pb::stream_plan::StreamFragmentGraph as StreamFragmentGraphProto; use tokio::sync::Semaphore; use tracing::log::warn; @@ -48,6 +48,7 @@ use crate::stream::{ }; use crate::{MetaError, MetaResult}; +#[derive(PartialEq)] pub enum DropMode { Restrict, Cascade, @@ -93,11 +94,7 @@ pub enum DdlCommand { DropFunction(FunctionId), CreateView(View), DropView(ViewId, DropMode), - CreateStreamingJob( - StreamingJob, - StreamFragmentGraphProto, - StreamJobExecutionMode, - ), + CreateStreamingJob(StreamingJob, StreamFragmentGraphProto, CreateType), DropStreamingJob(StreamingJobId, DropMode), ReplaceTable(StreamingJob, StreamFragmentGraphProto, ColIndexMapping), AlterRelationName(Relation, String), @@ -182,7 +179,7 @@ impl CreatingStreamingJobPermit { } impl DdlController { - pub(crate) async fn new( + pub async fn new( env: MetaSrvEnv, catalog_manager: CatalogManagerRef, stream_manager: GlobalStreamManagerRef, @@ -221,7 +218,7 @@ impl DdlController { /// has been interrupted during executing, the request will be cancelled by tonic. Since we have /// a lot of logic for revert, status management, notification and so on, ensuring consistency /// would be a huge hassle and pain if we don't spawn here. 
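The doc comment above is the reason `run_command` wraps its work in `tokio::spawn` below. A minimal sketch of that spawn-then-await pattern, with illustrative names only:

```rust
// The command runs in its own task, so dropping the awaiting future (e.g. when the client
// cancels the RPC) does not interrupt the DDL work already in flight.
use std::future::Future;

use tokio::task::JoinHandle;

async fn run_detached<T: Send + 'static>(fut: impl Future<Output = T> + Send + 'static) -> T {
    let handle: JoinHandle<T> = tokio::spawn(fut);
    handle.await.unwrap()
}
```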
- pub(crate) async fn run_command(&self, command: DdlCommand) -> MetaResult { + pub async fn run_command(&self, command: DdlCommand) -> MetaResult { self.check_barrier_manager_status().await?; let ctrl = self.clone(); let fut = async move { @@ -240,12 +237,8 @@ impl DdlController { DdlCommand::DropView(view_id, drop_mode) => { ctrl.drop_view(view_id, drop_mode).await } - DdlCommand::CreateStreamingJob( - stream_job, - fragment_graph, - stream_job_execution_mode, - ) => { - ctrl.create_streaming_job(stream_job, fragment_graph, stream_job_execution_mode) + DdlCommand::CreateStreamingJob(stream_job, fragment_graph, create_type) => { + ctrl.create_streaming_job(stream_job, fragment_graph, create_type) .await } DdlCommand::DropStreamingJob(job_id, drop_mode) => { @@ -271,7 +264,7 @@ impl DdlController { tokio::spawn(fut).await.unwrap() } - pub(crate) async fn get_ddl_progress(&self) -> Vec { + pub async fn get_ddl_progress(&self) -> Vec { self.barrier_manager.get_ddl_progress().await } @@ -331,20 +324,25 @@ impl DdlController { drop_mode: DropMode, ) -> MetaResult { // 1. Drop source in catalog. - let version = self + let (version, streaming_job_ids) = self .catalog_manager .drop_relation( RelationIdEnum::Source(source_id), self.fragment_manager.clone(), drop_mode, ) - .await? - .0; + .await?; + // 2. Unregister source connector worker. self.source_manager .unregister_sources(vec![source_id]) .await; + // 3. Drop streaming jobs if cascade + self.stream_manager + .drop_streaming_jobs(streaming_job_ids) + .await; + Ok(version) } @@ -414,8 +412,13 @@ impl DdlController { &self, mut stream_job: StreamingJob, fragment_graph: StreamFragmentGraphProto, - stream_job_execution_mode: StreamJobExecutionMode, + create_type: CreateType, ) -> MetaResult { + tracing::debug!( + id = stream_job.id(), + definition = stream_job.definition(), + "starting stream job", + ); let _permit = self .creating_streaming_job_permits .semaphore @@ -425,6 +428,8 @@ impl DdlController { let _reschedule_job_lock = self.stream_manager.reschedule_lock.read().await; let env = StreamEnvironment::from_protobuf(fragment_graph.get_env().unwrap()); + + tracing::debug!(id = stream_job.id(), "preparing stream job"); let fragment_graph = self .prepare_stream_job(&mut stream_job, fragment_graph) .await?; @@ -434,6 +439,7 @@ impl DdlController { let mut internal_tables = vec![]; let result = try { + tracing::debug!(id = stream_job.id(), "building stream job"); let (ctx, table_fragments) = self .build_stream_job(env, &stream_job, fragment_graph) .await?; @@ -457,19 +463,19 @@ impl DdlController { let (ctx, table_fragments) = match result { Ok(r) => r, Err(e) => { - self.cancel_stream_job(&stream_job, internal_tables).await; + self.cancel_stream_job(&stream_job, internal_tables).await?; return Err(e); } }; - match stream_job_execution_mode { - StreamJobExecutionMode::Foreground | StreamJobExecutionMode::Unspecified => { + match create_type { + CreateType::Foreground | CreateType::Unspecified => { self.create_streaming_job_inner(stream_job, table_fragments, ctx, internal_tables) .await } - StreamJobExecutionMode::Background => { + CreateType::Background => { let ctrl = self.clone(); - let definition = stream_job.definition(); + let stream_job_id = stream_job.id(); let fut = async move { let result = ctrl .create_streaming_job_inner( @@ -480,9 +486,11 @@ impl DdlController { ) .await; match result { - Err(e) => tracing::error!(definition, error = ?e, "stream_job_error"), + Err(e) => { + tracing::error!(id=stream_job_id, error = ?e, "finish 
stream job failed") + } Ok(_) => { - tracing::info!(definition, "stream_job_ok") + tracing::info!(id = stream_job_id, "finish stream job succeeded") } } }; @@ -492,6 +500,7 @@ impl DdlController { } } + // We persist table fragments at this step. async fn create_streaming_job_inner( &self, stream_job: StreamingJob, @@ -499,15 +508,29 @@ impl DdlController { ctx: CreateStreamingJobContext, internal_tables: Vec
, ) -> MetaResult { + let job_id = stream_job.id(); + tracing::debug!(id = job_id, "creating stream job"); let result = self .stream_manager .create_streaming_job(table_fragments, ctx) .await; if let Err(e) = result { - self.cancel_stream_job(&stream_job, internal_tables).await; + match stream_job.create_type() { + // NOTE: This assumes that we will trigger recovery, + // and recover stream job progress. + CreateType::Background => { + tracing::error!(id = stream_job.id(), error = ?e, "finish stream job failed") + } + _ => { + self.cancel_stream_job(&stream_job, internal_tables).await?; + } + } return Err(e); }; - self.finish_stream_job(stream_job, internal_tables).await + tracing::debug!(id = job_id, "finishing stream job"); + let version = self.finish_stream_job(stream_job, internal_tables).await?; + tracing::debug!(id = job_id, "finished stream job"); + Ok(version) } async fn drop_streaming_job( @@ -572,6 +595,8 @@ impl DdlController { StreamFragmentGraph::new(fragment_graph, self.env.id_gen_manager_ref(), stream_job) .await?; + let internal_tables = fragment_graph.internal_tables().into_values().collect_vec(); + // 2. Set the graph-related fields and freeze the `stream_job`. stream_job.set_table_fragment_id(fragment_graph.table_fragment_id()); stream_job.set_dml_fragment_id(fragment_graph.dml_fragment_id()); @@ -579,7 +604,7 @@ impl DdlController { // 3. Mark current relation as "creating" and add reference count to dependent relations. self.catalog_manager - .start_create_stream_job_procedure(stream_job) + .start_create_stream_job_procedure(stream_job, internal_tables) .await?; Ok(fragment_graph) @@ -688,6 +713,7 @@ impl DdlController { table_properties: stream_job.properties(), definition: stream_job.definition(), mv_table_id: stream_job.mv_table(), + create_type: stream_job.create_type(), }; // 4. Mark creating tables, including internal tables and the table of the stream job. @@ -704,17 +730,27 @@ impl DdlController { Ok((ctx, table_fragments)) } - /// `cancel_stream_job` cancels a stream job and clean some states. - async fn cancel_stream_job(&self, stream_job: &StreamingJob, internal_tables: Vec
) { + /// This is NOT used by `CANCEL JOBS`. + /// It is used internally by `DdlController` to cancel and clean up a stream job. + async fn cancel_stream_job( + &self, + stream_job: &StreamingJob, + internal_tables: Vec<Table>
, + ) -> MetaResult<()> { let mut creating_internal_table_ids = internal_tables.into_iter().map(|t| t.id).collect_vec(); // 1. cancel create procedure. match stream_job { StreamingJob::MaterializedView(table) => { - creating_internal_table_ids.push(table.id); - self.catalog_manager - .cancel_create_table_procedure(table) + // barrier manager will do the cleanup. + let result = self + .catalog_manager + .cancel_create_table_procedure(table.id, creating_internal_table_ids.clone()) .await; + creating_internal_table_ids.push(table.id); + if let Err(e) = result { + tracing::warn!("Failed to cancel create table procedure, perhaps barrier manager has already cleaned it. Reason: {e:#?}"); + } } StreamingJob::Sink(sink) => { self.catalog_manager @@ -722,16 +758,23 @@ impl DdlController { .await; } StreamingJob::Table(source, table) => { - creating_internal_table_ids.push(table.id); if let Some(source) = source { self.catalog_manager .cancel_create_table_procedure_with_source(source, table) .await; } else { - self.catalog_manager - .cancel_create_table_procedure(table) + let result = self + .catalog_manager + .cancel_create_table_procedure( + table.id, + creating_internal_table_ids.clone(), + ) .await; + if let Err(e) = result { + tracing::warn!("Failed to cancel create table procedure, perhaps barrier manager has already cleaned it. Reason: {e:#?}"); + } } + creating_internal_table_ids.push(table.id); } StreamingJob::Index(index, table) => { creating_internal_table_ids.push(table.id); @@ -744,6 +787,7 @@ impl DdlController { self.catalog_manager .unmark_creating_tables(&creating_internal_table_ids, true) .await; + Ok(()) } /// `finish_stream_job` finishes a stream job and clean some states. diff --git a/src/meta/src/rpc/election/mod.rs b/src/meta/src/rpc/election/mod.rs index 9835c554b3fd3..7916ddba6eea4 100644 --- a/src/meta/src/rpc/election/mod.rs +++ b/src/meta/src/rpc/election/mod.rs @@ -29,6 +29,10 @@ pub struct ElectionMember { #[async_trait::async_trait] pub trait ElectionClient: Send + Sync + 'static { + async fn init(&self) -> MetaResult<()> { + Ok(()) + } + fn id(&self) -> MetaResult; async fn run_once(&self, ttl: i64, stop: Receiver<()>) -> MetaResult<()>; fn subscribe(&self) -> Receiver; diff --git a/src/meta/src/rpc/election/sql.rs b/src/meta/src/rpc/election/sql.rs index b6bd02b179c26..a027e8bffdfd1 100644 --- a/src/meta/src/rpc/election/sql.rs +++ b/src/meta/src/rpc/election/sql.rs @@ -15,7 +15,11 @@ use std::sync::Arc; use std::time::Duration; -use sqlx::{MySql, MySqlPool, PgPool, Postgres, Sqlite, SqlitePool}; +use anyhow::anyhow; +use sea_orm::{ + ConnectionTrait, DatabaseBackend, DatabaseConnection, FromQueryResult, Statement, + TransactionTrait, Value, +}; use tokio::sync::watch; use tokio::sync::watch::Receiver; use tokio::time; @@ -29,14 +33,27 @@ pub struct SqlBackendElectionClient { is_leader_sender: watch::Sender, } -#[derive(sqlx::FromRow, Debug)] -pub(crate) struct ElectionRow { +impl SqlBackendElectionClient { + pub fn new(id: String, driver: Arc) -> Self { + let (sender, _) = watch::channel(false); + Self { + id, + driver, + is_leader_sender: sender, + } + } +} + +#[derive(Debug, FromQueryResult)] +pub struct ElectionRow { service: String, id: String, } #[async_trait::async_trait] -pub(crate) trait SqlDriver: Send + Sync + 'static { +pub trait SqlDriver: Send + Sync + 'static { + async fn init_database(&self) -> MetaResult<()>; + async fn update_heartbeat(&self, service_name: &str, id: &str) -> MetaResult<()>; async fn try_campaign(&self, service_name: &str, id: &str, 
ttl: i64) @@ -48,9 +65,9 @@ pub(crate) trait SqlDriver: Send + Sync + 'static { async fn resign(&self, service_name: &str, id: &str) -> MetaResult<()>; } -pub(crate) trait SqlDriverCommon { +pub trait SqlDriverCommon { const ELECTION_LEADER_TABLE_NAME: &'static str = "election_leader"; - const ELECTION_MEMBER_TABLE_NAME: &'static str = "election_members"; + const ELECTION_MEMBER_TABLE_NAME: &'static str = "election_member"; fn election_table_name() -> &'static str { Self::ELECTION_LEADER_TABLE_NAME @@ -67,34 +84,69 @@ impl SqlDriverCommon for PostgresDriver {} impl SqlDriverCommon for SqliteDriver {} pub struct MySqlDriver { - pool: MySqlPool, + pub(crate) conn: DatabaseConnection, +} + +impl MySqlDriver { + pub fn new(conn: DatabaseConnection) -> Arc { + Arc::new(Self { conn }) + } } pub struct PostgresDriver { - pool: PgPool, + pub(crate) conn: DatabaseConnection, +} + +impl PostgresDriver { + pub fn new(conn: DatabaseConnection) -> Arc { + Arc::new(Self { conn }) + } } pub struct SqliteDriver { - pool: SqlitePool, + pub(crate) conn: DatabaseConnection, +} + +impl SqliteDriver { + pub fn new(conn: DatabaseConnection) -> Arc { + Arc::new(Self { conn }) + } } #[async_trait::async_trait] impl SqlDriver for SqliteDriver { + async fn init_database(&self) -> MetaResult<()> { + self.conn.execute( + Statement::from_string(DatabaseBackend::Sqlite, format!( + r#"CREATE TABLE IF NOT EXISTS {table} (service VARCHAR(256), id VARCHAR(256), last_heartbeat DATETIME, PRIMARY KEY (service, id));"#, + table = Self::member_table_name() + ))).await?; + + self.conn.execute( + Statement::from_string(DatabaseBackend::Sqlite, format!( + r#"CREATE TABLE IF NOT EXISTS {table} (service VARCHAR(256), id VARCHAR(256), last_heartbeat DATETIME, PRIMARY KEY (service));"#, + table = Self::election_table_name() + ))).await?; + + Ok(()) + } + async fn update_heartbeat(&self, service_name: &str, id: &str) -> MetaResult<()> { - sqlx::query(&format!( - r#"INSERT INTO {table} (id, service, last_heartbeat) + self.conn + .execute(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + r#"INSERT INTO {table} (id, service, last_heartbeat) VALUES($1, $2, CURRENT_TIMESTAMP) ON CONFLICT (id, service) DO UPDATE SET last_heartbeat = EXCLUDED.last_heartbeat; "#, - table = Self::member_table_name() - )) - .bind(id) - .bind(service_name) - .execute(&self.pool) - .await?; - + table = Self::member_table_name() + ), + vec![Value::from(id), Value::from(service_name)], + )) + .await?; Ok(()) } @@ -104,79 +156,106 @@ DO id: &str, ttl: i64, ) -> MetaResult { - let row = sqlx::query_as::(&format!( - r#"INSERT INTO {table} (service, id, last_heartbeat) -VALUES ($1, $2, CURRENT_TIMESTAMP) -ON CONFLICT (service) - DO UPDATE - SET id = CASE - WHEN DATETIME({table}.last_heartbeat, '+' || $3 || ' second') < CURRENT_TIMESTAMP THEN EXCLUDED.id - ELSE {table}.id - END, - last_heartbeat = CASE - WHEN DATETIME({table}.last_heartbeat, '+' || $3 || ' seconds') < CURRENT_TIMESTAMP THEN EXCLUDED.last_heartbeat - WHEN {table}.id = EXCLUDED.id THEN EXCLUDED.last_heartbeat - ELSE {table}.last_heartbeat - END -RETURNING service, id, last_heartbeat; -"#, - table = Self::election_table_name() - )) - .bind(service_name) - .bind(id) - .bind(ttl) - .fetch_one(&self.pool) + let query_result = self.conn + .query_one(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + r#"INSERT INTO {table} (service, id, last_heartbeat) + VALUES ($1, $2, CURRENT_TIMESTAMP) + ON CONFLICT (service) + DO UPDATE + SET id = CASE + WHEN 
DATETIME({table}.last_heartbeat, '+' || $3 || ' second') < CURRENT_TIMESTAMP THEN EXCLUDED.id + ELSE {table}.id + END, + last_heartbeat = CASE + WHEN DATETIME({table}.last_heartbeat, '+' || $3 || ' seconds') < CURRENT_TIMESTAMP THEN EXCLUDED.last_heartbeat + WHEN {table}.id = EXCLUDED.id THEN EXCLUDED.last_heartbeat + ELSE {table}.last_heartbeat + END + RETURNING service, id, last_heartbeat; + "#, + table = Self::election_table_name() + ), + vec![Value::from(service_name), Value::from(id), Value::from(ttl)], + )) .await?; + let row = query_result + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .transpose()?; + + let row = row.ok_or_else(|| anyhow!("bad result from sqlite"))?; + Ok(row) } async fn leader(&self, service_name: &str) -> MetaResult> { - let row = sqlx::query_as::<_, ElectionRow>(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, - table = Self::election_table_name() - )) - .bind(service_name) - .fetch_optional(&self.pool) - .await?; + let query_result = self + .conn + .query_one(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, + table = Self::election_table_name() + ), + vec![Value::from(service_name)], + )) + .await?; + + let row = query_result + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .transpose()?; Ok(row) } async fn candidates(&self, service_name: &str) -> MetaResult> { - let row = sqlx::query_as::<_, ElectionRow>(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, - table = Self::member_table_name() - )) - .bind(service_name) - .fetch_all(&self.pool) - .await?; + let all = self + .conn + .query_all(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, + table = Self::member_table_name() + ), + vec![Value::from(service_name)], + )) + .await?; - Ok(row) + let rows = all + .into_iter() + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .collect::>()?; + + Ok(rows) } async fn resign(&self, service_name: &str, id: &str) -> MetaResult<()> { - let mut txn = self.pool.begin().await?; - sqlx::query(&format!( - r#" - DELETE FROM {table} WHERE service = $1 AND id = $2; - "#, - table = Self::election_table_name() + let txn = self.conn.begin().await?; + + txn.execute(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + r#" + DELETE FROM {table} WHERE service = $1 AND id = $2; + "#, + table = Self::election_table_name() + ), + vec![Value::from(service_name), Value::from(id)], )) - .bind(service_name) - .bind(id) - .execute(&mut *txn) .await?; - sqlx::query(&format!( - r#" - DELETE FROM {table} WHERE service = $1 AND id = $2; - "#, - table = Self::member_table_name() + txn.execute(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + r#" + DELETE FROM {table} WHERE service = $1 AND id = $2; + "#, + table = Self::member_table_name() + ), + vec![Value::from(service_name), Value::from(id)], )) - .bind(service_name) - .bind(id) - .execute(&mut *txn) .await?; txn.commit().await?; @@ -187,19 +266,37 @@ RETURNING service, id, last_heartbeat; #[async_trait::async_trait] impl SqlDriver for MySqlDriver { + async fn init_database(&self) -> MetaResult<()> { + self.conn.execute( + Statement::from_string(DatabaseBackend::MySql, format!( + r#"CREATE TABLE IF NOT EXISTS {table} (service VARCHAR(256), id VARCHAR(256), last_heartbeat 
DATETIME, PRIMARY KEY (service, id));"#, + table = Self::member_table_name() + ))).await?; + + self.conn.execute( + Statement::from_string(DatabaseBackend::MySql, format!( + r#"CREATE TABLE IF NOT EXISTS {table} (service VARCHAR(256), id VARCHAR(256), last_heartbeat DATETIME, PRIMARY KEY (service));"#, + table = Self::election_table_name() + ))).await?; + + Ok(()) + } + async fn update_heartbeat(&self, service_name: &str, id: &str) -> MetaResult<()> { - sqlx::query(&format!( - r#"INSERT INTO {table} (id, service, last_heartbeat) -VALUES(?, ?, NOW()) -ON duplicate KEY - UPDATE last_heartbeat = VALUES(last_heartbeat); -"#, - table = Self::member_table_name() - )) - .bind(id) - .bind(service_name) - .execute(&self.pool) - .await?; + self.conn + .execute(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#"INSERT INTO {table} (id, service, last_heartbeat) + VALUES(?, ?, NOW()) + ON duplicate KEY + UPDATE last_heartbeat = VALUES(last_heartbeat); + "#, + table = Self::member_table_name() + ), + vec![Value::from(id), Value::from(service_name)], + )) + .await?; Ok(()) } @@ -210,82 +307,113 @@ ON duplicate KEY id: &str, ttl: i64, ) -> MetaResult { - let _ = sqlx::query::(&format!( - r#"INSERT - IGNORE -INTO {table} (service, id, last_heartbeat) -VALUES (?, ?, NOW()) -ON duplicate KEY - UPDATE id = if(last_heartbeat < NOW() - INTERVAL ? SECOND, - VALUES(id), id), - last_heartbeat = if(id = - VALUES(id), - VALUES(last_heartbeat), last_heartbeat);"#, - table = Self::election_table_name() - )) - .bind(service_name) - .bind(id) - .bind(ttl) - .execute(&self.pool) - .await?; + self.conn + .execute(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#"INSERT + IGNORE + INTO {table} (service, id, last_heartbeat) + VALUES (?, ?, NOW()) + ON duplicate KEY + UPDATE id = if(last_heartbeat < NOW() - INTERVAL ? 
SECOND, + VALUES(id), id), + last_heartbeat = if(id = + VALUES(id), + VALUES(last_heartbeat), last_heartbeat);"#, + table = Self::election_table_name() + ), + vec![Value::from(service_name), Value::from(id), Value::from(ttl)], + )) + .await?; - let row = sqlx::query_as::(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = ?;"#, - table = Self::election_table_name(), - )) - .bind(service_name) - .fetch_one(&self.pool) - .await?; + let query_result = self + .conn + .query_one(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = ?;"#, + table = Self::election_table_name(), + ), + vec![Value::from(service_name)], + )) + .await?; + + let row = query_result + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .transpose()?; + + let row = row.ok_or_else(|| anyhow!("bad result from mysql"))?; Ok(row) } async fn leader(&self, service_name: &str) -> MetaResult> { - let row = sqlx::query_as::(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = ?;"#, - table = Self::election_table_name() - )) - .bind(service_name) - .fetch_optional(&self.pool) - .await?; + let query_result = self + .conn + .query_one(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = ?;"#, + table = Self::election_table_name() + ), + vec![Value::from(service_name)], + )) + .await?; + + let row = query_result + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .transpose()?; Ok(row) } async fn candidates(&self, service_name: &str) -> MetaResult> { - let row = sqlx::query_as::(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = ?;"#, - table = Self::member_table_name() - )) - .bind(service_name) - .fetch_all(&self.pool) - .await?; + let all = self + .conn + .query_all(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = ?;"#, + table = Self::member_table_name() + ), + vec![Value::from(service_name)], + )) + .await?; - Ok(row) + let rows = all + .into_iter() + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .collect::>()?; + + Ok(rows) } async fn resign(&self, service_name: &str, id: &str) -> MetaResult<()> { - let mut txn = self.pool.begin().await?; - sqlx::query(&format!( - r#" - DELETE FROM {table} WHERE service = ? AND id = ?; - "#, - table = Self::election_table_name() + let txn = self.conn.begin().await?; + + txn.execute(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#" + DELETE FROM {table} WHERE service = ? AND id = ?; + "#, + table = Self::election_table_name() + ), + vec![Value::from(service_name), Value::from(id)], )) - .bind(service_name) - .bind(id) - .execute(&mut *txn) .await?; - sqlx::query(&format!( - r#" - DELETE FROM {table} WHERE service = ? AND id = ?; - "#, - table = Self::member_table_name() + txn.execute(Statement::from_sql_and_values( + DatabaseBackend::MySql, + format!( + r#" + DELETE FROM {table} WHERE service = ? 
AND id = ?; + "#, + table = Self::member_table_name() + ), + vec![Value::from(service_name), Value::from(id)], )) - .bind(service_name) - .bind(id) - .execute(&mut *txn) .await?; txn.commit().await?; @@ -296,20 +424,38 @@ ON duplicate KEY #[async_trait::async_trait] impl SqlDriver for PostgresDriver { + async fn init_database(&self) -> MetaResult<()> { + self.conn.execute( + Statement::from_string(DatabaseBackend::Postgres, format!( + r#"CREATE TABLE IF NOT EXISTS {table} (service VARCHAR, id VARCHAR, last_heartbeat TIMESTAMPTZ, PRIMARY KEY (service, id));"#, + table = Self::member_table_name() + ))).await?; + + self.conn.execute( + Statement::from_string(DatabaseBackend::Postgres, format!( + r#"CREATE TABLE IF NOT EXISTS {table} (service VARCHAR, id VARCHAR, last_heartbeat TIMESTAMPTZ, PRIMARY KEY (service));"#, + table = Self::election_table_name() + ))).await?; + + Ok(()) + } + async fn update_heartbeat(&self, service_name: &str, id: &str) -> MetaResult<()> { - sqlx::query(&format!( - r#"INSERT INTO {table} (id, service, last_heartbeat) -VALUES($1, $2, NOW()) -ON CONFLICT (id, service) -DO - UPDATE SET last_heartbeat = EXCLUDED.last_heartbeat; -"#, - table = Self::member_table_name() - )) - .bind(id) - .bind(service_name) - .execute(&self.pool) - .await?; + self.conn + .execute(Statement::from_sql_and_values( + DatabaseBackend::Postgres, + format!( + r#"INSERT INTO {table} (id, service, last_heartbeat) + VALUES($1, $2, NOW()) + ON CONFLICT (id, service) + DO + UPDATE SET last_heartbeat = EXCLUDED.last_heartbeat; + "#, + table = Self::member_table_name() + ), + vec![Value::from(id), Value::from(service_name)], + )) + .await?; Ok(()) } @@ -320,79 +466,112 @@ DO id: &str, ttl: i64, ) -> MetaResult { - let row = sqlx::query_as::(&format!( - r#"INSERT INTO {table} (service, id, last_heartbeat) -VALUES ($1, $2, NOW()) -ON CONFLICT (service) - DO UPDATE - SET id = CASE - WHEN {table}.last_heartbeat < NOW() - $3::INTERVAL THEN EXCLUDED.id - ELSE {table}.id - END, - last_heartbeat = CASE - WHEN {table}.last_heartbeat < NOW() - $3::INTERVAL THEN EXCLUDED.last_heartbeat - WHEN {table}.id = EXCLUDED.id THEN EXCLUDED.last_heartbeat - ELSE {table}.last_heartbeat - END -RETURNING service, id, last_heartbeat; -"#, - table = Self::election_table_name() - )) - .bind(service_name) - .bind(id) - .bind(Duration::from_secs(ttl as u64)) - .fetch_one(&self.pool) + let query_result = self + .conn + .query_one(Statement::from_sql_and_values( + DatabaseBackend::Postgres, + format!( + r#"INSERT INTO {table} (service, id, last_heartbeat) + VALUES ($1, $2, NOW()) + ON CONFLICT (service) + DO UPDATE + SET id = CASE + WHEN {table}.last_heartbeat < NOW() - $3::INTERVAL THEN EXCLUDED.id + ELSE {table}.id + END, + last_heartbeat = CASE + WHEN {table}.last_heartbeat < NOW() - $3::INTERVAL THEN EXCLUDED.last_heartbeat + WHEN {table}.id = EXCLUDED.id THEN EXCLUDED.last_heartbeat + ELSE {table}.last_heartbeat + END + RETURNING service, id, last_heartbeat; + "#, + table = Self::election_table_name() + ), + vec![ + Value::from(service_name), + Value::from(id), + // special handling for interval + Value::from(ttl.to_string()), + ], + )) .await?; + let row = query_result + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .transpose()?; + + let row = row.ok_or_else(|| anyhow!("bad result from postgres"))?; + Ok(row) } async fn leader(&self, service_name: &str) -> MetaResult> { - let row = sqlx::query_as::(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, - table = 
Self::election_table_name() - )) - .bind(service_name) - .fetch_optional(&self.pool) - .await?; + let query_result = self + .conn + .query_one(Statement::from_sql_and_values( + DatabaseBackend::Postgres, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, + table = Self::election_table_name() + ), + vec![Value::from(service_name)], + )) + .await?; + + let row = query_result + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .transpose()?; Ok(row) } async fn candidates(&self, service_name: &str) -> MetaResult> { - let row = sqlx::query_as::(&format!( - r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, - table = Self::member_table_name() - )) - .bind(service_name) - .fetch_all(&self.pool) - .await?; + let all = self + .conn + .query_all(Statement::from_sql_and_values( + DatabaseBackend::Postgres, + format!( + r#"SELECT service, id, last_heartbeat FROM {table} WHERE service = $1;"#, + table = Self::member_table_name() + ), + vec![Value::from(service_name)], + )) + .await?; - Ok(row) + let rows = all + .into_iter() + .map(|query_result| ElectionRow::from_query_result(&query_result, "")) + .collect::>()?; + + Ok(rows) } async fn resign(&self, service_name: &str, id: &str) -> MetaResult<()> { - let mut txn = self.pool.begin().await?; - sqlx::query(&format!( - r#" - DELETE FROM {table} WHERE service = $1 AND id = $2; - "#, - table = Self::election_table_name() + let txn = self.conn.begin().await?; + + txn.execute(Statement::from_sql_and_values( + DatabaseBackend::Postgres, + format!( + r#" + DELETE FROM {table} WHERE service = $1 AND id = $2; + "#, + table = Self::election_table_name() + ), + vec![Value::from(service_name), Value::from(id)], )) - .bind(service_name) - .bind(id) - .execute(&mut *txn) .await?; - sqlx::query(&format!( - r#" - DELETE FROM {table} WHERE service = $1 AND id = $2; - "#, - table = Self::member_table_name() + txn.execute(Statement::from_sql_and_values( + DatabaseBackend::Postgres, + format!( + r#" + DELETE FROM {table} WHERE service = $1 AND id = $2; + "#, + table = Self::member_table_name() + ), + vec![Value::from(service_name), Value::from(id)], )) - .bind(service_name) - .bind(id) - .execute(&mut *txn) .await?; txn.commit().await?; @@ -406,6 +585,11 @@ impl ElectionClient for SqlBackendElectionClient where T: SqlDriver + Send + Sync + 'static, { + async fn init(&self) -> MetaResult<()> { + tracing::info!("initializing database for Sql backend election client"); + self.driver.init_database().await + } + fn id(&self) -> MetaResult { Ok(self.id.clone()) } @@ -540,34 +724,40 @@ where mod tests { use std::sync::Arc; - use sqlx::sqlite::SqlitePoolOptions; - use sqlx::SqlitePool; + use sea_orm::{ConnectionTrait, Database, DatabaseConnection, DbBackend, Statement}; use tokio::sync::watch; use crate::rpc::election::sql::{SqlBackendElectionClient, SqlDriverCommon, SqliteDriver}; use crate::{ElectionClient, MetaResult}; - async fn prepare_sqlite_env() -> MetaResult { - let pool = SqlitePoolOptions::new().connect("sqlite::memory:").await?; - let _ = sqlx::query( - &format!("CREATE TABLE {table} (service VARCHAR(256) PRIMARY KEY, id VARCHAR(256), last_heartbeat DATETIME)", - table = SqliteDriver::election_table_name())) - .execute(&pool).await?; + async fn prepare_sqlite_env() -> MetaResult { + let db: DatabaseConnection = Database::connect("sqlite::memory:").await?; + + db.execute(Statement::from_sql_and_values( + DbBackend::Sqlite, + format!("CREATE TABLE {table} (service VARCHAR(256) PRIMARY KEY, 
id VARCHAR(256), last_heartbeat DATETIME)", + table = SqliteDriver::election_table_name()), + vec![], + )) + .await?; - let _ = sqlx::query( - &format!("CREATE TABLE {table} (service VARCHAR(256), id VARCHAR(256), last_heartbeat DATETIME, PRIMARY KEY (service, id))", - table = SqliteDriver::member_table_name())) - .execute(&pool).await?; + db.execute(Statement::from_sql_and_values( + DbBackend::Sqlite, + format!("CREATE TABLE {table} (service VARCHAR(256), id VARCHAR(256), last_heartbeat DATETIME, PRIMARY KEY (service, id))", + table = SqliteDriver::member_table_name()), + vec![], + )) + .await?; - Ok(pool) + Ok(db) } #[tokio::test] async fn test_sql_election() { let id = "test_id".to_string(); - let pool = prepare_sqlite_env().await.unwrap(); + let conn = prepare_sqlite_env().await.unwrap(); - let provider = SqliteDriver { pool }; + let provider = SqliteDriver { conn }; let (sender, _) = watch::channel(false); let sql_election_client: Arc = Arc::new(SqlBackendElectionClient { id, @@ -597,10 +787,10 @@ mod tests { let mut clients = vec![]; - let pool = prepare_sqlite_env().await.unwrap(); + let conn = prepare_sqlite_env().await.unwrap(); for i in 1..3 { let id = format!("test_id_{}", i); - let provider = SqliteDriver { pool: pool.clone() }; + let provider = SqliteDriver { conn: conn.clone() }; let (sender, _) = watch::channel(false); let sql_election_client: Arc = Arc::new(SqlBackendElectionClient { id, diff --git a/src/meta/src/rpc/metrics.rs b/src/meta/src/rpc/metrics.rs index 1518495df0f7c..3183007753cbd 100644 --- a/src/meta/src/rpc/metrics.rs +++ b/src/meta/src/rpc/metrics.rs @@ -37,7 +37,7 @@ use tokio::task::JoinHandle; use crate::hummock::HummockManagerRef; use crate::manager::{CatalogManagerRef, ClusterManagerRef, FragmentManagerRef}; -use crate::rpc::server::ElectionClientRef; +use crate::rpc::ElectionClientRef; #[derive(Clone)] pub struct MetaMetrics { @@ -536,7 +536,7 @@ impl MetaMetrics { let sink_info = register_int_gauge_vec_with_registry!( "sink_info", "Mapping from actor id to (actor id, sink name)", - &["actor_id", "sink_name",], + &["actor_id", "sink_id", "sink_name",], registry ) .unwrap(); @@ -690,7 +690,7 @@ impl Default for MetaMetrics { } } -pub async fn start_worker_info_monitor( +pub fn start_worker_info_monitor( cluster_manager: ClusterManagerRef, election_client: Option, interval: Duration, @@ -738,7 +738,7 @@ pub async fn start_worker_info_monitor( (join_handle, shutdown_tx) } -pub async fn start_fragment_info_monitor( +pub fn start_fragment_info_monitor( cluster_manager: ClusterManagerRef, catalog_manager: CatalogManagerRef, fragment_manager: FragmentManagerRef, @@ -810,13 +810,14 @@ pub async fn start_fragment_info_monitor( if let Some(stream_node) = &actor.nodes { if let Some(Sink(sink_node)) = &stream_node.node_body { - let sink_name = match &sink_node.sink_desc { - Some(sink_desc) => &sink_desc.name, - _ => "unknown", + let (sink_id, sink_name) = match &sink_node.sink_desc { + Some(sink_desc) => (sink_desc.id, sink_desc.name.as_str()), + _ => (0, "unknown"), // unreachable }; + let sink_id_str = sink_id.to_string(); meta_metrics .sink_info - .with_label_values(&[&actor_id_str, sink_name]) + .with_label_values(&[&actor_id_str, &sink_id_str, sink_name]) .set(1); } } diff --git a/src/meta/src/rpc/mod.rs b/src/meta/src/rpc/mod.rs index 36380c4d2dafb..99f1b51eaafce 100644 --- a/src/meta/src/rpc/mod.rs +++ b/src/meta/src/rpc/mod.rs @@ -12,19 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod cloud_provider; +pub mod cloud_provider; pub mod ddl_controller; pub mod election; -mod intercept; +pub mod intercept; pub mod metrics; -pub mod server; -pub mod service; + +pub type ElectionClientRef = std::sync::Arc; pub use election::etcd::EtcdElectionClient; pub use election::{ElectionClient, ElectionMember}; -pub use service::cluster_service::ClusterServiceImpl; -pub use service::ddl_service::DdlServiceImpl; -pub use service::heartbeat_service::HeartbeatServiceImpl; -pub use service::hummock_service::HummockServiceImpl; -pub use service::notification_service::NotificationServiceImpl; -pub use service::stream_service::StreamServiceImpl; diff --git a/src/meta/src/serving/mod.rs b/src/meta/src/serving/mod.rs index f6d1a5b1aa714..521a8b9ad1c0d 100644 --- a/src/meta/src/serving/mod.rs +++ b/src/meta/src/serving/mod.rs @@ -103,7 +103,7 @@ fn to_deleted_fragment_parallel_unit_mapping( .collect() } -pub(crate) async fn on_meta_start( +pub async fn on_meta_start( notification_manager: NotificationManagerRef, cluster_manager: ClusterManagerRef, fragment_manager: FragmentManagerRef, @@ -126,7 +126,7 @@ pub(crate) async fn on_meta_start( ); } -pub(crate) async fn start_serving_vnode_mapping_worker( +pub async fn start_serving_vnode_mapping_worker( notification_manager: NotificationManagerRef, cluster_manager: ClusterManagerRef, fragment_manager: FragmentManagerRef, diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index a125d61d91703..afe6186165e22 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -97,7 +97,7 @@ pub struct RescheduleOptions { pub resolve_no_shuffle_upstream: bool, } -pub(crate) struct RescheduleContext { +pub struct RescheduleContext { /// Index used to map `ParallelUnitId` to `WorkerId` parallel_unit_id_to_worker_id: BTreeMap, /// Meta information for all Actors @@ -171,7 +171,7 @@ impl RescheduleContext { /// assert to fail and should be skipped from the upper level. /// /// The return value is the bitmap distribution after scaling, which covers all virtual node indexes -pub(crate) fn rebalance_actor_vnode( +pub fn rebalance_actor_vnode( actors: &[StreamActor], actors_to_remove: &BTreeSet, actors_to_create: &BTreeSet, diff --git a/src/meta/src/stream/sink.rs b/src/meta/src/stream/sink.rs index 4717a1ffdfe95..8544011071ec2 100644 --- a/src/meta/src/stream/sink.rs +++ b/src/meta/src/stream/sink.rs @@ -25,5 +25,5 @@ pub async fn validate_sink(prost_sink_catalog: &PbSink) -> MetaResult<()> { let sink = build_sink(param)?; - dispatch_sink!(sink, sink, { Ok(sink.validate().await?) 
}) + dispatch_sink!(sink, sink, Ok(sink.validate().await?)) } diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index d6a7377f19928..1cd666e5d7160 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -47,7 +47,7 @@ pub type SourceManagerRef = Arc; pub type SplitAssignment = HashMap>>; pub struct SourceManager { - pub(crate) paused: Mutex<()>, + pub paused: Mutex<()>, env: MetaSrvEnv, barrier_scheduler: BarrierScheduler, core: Mutex, @@ -192,6 +192,7 @@ struct ConnectorSourceWorkerHandle { handle: JoinHandle<()>, sync_call_tx: UnboundedSender>>, splits: SharedSplitMapRef, + enable_scale_in: bool, } impl ConnectorSourceWorkerHandle { @@ -283,7 +284,9 @@ impl SourceManagerCore { *fragment_id, prev_actor_splits, &discovered_splits, - SplitDiffOptions::default(), + SplitDiffOptions { + enable_scale_in: handle.enable_scale_in, + }, ) { split_assignment.insert(*fragment_id, change); } @@ -603,6 +606,7 @@ impl SourceManager { fragment_id, empty_actor_splits, &prev_splits, + // pre-allocate splits is the first time getting splits and it does not have scale in scene SplitDiffOptions::default(), ) .unwrap_or_default(); @@ -701,7 +705,7 @@ impl SourceManager { let source_id = source.id; let connector_properties = extract_prop_from_source(&source)?; - + let enable_scale_in = connector_properties.enable_split_scale_in(); let handle = tokio::spawn(async move { let mut ticker = time::interval(Self::DEFAULT_SOURCE_TICK_INTERVAL); ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); @@ -739,6 +743,7 @@ impl SourceManager { handle, sync_call_tx, splits: current_splits_ref, + enable_scale_in, }, ); Ok(()) @@ -753,6 +758,7 @@ impl SourceManager { ) -> MetaResult<()> { let current_splits_ref = Arc::new(Mutex::new(SharedSplitMap { splits: None })); let connector_properties = extract_prop_from_source(source)?; + let enable_scale_in = connector_properties.enable_split_scale_in(); let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); let handle = dispatch_source_prop!(connector_properties, prop, { let mut worker = ConnectorSourceWorker::create( @@ -794,6 +800,7 @@ impl SourceManager { handle, sync_call_tx, splits: current_splits_ref, + enable_scale_in, }, ); diff --git a/src/meta/src/stream/stream_manager.rs b/src/meta/src/stream/stream_manager.rs index df642802361ad..77a784c64ac09 100644 --- a/src/meta/src/stream/stream_manager.rs +++ b/src/meta/src/stream/stream_manager.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use futures::future::{join_all, try_join_all, BoxFuture}; use itertools::Itertools; use risingwave_common::catalog::TableId; -use risingwave_pb::catalog::Table; +use risingwave_pb::catalog::{CreateType, Table}; use risingwave_pb::stream_plan::update_mutation::MergeUpdate; use risingwave_pb::stream_plan::Dispatcher; use risingwave_pb::stream_service::{ @@ -67,6 +67,8 @@ pub struct CreateStreamingJobContext { pub definition: String, pub mv_table_id: Option, + + pub create_type: CreateType, } impl CreateStreamingJobContext { @@ -112,22 +114,32 @@ impl CreatingStreamingJobInfo { jobs.remove(&job_id); } - async fn cancel_jobs(&self, job_ids: Vec) -> HashMap> { + async fn cancel_jobs( + &self, + job_ids: Vec, + ) -> (HashMap>, Vec) { let mut jobs = self.streaming_jobs.lock().await; let mut receivers = HashMap::new(); + let mut recovered_job_ids = vec![]; for job_id in job_ids { if let Some(job) = jobs.get_mut(&job_id) && let Some(shutdown_tx) = job.shutdown_tx.take() { let (tx, rx) = 
oneshot::channel(); - if shutdown_tx.send(CreatingState::Canceling{finish_tx: tx}).await.is_ok() { + if shutdown_tx.send(CreatingState::Canceling { finish_tx: tx }).await.is_ok() { receivers.insert(job_id, rx); } else { tracing::warn!("failed to send canceling state"); } + } else { + // If these job ids do not exist in streaming_jobs, + // we can infer they either: + // 1. are entirely non-existent, + // 2. OR they are recovered streaming jobs, and managed by BarrierManager. + recovered_job_ids.push(job_id); } } - receivers + (receivers, recovered_job_ids) } } @@ -159,26 +171,26 @@ pub struct ReplaceTableContext { /// `GlobalStreamManager` manages all the streams in the system. pub struct GlobalStreamManager { - pub(crate) env: MetaSrvEnv, + pub env: MetaSrvEnv, /// Manages definition and status of fragments and actors pub(super) fragment_manager: FragmentManagerRef, /// Broadcasts and collect barriers - pub(crate) barrier_scheduler: BarrierScheduler, + pub barrier_scheduler: BarrierScheduler, /// Maintains information of the cluster - pub(crate) cluster_manager: ClusterManagerRef, + pub cluster_manager: ClusterManagerRef, /// Maintains streaming sources from external system like kafka - pub(crate) source_manager: SourceManagerRef, + pub source_manager: SourceManagerRef, /// Creating streaming job info. creating_job_info: CreatingStreamingJobInfoRef, hummock_manager: HummockManagerRef, - pub(crate) reschedule_lock: RwLock<()>, + pub reschedule_lock: RwLock<()>, } impl GlobalStreamManager { @@ -407,7 +419,7 @@ impl GlobalStreamManager { definition, mv_table_id, internal_tables, - .. + create_type, }: CreateStreamingJobContext, ) -> MetaResult<()> { // Register to compaction group beforehand. @@ -424,8 +436,10 @@ impl GlobalStreamManager { table_fragments.internal_table_ids().len() + mv_table_id.map_or(0, |_| 1) ); revert_funcs.push(Box::pin(async move { - if let Err(e) = hummock_manager_ref.unregister_table_ids(®istered_table_ids).await { - tracing::warn!("Failed to unregister compaction group for {:#?}. They will be cleaned up on node restart. {:#?}", registered_table_ids, e); + if create_type == CreateType::Foreground { + if let Err(e) = hummock_manager_ref.unregister_table_ids(®istered_table_ids).await { + tracing::warn!("Failed to unregister compaction group for {:#?}. They will be cleaned up on node restart. {:#?}", registered_table_ids, e); + } } })); @@ -452,9 +466,11 @@ impl GlobalStreamManager { }) .await { - self.fragment_manager - .drop_table_fragments_vec(&HashSet::from_iter(std::iter::once(table_id))) - .await?; + if create_type == CreateType::Foreground { + self.fragment_manager + .drop_table_fragments_vec(&HashSet::from_iter(std::iter::once(table_id))) + .await?; + } return Err(err); } @@ -553,13 +569,18 @@ impl GlobalStreamManager { } /// Cancel streaming jobs and return the canceled table ids. + /// 1. Send cancel message to stream jobs (via `cancel_jobs`). + /// 2. Send cancel message to recovered stream jobs (via `barrier_scheduler`). + /// + /// Cleanup of their state will be cleaned up after the `CancelStreamJob` command succeeds, + /// by the barrier manager for both of them. 
pub async fn cancel_streaming_jobs(&self, table_ids: Vec) -> Vec { if table_ids.is_empty() { return vec![]; } let _reschedule_job_lock = self.reschedule_lock.read().await; - let receivers = self.creating_job_info.cancel_jobs(table_ids).await; + let (receivers, recovered_job_ids) = self.creating_job_info.cancel_jobs(table_ids).await; let futures = receivers.into_iter().map(|(id, receiver)| async move { if receiver.await.is_ok() { @@ -570,7 +591,35 @@ impl GlobalStreamManager { None } }); - join_all(futures).await.into_iter().flatten().collect_vec() + let mut cancelled_ids = join_all(futures).await.into_iter().flatten().collect_vec(); + + // NOTE(kwannoel): For recovered stream jobs, we can directly cancel them by running the barrier command, + // since Barrier manager manages the recovered stream jobs. + let futures = recovered_job_ids.into_iter().map(|id| async move { + let result: MetaResult<()> = try { + let fragment = self + .fragment_manager + .select_table_fragments_by_table_id(&id) + .await?; + self.barrier_scheduler + .run_command(Command::CancelStreamingJob(fragment)) + .await?; + }; + match result { + Ok(_) => { + tracing::info!("cancelled recovered streaming job {id}"); + Some(id) + }, + Err(_) => { + tracing::error!("failed to cancel recovered streaming job {id}, does {id} correspond to any jobs in `SHOW JOBS`?"); + None + }, + } + }); + let cancelled_recovered_ids = join_all(futures).await.into_iter().flatten().collect_vec(); + + cancelled_ids.extend(cancelled_recovered_ids); + cancelled_ids } } @@ -806,7 +855,7 @@ mod tests { .await?, ); - let (sink_manager, _) = SinkCoordinatorManager::start_worker(None); + let (sink_manager, _) = SinkCoordinatorManager::start_worker(); let barrier_manager = Arc::new(GlobalBarrierManager::new( scheduled_barriers, @@ -896,7 +945,7 @@ mod tests { }; self.catalog_manager - .start_create_table_procedure(&table) + .start_create_table_procedure(&table, vec![]) .await?; self.global_stream_manager .create_streaming_job(table_fragments, ctx) diff --git a/src/meta/src/telemetry.rs b/src/meta/src/telemetry.rs index 774b3cdda8146..fbbc89c2ff0ec 100644 --- a/src/meta/src/telemetry.rs +++ b/src/meta/src/telemetry.rs @@ -35,7 +35,7 @@ struct NodeCount { } #[derive(Debug, Serialize, Deserialize)] -pub(crate) struct MetaTelemetryReport { +pub struct MetaTelemetryReport { #[serde(flatten)] base: TelemetryReportBase, node_count: NodeCount, @@ -45,12 +45,12 @@ pub(crate) struct MetaTelemetryReport { impl TelemetryReport for MetaTelemetryReport {} -pub(crate) struct MetaTelemetryInfoFetcher { +pub struct MetaTelemetryInfoFetcher { tracking_id: ClusterId, } impl MetaTelemetryInfoFetcher { - pub(crate) fn new(tracking_id: ClusterId) -> Self { + pub fn new(tracking_id: ClusterId) -> Self { Self { tracking_id } } } @@ -63,13 +63,13 @@ impl TelemetryInfoFetcher for MetaTelemetryInfoFetcher { } #[derive(Clone)] -pub(crate) struct MetaReportCreator { +pub struct MetaReportCreator { cluster_mgr: Arc, meta_backend: MetaBackend, } impl MetaReportCreator { - pub(crate) fn new(cluster_mgr: Arc, meta_backend: MetaBackend) -> Self { + pub fn new(cluster_mgr: Arc, meta_backend: MetaBackend) -> Self { Self { cluster_mgr, meta_backend, @@ -79,6 +79,7 @@ impl MetaReportCreator { #[async_trait::async_trait] impl TelemetryReportCreator for MetaReportCreator { + #[expect(refining_impl_trait)] async fn create_report( &self, tracking_id: String, diff --git a/src/object_store/src/object/mod.rs b/src/object_store/src/object/mod.rs index e25159878f0db..96e58397dfa82 100644 --- 
a/src/object_store/src/object/mod.rs +++ b/src/object_store/src/object/mod.rs @@ -882,7 +882,7 @@ pub async fn parse_remote_object_store( } other => { unimplemented!( - "{} remote object store only supports s3, minio, disk, memory, and memory-shared for now.", + "{} remote object store only supports s3, minio, gcs, oss, cos, azure blob, hdfs, disk, memory, and memory-shared.", other ) } diff --git a/src/object_store/src/object/s3.rs b/src/object_store/src/object/s3.rs index 90e419567bceb..69e7f3687fdeb 100644 --- a/src/object_store/src/object/s3.rs +++ b/src/object_store/src/object/s3.rs @@ -616,7 +616,16 @@ impl S3ObjectStore { pub async fn with_minio(server: &str, metrics: Arc) -> Self { let server = server.strip_prefix("minio://").unwrap(); let (access_key_id, rest) = server.split_once(':').unwrap(); - let (secret_access_key, rest) = rest.split_once('@').unwrap(); + let (secret_access_key, mut rest) = rest.split_once('@').unwrap(); + let endpoint_prefix = if let Some(rest_stripped) = rest.strip_prefix("https://") { + rest = rest_stripped; + "https://" + } else if let Some(rest_stripped) = rest.strip_prefix("http://") { + rest = rest_stripped; + "http://" + } else { + "http://" + }; let (address, bucket) = rest.split_once('/').unwrap(); #[cfg(madsim)] @@ -626,10 +635,9 @@ impl S3ObjectStore { aws_sdk_s3::config::Builder::from(&aws_config::ConfigLoader::default().load().await) .force_path_style(true) .http_connector(Self::new_http_connector(&S3ObjectStoreConfig::default())); - let config = builder .region(Region::new("custom")) - .endpoint_url(format!("http://{}", address)) + .endpoint_url(format!("{}{}", endpoint_prefix, address)) .credentials_provider(Credentials::from_keys( access_key_id, secret_access_key, @@ -663,7 +671,6 @@ impl S3ObjectStore { range: impl ObjectRangeBounds, ) -> GetObjectFluentBuilder { let req = self.client.get_object().bucket(&self.bucket).key(path); - if range.is_full() { return req; } @@ -689,7 +696,7 @@ impl S3ObjectStore { /// - /// - MinIO /// - - pub async fn configure_bucket_lifecycle(&self) { + pub async fn configure_bucket_lifecycle(&self) -> bool { // Check if lifecycle is already configured to avoid overriding existing configuration. let bucket = self.bucket.as_str(); let mut configured_rules = vec![]; @@ -699,8 +706,12 @@ impl S3ObjectStore { .bucket(bucket) .send() .await; + let mut is_expiration_configured = false; if let Ok(config) = &get_config_result { for rule in config.rules().unwrap_or_default() { + if rule.expiration().is_some() { + is_expiration_configured = true; + } if matches!(rule.status().unwrap(), ExpirationStatus::Enabled) && rule.abort_incomplete_multipart_upload().is_some() { @@ -747,6 +758,13 @@ impl S3ObjectStore { tracing::warn!("Failed to configure life cycle rule for S3 bucket: {:?}. 
It is recommended to configure it manually to avoid unnecessary storage cost.", bucket); } } + if is_expiration_configured { + tracing::info!( + "S3 bucket {} has already configured the expiration for the lifecycle.", + bucket, + ); + } + is_expiration_configured } #[inline(always)] diff --git a/src/prost/Cargo.toml b/src/prost/Cargo.toml index dc6375662f806..d373207966640 100644 --- a/src/prost/Cargo.toml +++ b/src/prost/Cargo.toml @@ -10,9 +10,10 @@ repository = { workspace = true } [dependencies] enum-as-inner = "0.6" pbjson = "0.6" -prost = "0.11" +prost = { workspace = true } prost-helpers = { path = "helpers" } serde = { version = "1", features = ["derive"] } +strum = "0.25" tonic = { workspace = true } [target.'cfg(not(madsim))'.dependencies] diff --git a/src/prost/build.rs b/src/prost/build.rs index 172f9c0731a6d..5722a04767962 100644 --- a/src/prost/build.rs +++ b/src/prost/build.rs @@ -58,6 +58,9 @@ fn main() -> Result<(), Box> { .map(|f| format!("{}/{}.proto", proto_dir, f)) .collect(); + // Paths to generate `BTreeMap` for protobuf maps. + let btree_map_paths = [".monitor_service.StackTraceResponse"]; + // Build protobuf structs. // We first put generated files to `OUT_DIR`, then copy them to `/src` only if they are changed. @@ -72,7 +75,10 @@ fn main() -> Result<(), Box> { .compile_well_known_types(true) .protoc_arg("--experimental_allow_proto3_optional") .type_attribute(".", "#[derive(prost_helpers::AnyPB)]") - .type_attribute("node_body", "#[derive(::enum_as_inner::EnumAsInner)]") + .type_attribute( + "node_body", + "#[derive(::enum_as_inner::EnumAsInner, ::strum::Display)]", + ) .type_attribute("rex_node", "#[derive(::enum_as_inner::EnumAsInner)]") .type_attribute( "meta.PausedReason", @@ -82,6 +88,7 @@ fn main() -> Result<(), Box> { "stream_plan.Barrier.BarrierKind", "#[derive(::enum_as_inner::EnumAsInner)]", ) + .btree_map(btree_map_paths) // Eq + Hash are for plan nodes to do common sub-plan detection. // The requirement is from Source node -> SourceCatalog -> WatermarkDesc -> expr .type_attribute("catalog.WatermarkDesc", "#[derive(Eq, Hash)]") @@ -113,6 +120,7 @@ fn main() -> Result<(), Box> { // Implement `serde::Serialize` on those structs. let descriptor_set = fs_err::read(file_descriptor_set_path)?; pbjson_build::Builder::new() + .btree_map(btree_map_paths) .register_descriptors(&descriptor_set)? 
.out_dir(out_dir.as_path()) .build(&["."]) diff --git a/src/prost/helpers/Cargo.toml b/src/prost/helpers/Cargo.toml index 50d9b4febd80b..c78ac3f2a8ece 100644 --- a/src/prost/helpers/Cargo.toml +++ b/src/prost/helpers/Cargo.toml @@ -11,9 +11,6 @@ proc-macro2 = { version = "1", default-features = false } quote = "1" syn = "2" -[target.'cfg(not(madsim))'.dependencies] -workspace-hack = { path = "../../workspace-hack" } - [package.metadata.cargo-machete] ignored = ["workspace-hack"] diff --git a/src/risedevtool/config/Cargo.toml b/src/risedevtool/config/Cargo.toml index 441742e3c2b6c..e33eb0afd1647 100644 --- a/src/risedevtool/config/Cargo.toml +++ b/src/risedevtool/config/Cargo.toml @@ -11,7 +11,7 @@ repository = { workspace = true } anyhow = { version = "1", features = ["backtrace"] } clap = { version = "4", features = ["derive"] } console = "0.15" -dialoguer = "0.10" +dialoguer = "0.11" enum-iterator = "1" fs-err = "2.9.0" itertools = "0.11" diff --git a/src/risedevtool/config/src/main.rs b/src/risedevtool/config/src/main.rs index ac36d475b83c3..79df34bd815dc 100644 --- a/src/risedevtool/config/src/main.rs +++ b/src/risedevtool/config/src/main.rs @@ -396,12 +396,6 @@ fn main() -> Result<()> { )?; if chosen.contains(&component) { writeln!(file, "{}=true", component.env())?; - if component == Components::BuildConnectorNode { - writeln!( - file, - "CONNECTOR_LIBS_PATH=.risingwave/bin/connector-node/libs/" - )?; - } } else { writeln!(file, "# {}=true", component.env())?; } diff --git a/src/risedevtool/src/bin/risedev-compose.rs b/src/risedevtool/src/bin/risedev-compose.rs index 087c6519717f1..63925d919bb2b 100644 --- a/src/risedevtool/src/bin/risedev-compose.rs +++ b/src/risedevtool/src/bin/risedev-compose.rs @@ -222,7 +222,6 @@ fn main() -> Result<()> { (c.address.clone(), c.compose(&compose_config)?) 
} ServiceConfig::Redis(_) => return Err(anyhow!("not supported")), - ServiceConfig::ConnectorNode(_) => return Err(anyhow!("not supported")), }; compose.container_name = service.id().to_string(); if opts.deploy { diff --git a/src/risedevtool/src/bin/risedev-dev.rs b/src/risedevtool/src/bin/risedev-dev.rs index c2e586802489b..474e8dd0cbd15 100644 --- a/src/risedevtool/src/bin/risedev-dev.rs +++ b/src/risedevtool/src/bin/risedev-dev.rs @@ -25,10 +25,10 @@ use indicatif::ProgressBar; use risedev::util::{complete_spin, fail_spin}; use risedev::{ generate_risedev_env, preflight_check, AwsS3Config, CompactorService, ComputeNodeService, - ConfigExpander, ConfigureTmuxTask, ConnectorNodeService, EnsureStopService, ExecuteContext, - FrontendService, GrafanaService, KafkaService, MetaNodeService, MinioService, OpendalConfig, - PrometheusService, PubsubService, RedisService, ServiceConfig, Task, TempoService, - ZooKeeperService, RISEDEV_SESSION_NAME, + ConfigExpander, ConfigureTmuxTask, EnsureStopService, ExecuteContext, FrontendService, + GrafanaService, KafkaService, MetaNodeService, MinioService, OpendalConfig, PrometheusService, + PubsubService, RedisService, ServiceConfig, Task, TempoService, ZooKeeperService, + RISEDEV_SESSION_NAME, }; use tempfile::tempdir; use yaml_rust::YamlEmitter; @@ -114,7 +114,6 @@ fn task_main( ServiceConfig::AwsS3(_) => None, ServiceConfig::OpenDal(_) => None, ServiceConfig::RedPanda(_) => None, - ServiceConfig::ConnectorNode(c) => Some((c.port, c.id.clone())), }; if let Some(x) = listen_info { @@ -339,17 +338,6 @@ fn task_main( ctx.pb .set_message(format!("redis {}:{}", c.address, c.port)); } - ServiceConfig::ConnectorNode(c) => { - let mut ctx = - ExecuteContext::new(&mut logger, manager.new_progress(), status_dir.clone()); - let mut service = ConnectorNodeService::new(c.clone())?; - service.execute(&mut ctx)?; - let mut task = - risedev::ConfigureGrpcNodeTask::new(c.address.clone(), c.port, false)?; - task.execute(&mut ctx)?; - ctx.pb - .set_message(format!("connector grpc://{}:{}", c.address, c.port)); - } } let service_id = service.id().to_string(); diff --git a/src/risedevtool/src/config.rs b/src/risedevtool/src/config.rs index fe7d677a6a765..09e530487d4f0 100644 --- a/src/risedevtool/src/config.rs +++ b/src/risedevtool/src/config.rs @@ -171,9 +171,6 @@ impl ConfigExpander { "kafka" => ServiceConfig::Kafka(serde_yaml::from_str(&out_str)?), "pubsub" => ServiceConfig::Pubsub(serde_yaml::from_str(&out_str)?), "redis" => ServiceConfig::Redis(serde_yaml::from_str(&out_str)?), - "connector-node" => { - ServiceConfig::ConnectorNode(serde_yaml::from_str(&out_str)?) 
- } "zookeeper" => ServiceConfig::ZooKeeper(serde_yaml::from_str(&out_str)?), "redpanda" => ServiceConfig::RedPanda(serde_yaml::from_str(&out_str)?), other => return Err(anyhow!("unsupported use type: {}", other)), diff --git a/src/risedevtool/src/config_gen/prometheus_gen.rs b/src/risedevtool/src/config_gen/prometheus_gen.rs index aa6422416a31f..2143031f1ba21 100644 --- a/src/risedevtool/src/config_gen/prometheus_gen.rs +++ b/src/risedevtool/src/config_gen/prometheus_gen.rs @@ -79,14 +79,6 @@ impl PrometheusGen { .map(|node| format!("\"{}:{}\"", node.address, 9644)) .join(","); - let connector_node_targets = config - .provide_connector_node - .as_ref() - .unwrap() - .iter() - .map(|node| format!("\"{}:{}\"", node.address, node.exporter_port)) - .join(","); - let now = Local::now().format("%Y%m%d-%H%M%S"); let remote_write = if config.remote_write { @@ -151,10 +143,6 @@ scrape_configs: - job_name: redpanda static_configs: - targets: [{redpanda_targets}] - - - job_name: connector-node - static_configs: - - targets: [{connector_node_targets}] "#, ) } diff --git a/src/risedevtool/src/risedev_env.rs b/src/risedevtool/src/risedev_env.rs index 20b5fa97dae34..2ab3e350165f5 100644 --- a/src/risedevtool/src/risedev_env.rs +++ b/src/risedevtool/src/risedev_env.rs @@ -19,8 +19,9 @@ use std::process::Command; use crate::{add_hummock_backend, HummockInMemoryStrategy, ServiceConfig}; -/// Generate environment variables from the given service configurations to be used by future -/// RiseDev commands, like `risedev ctl` or `risedev psql`. +/// Generate environment variables (put in file `.risingwave/config/risedev-env`) +/// from the given service configurations to be used by future +/// RiseDev commands, like `risedev ctl` or `risedev psql` (). pub fn generate_risedev_env(services: &Vec) -> String { let mut env = String::new(); for item in services { diff --git a/src/risedevtool/src/service_config.rs b/src/risedevtool/src/service_config.rs index 8890f984971fe..516ae872d6c31 100644 --- a/src/risedevtool/src/service_config.rs +++ b/src/risedevtool/src/service_config.rs @@ -37,7 +37,6 @@ pub struct ComputeNodeConfig { pub provide_aws_s3: Option>, pub provide_tempo: Option>, pub user_managed: bool, - pub connector_rpc_endpoint: String, pub total_memory_bytes: usize, pub parallelism: usize, @@ -61,7 +60,6 @@ pub struct MetaNodeConfig { pub user_managed: bool, - pub connector_rpc_endpoint: String, pub provide_etcd_backend: Option>, pub provide_prometheus: Option>, @@ -190,7 +188,6 @@ pub struct PrometheusConfig { pub provide_etcd: Option>, pub provide_redpanda: Option>, pub provide_frontend: Option>, - pub provide_connector_node: Option>, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -318,18 +315,6 @@ pub struct RedisConfig { pub address: String, } -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "kebab-case")] -#[serde(deny_unknown_fields)] -pub struct ConnectorNodeConfig { - #[serde(rename = "use")] - phantom_use: Option, - pub id: String, - pub port: u16, - pub exporter_port: u16, - pub address: String, -} - /// All service configuration #[derive(Clone, Debug, PartialEq)] pub enum ServiceConfig { @@ -349,7 +334,6 @@ pub enum ServiceConfig { Redis(RedisConfig), ZooKeeper(ZooKeeperConfig), RedPanda(RedPandaConfig), - ConnectorNode(ConnectorNodeConfig), } impl ServiceConfig { @@ -370,7 +354,6 @@ impl ServiceConfig { Self::Pubsub(c) => &c.id, Self::Redis(c) => &c.id, Self::RedPanda(c) => &c.id, - Self::ConnectorNode(c) => &c.id, Self::OpenDal(c) => &c.id, } } 
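For reference, the election-driver changes at the top of this section replace sqlx queries with sea-orm's raw-statement interface: parameters are packed into `Value`s, the statement is executed through `ConnectionTrait::query_one` / `query_all`, and rows are decoded with `FromQueryResult::from_query_result`. Below is a minimal, illustrative sketch of that pattern (not part of the patch); it assumes sea-orm with the SQLite driver enabled, plus a hypothetical `members` table and `MemberRow` struct.

use sea_orm::{ConnectionTrait, DatabaseConnection, DbBackend, DbErr, FromQueryResult, Statement, Value};

// Hypothetical typed row; plays the role of `ElectionRow` in the driver code above.
#[derive(Debug, FromQueryResult)]
struct MemberRow {
    service: String,
    id: String,
}

async fn leader_of(conn: &DatabaseConnection, service: &str) -> Result<Option<MemberRow>, DbErr> {
    // Raw SQL with positional parameters bound through `Value`s (instead of sqlx's `.bind()`).
    let stmt = Statement::from_sql_and_values(
        DbBackend::Sqlite,
        "SELECT service, id FROM members WHERE service = $1;",
        vec![Value::from(service)],
    );
    // `query_one` yields `Option<QueryResult>`; decode it into the typed row.
    conn.query_one(stmt)
        .await?
        .map(|row| MemberRow::from_query_result(&row, ""))
        .transpose()
}

A connection for such a sketch can be opened the same way the updated tests do, e.g. `Database::connect("sqlite::memory:").await?`.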
diff --git a/src/risedevtool/src/task.rs b/src/risedevtool/src/task.rs index 262a68c52cb9a..a2e4ec9bf46dc 100644 --- a/src/risedevtool/src/task.rs +++ b/src/risedevtool/src/task.rs @@ -15,7 +15,6 @@ mod compactor_service; mod compute_node_service; mod configure_tmux_service; -mod connector_service; mod ensure_stop_service; mod etcd_service; mod frontend_service; @@ -52,7 +51,6 @@ pub use utils::*; pub use self::compactor_service::*; pub use self::compute_node_service::*; pub use self::configure_tmux_service::*; -pub use self::connector_service::*; pub use self::ensure_stop_service::*; pub use self::etcd_service::*; pub use self::frontend_service::*; diff --git a/src/risedevtool/src/task/compactor_service.rs b/src/risedevtool/src/task/compactor_service.rs index d94083745154e..adecc007b8207 100644 --- a/src/risedevtool/src/task/compactor_service.rs +++ b/src/risedevtool/src/task/compactor_service.rs @@ -53,9 +53,7 @@ impl CompactorService { config.listen_address, config.exporter_port )) .arg("--advertise-addr") - .arg(format!("{}:{}", config.address, config.port)) - .arg("--metrics-level") - .arg("info"); + .arg(format!("{}:{}", config.address, config.port)); if let Some(compaction_worker_threads_number) = config.compaction_worker_threads_number.as_ref() { @@ -84,8 +82,6 @@ impl Task for CompactorService { cmd.env("RUST_BACKTRACE", "1"); - // FIXME: Otherwise, CI will throw log size too large error - // cmd.env("RW_QUERY_LOG_PATH", DEFAULT_QUERY_LOG_PATH); if crate::util::is_env_set("RISEDEV_ENABLE_PROFILE") { cmd.env( "RW_PROFILE_PATH", @@ -95,10 +91,9 @@ impl Task for CompactorService { if crate::util::is_env_set("RISEDEV_ENABLE_HEAP_PROFILE") { // See https://linux.die.net/man/3/jemalloc for the descriptions of profiling options - cmd.env( - "MALLOC_CONF", - "prof:true,lg_prof_interval:34,lg_prof_sample:19,prof_prefix:compactor", - ); + let conf = "prof:true,lg_prof_interval:34,lg_prof_sample:19,prof_prefix:compactor"; + cmd.env("_RJEM_MALLOC_CONF", conf); // prefixed for macos + cmd.env("MALLOC_CONF", conf); // unprefixed for linux } cmd.arg("--config-path") diff --git a/src/risedevtool/src/task/compute_node_service.rs b/src/risedevtool/src/task/compute_node_service.rs index 6c705154e0578..ced6bec115f6a 100644 --- a/src/risedevtool/src/task/compute_node_service.rs +++ b/src/risedevtool/src/task/compute_node_service.rs @@ -56,12 +56,8 @@ impl ComputeNodeService { )) .arg("--advertise-addr") .arg(format!("{}:{}", config.address, config.port)) - .arg("--metrics-level") - .arg("info") .arg("--async-stack-trace") .arg(&config.async_stack_trace) - .arg("--connector-rpc-endpoint") - .arg(&config.connector_rpc_endpoint) .arg("--parallelism") .arg(&config.parallelism.to_string()) .arg("--total-memory-bytes") @@ -92,8 +88,6 @@ impl Task for ComputeNodeService { "TOKIO_CONSOLE_BIND", format!("127.0.0.1:{}", self.config.port + 1000), ); - // FIXME: Otherwise, CI will throw log size too large error - // cmd.env("RW_QUERY_LOG_PATH", DEFAULT_QUERY_LOG_PATH); if crate::util::is_env_set("RISEDEV_ENABLE_PROFILE") { cmd.env( "RW_PROFILE_PATH", @@ -103,9 +97,16 @@ impl Task for ComputeNodeService { if crate::util::is_env_set("RISEDEV_ENABLE_HEAP_PROFILE") { // See https://linux.die.net/man/3/jemalloc for the descriptions of profiling options + let conf = "prof:true,lg_prof_interval:34,lg_prof_sample:19,prof_prefix:compute-node"; + cmd.env("_RJEM_MALLOC_CONF", conf); // prefixed for macos + cmd.env("MALLOC_CONF", conf); // unprefixed for linux + } + + if crate::util::is_env_set("ENABLE_BUILD_RW_CONNECTOR") { 
+ let prefix_bin = env::var("PREFIX_BIN")?; cmd.env( - "MALLOC_CONF", - "prof:true,lg_prof_interval:34,lg_prof_sample:19,prof_prefix:compute-node", + "CONNECTOR_LIBS_PATH", + Path::new(&prefix_bin).join("connector-node/libs/"), ); } diff --git a/src/risedevtool/src/task/connector_service.rs b/src/risedevtool/src/task/connector_service.rs deleted file mode 100644 index 05268db6a43ea..0000000000000 --- a/src/risedevtool/src/task/connector_service.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::env; -use std::io::Write; -use std::path::{Path, PathBuf}; -use std::process::Command; - -use anyhow::{anyhow, Result}; - -use crate::{ConnectorNodeConfig, ExecuteContext, Task}; - -pub struct ConnectorNodeService { - pub config: ConnectorNodeConfig, -} - -impl ConnectorNodeService { - pub fn new(config: ConnectorNodeConfig) -> Result { - Ok(Self { config }) - } - - fn connector_path(&self) -> Result { - let prefix_bin = env::var("PREFIX_BIN")?; - Ok(Path::new(&prefix_bin) - .join("connector-node") - .join("start-service.sh")) - } -} - -impl Task for ConnectorNodeService { - fn execute(&mut self, ctx: &mut ExecuteContext) -> Result<()> { - ctx.service(self); - ctx.pb.set_message("starting"); - let path = self.connector_path()?; - if !path.exists() { - return Err(anyhow!("RisingWave connector binary not found in {:?}\nPlease enable building RisingWave connector in `./risedev configure`?", path)); - } - let mut cmd = Command::new(path); - cmd.arg("-p").arg(self.config.port.to_string()); - ctx.run_command(ctx.tmux_run(cmd)?)?; - ctx.pb.set_message("started"); - - Ok(()) - } - - fn id(&self) -> String { - self.config.id.clone() - } -} diff --git a/src/risedevtool/src/task/frontend_service.rs b/src/risedevtool/src/task/frontend_service.rs index dd0015ac188bd..cf0213028e465 100644 --- a/src/risedevtool/src/task/frontend_service.rs +++ b/src/risedevtool/src/task/frontend_service.rs @@ -61,9 +61,7 @@ impl FrontendService { .arg(format!( "{}:{}", config.listen_address, config.health_check_port - )) - .arg("--metrics-level") - .arg("info"); + )); let provide_meta_node = config.provide_meta_node.as_ref().unwrap(); if provide_meta_node.is_empty() { @@ -94,8 +92,6 @@ impl Task for FrontendService { let mut cmd = self.frontend()?; cmd.env("RUST_BACKTRACE", "1"); - // FIXME: Otherwise, CI will throw log size too large error - // cmd.env("RW_QUERY_LOG_PATH", DEFAULT_QUERY_LOG_PATH); let prefix_config = env::var("PREFIX_CONFIG")?; cmd.arg("--config-path") diff --git a/src/risedevtool/src/task/meta_node_service.rs b/src/risedevtool/src/task/meta_node_service.rs index 64aca22962f9c..2494a9eceaf16 100644 --- a/src/risedevtool/src/task/meta_node_service.rs +++ b/src/risedevtool/src/task/meta_node_service.rs @@ -60,13 +60,10 @@ impl MetaNodeService { config.listen_address, config.dashboard_port )); - cmd.arg("--prometheus-host") - .arg(format!( - "{}:{}", - config.listen_address, config.exporter_port - )) - 
.arg("--connector-rpc-endpoint") - .arg(&config.connector_rpc_endpoint); + cmd.arg("--prometheus-host").arg(format!( + "{}:{}", + config.listen_address, config.exporter_port + )); match config.provide_prometheus.as_ref().unwrap().as_slice() { [] => {} @@ -174,8 +171,6 @@ impl Task for MetaNodeService { let mut cmd = self.meta_node()?; cmd.env("RUST_BACKTRACE", "1"); - // FIXME: Otherwise, CI will throw log size too large error - // cmd.env("RW_QUERY_LOG_PATH", DEFAULT_QUERY_LOG_PATH); if crate::util::is_env_set("RISEDEV_ENABLE_PROFILE") { cmd.env( @@ -186,9 +181,16 @@ impl Task for MetaNodeService { if crate::util::is_env_set("RISEDEV_ENABLE_HEAP_PROFILE") { // See https://linux.die.net/man/3/jemalloc for the descriptions of profiling options + let conf = "prof:true,lg_prof_interval:32,lg_prof_sample:19,prof_prefix:meta-node"; + cmd.env("_RJEM_MALLOC_CONF", conf); // prefixed for macos + cmd.env("MALLOC_CONF", conf); // unprefixed for linux + } + + if crate::util::is_env_set("ENABLE_BUILD_RW_CONNECTOR") { + let prefix_bin = env::var("PREFIX_BIN")?; cmd.env( - "MALLOC_CONF", - "prof:true,lg_prof_interval:32,lg_prof_sample:19,prof_prefix:meta-node", + "CONNECTOR_LIBS_PATH", + Path::new(&prefix_bin).join("connector-node/libs/"), ); } diff --git a/src/risedevtool/src/task/utils.rs b/src/risedevtool/src/task/utils.rs index dbb52aaa5e644..cbf1bb8cdcedf 100644 --- a/src/risedevtool/src/task/utils.rs +++ b/src/risedevtool/src/task/utils.rs @@ -19,9 +19,6 @@ use itertools::Itertools; use crate::{AwsS3Config, MetaNodeConfig, MinioConfig, OpendalConfig, TempoConfig}; -#[allow(dead_code)] -pub(crate) const DEFAULT_QUERY_LOG_PATH: &str = ".risingwave/log/"; - /// Add a meta node to the parameters. pub fn add_meta_node(provide_meta_node: &[MetaNodeConfig], cmd: &mut Command) -> Result<()> { match provide_meta_node { diff --git a/src/rpc_client/src/compactor_client.rs b/src/rpc_client/src/compactor_client.rs index cdd1b08049087..77fd3e0a44700 100644 --- a/src/rpc_client/src/compactor_client.rs +++ b/src/rpc_client/src/compactor_client.rs @@ -12,15 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::Arc; use std::time::Duration; use risingwave_common::util::addr::HostAddr; +use risingwave_pb::hummock::hummock_manager_service_client::HummockManagerServiceClient; +use risingwave_pb::hummock::{ + GetNewSstIdsRequest, GetNewSstIdsResponse, ReportCompactionTaskRequest, + ReportCompactionTaskResponse, ReportFullScanTaskRequest, ReportFullScanTaskResponse, + ReportVacuumTaskRequest, ReportVacuumTaskResponse, +}; +use risingwave_pb::meta::system_params_service_client::SystemParamsServiceClient; +use risingwave_pb::meta::{GetSystemParamsRequest, GetSystemParamsResponse}; use risingwave_pb::monitor_service::monitor_service_client::MonitorServiceClient; use risingwave_pb::monitor_service::{StackTraceRequest, StackTraceResponse}; +use tokio::sync::RwLock; +use tokio_retry::strategy::{jitter, ExponentialBackoff}; use tonic::transport::{Channel, Endpoint}; use crate::error::Result; +use crate::retry_rpc; +const ENDPOINT_KEEP_ALIVE_INTERVAL_SEC: u64 = 60; +const ENDPOINT_KEEP_ALIVE_TIMEOUT_SEC: u64 = 60; +const DEFAULT_RETRY_INTERVAL: u64 = 20; +const DEFAULT_RETRY_MAX_DELAY: Duration = Duration::from_secs(5); +const DEFAULT_RETRY_MAX_ATTEMPTS: usize = 3; #[derive(Clone)] pub struct CompactorClient { pub monitor_client: MonitorServiceClient, @@ -46,3 +63,154 @@ impl CompactorClient { .into_inner()) } } + +#[derive(Debug, Clone)] +pub struct GrpcCompactorProxyClientCore { + hummock_client: HummockManagerServiceClient, + system_params_client: SystemParamsServiceClient, +} + +impl GrpcCompactorProxyClientCore { + pub(crate) fn new(channel: Channel) -> Self { + let hummock_client = + HummockManagerServiceClient::new(channel.clone()).max_decoding_message_size(usize::MAX); + let system_params_client = SystemParamsServiceClient::new(channel); + + Self { + hummock_client, + system_params_client, + } + } +} + +/// Client to proxy server. Cloning the instance is lightweight. +/// +/// Todo(wcy-fdu): add refresh client interface. 
+#[derive(Debug, Clone)] +pub struct GrpcCompactorProxyClient { + pub core: Arc>, + endpoint: String, +} + +impl GrpcCompactorProxyClient { + pub fn new(channel: Channel, endpoint: String) -> Self { + let core = Arc::new(RwLock::new(GrpcCompactorProxyClientCore::new(channel))); + Self { core, endpoint } + } + + async fn recreate_core(&self) { + tracing::info!("GrpcCompactorProxyClient rpc transfer failed, try to reconnect"); + let channel = self.connect_to_endpoint().await; + let mut core = self.core.write().await; + *core = GrpcCompactorProxyClientCore::new(channel); + } + + async fn connect_to_endpoint(&self) -> Channel { + let endpoint = + Endpoint::from_shared(self.endpoint.clone()).expect("Fail to construct tonic Endpoint"); + endpoint + .http2_keep_alive_interval(Duration::from_secs(ENDPOINT_KEEP_ALIVE_INTERVAL_SEC)) + .keep_alive_timeout(Duration::from_secs(ENDPOINT_KEEP_ALIVE_TIMEOUT_SEC)) + .connect_timeout(Duration::from_secs(5)) + .connect() + .await + .expect("Failed to create channel via proxy rpc endpoint.") + } + + pub async fn get_new_sst_ids( + &self, + request: GetNewSstIdsRequest, + ) -> std::result::Result, tonic::Status> { + retry_rpc!(self, get_new_sst_ids, request, GetNewSstIdsResponse) + } + + pub async fn report_compaction_task( + &self, + request: ReportCompactionTaskRequest, + ) -> std::result::Result, tonic::Status> { + retry_rpc!( + self, + report_compaction_task, + request, + ReportCompactionTaskResponse + ) + } + + pub async fn report_full_scan_task( + &self, + request: ReportFullScanTaskRequest, + ) -> std::result::Result, tonic::Status> { + retry_rpc!( + self, + report_full_scan_task, + request, + ReportFullScanTaskResponse + ) + } + + pub async fn report_vacuum_task( + &self, + request: ReportVacuumTaskRequest, + ) -> std::result::Result, tonic::Status> { + retry_rpc!(self, report_vacuum_task, request, ReportVacuumTaskResponse) + } + + pub async fn get_system_params( + &self, + ) -> std::result::Result, tonic::Status> { + tokio_retry::RetryIf::spawn( + Self::get_retry_strategy(), + || async { + let mut system_params_client = self.core.read().await.system_params_client.clone(); + let rpc_res = system_params_client + .get_system_params(GetSystemParamsRequest {}) + .await; + if rpc_res.is_err() { + self.recreate_core().await; + } + rpc_res + }, + Self::should_retry, + ) + .await + } + + #[inline(always)] + fn get_retry_strategy() -> impl Iterator { + ExponentialBackoff::from_millis(DEFAULT_RETRY_INTERVAL) + .max_delay(DEFAULT_RETRY_MAX_DELAY) + .take(DEFAULT_RETRY_MAX_ATTEMPTS) + .map(jitter) + } + + #[inline(always)] + fn should_retry(status: &tonic::Status) -> bool { + if status.code() == tonic::Code::Unavailable + || status.code() == tonic::Code::Unknown + || (status.code() == tonic::Code::Unauthenticated + && status.message().contains("invalid auth token")) + { + return true; + } + false + } +} + +#[macro_export] +macro_rules! retry_rpc { + ($self:expr, $rpc_call:ident, $request:expr, $response:ty) => { + tokio_retry::RetryIf::spawn( + Self::get_retry_strategy(), + || async { + let mut hummock_client = $self.core.read().await.hummock_client.clone(); + let rpc_res = hummock_client.$rpc_call($request.clone()).await; + if rpc_res.is_err() { + $self.recreate_core().await; + } + rpc_res + }, + Self::should_retry, + ) + .await + }; +} diff --git a/src/rpc_client/src/lib.rs b/src/rpc_client/src/lib.rs index 7d94c1a0d789d..3e744bb61608d 100644 --- a/src/rpc_client/src/lib.rs +++ b/src/rpc_client/src/lib.rs @@ -16,7 +16,6 @@ //! response gRPC message structs. 
#![feature(trait_alias)] -#![feature(binary_heap_drain_sorted)] #![feature(result_option_inspect)] #![feature(type_alias_impl_trait)] #![feature(associated_type_defaults)] @@ -31,12 +30,13 @@ use std::any::type_name; use std::fmt::{Debug, Formatter}; use std::future::Future; use std::iter::repeat; +use std::pin::pin; use std::sync::Arc; use anyhow::anyhow; use async_trait::async_trait; -use futures::future::try_join_all; -use futures::stream::BoxStream; +use futures::future::{select, try_join_all, Either}; +use futures::stream::{BoxStream, Peekable}; use futures::{Stream, StreamExt}; use moka::future::Cache; use rand::prelude::SliceRandom; @@ -58,7 +58,9 @@ mod sink_coordinate_client; mod stream_client; mod tracing; -pub use compactor_client::CompactorClient; +use std::pin::Pin; + +pub use compactor_client::{CompactorClient, GrpcCompactorProxyClient}; pub use compute_client::{ComputeClient, ComputeClientPool, ComputeClientPoolRef}; pub use connector_client::{ConnectorClient, SinkCoordinatorStreamHandle, SinkWriterStreamHandle}; pub use hummock_meta_client::{CompactionEventItem, HummockMetaClient}; @@ -173,7 +175,7 @@ macro_rules! meta_rpc_client_method_impl { pub struct BidiStreamHandle { request_sender: Sender, - response_stream: BoxStream<'static, std::result::Result>, + response_stream: Peekable>>, } impl Debug for BidiStreamHandle { @@ -189,7 +191,7 @@ impl BidiStreamHandle { ) -> Self { Self { request_sender, - response_stream, + response_stream: response_stream.peekable(), } } @@ -223,7 +225,7 @@ impl BidiStreamHandle { Ok(( Self { request_sender, - response_stream: response_stream.boxed(), + response_stream: response_stream.boxed().peekable(), }, first_response, )) @@ -238,10 +240,25 @@ impl BidiStreamHandle { } pub async fn send_request(&mut self, request: REQ) -> Result<()> { - Ok(self - .request_sender - .send(request) - .await - .map_err(|_| anyhow!("unable to send request {}", type_name::()))?) 
+ // Poll the response stream to early see the error + let send_request_result = match select( + pin!(self.request_sender.send(request)), + pin!(Pin::new(&mut self.response_stream).peek()), + ) + .await + { + Either::Left((result, _)) => result, + Either::Right((response_result, send_future)) => match response_result { + None => { + return Err(anyhow!("end of response stream").into()); + } + Some(Err(e)) => { + return Err(e.clone().into()); + } + Some(Ok(_)) => send_future.await, + }, + }; + send_request_result + .map_err(|_| anyhow!("unable to send request {}", type_name::()).into()) } } diff --git a/src/rpc_client/src/meta_client.rs b/src/rpc_client/src/meta_client.rs index 2b695f9c045b0..827860d1af7b3 100644 --- a/src/rpc_client/src/meta_client.rs +++ b/src/rpc_client/src/meta_client.rs @@ -50,6 +50,7 @@ use risingwave_pb::ddl_service::alter_relation_name_request::Relation; use risingwave_pb::ddl_service::ddl_service_client::DdlServiceClient; use risingwave_pb::ddl_service::drop_table_request::SourceId; use risingwave_pb::ddl_service::*; +use risingwave_pb::hummock::get_compaction_score_response::PickerInfo; use risingwave_pb::hummock::hummock_manager_service_client::HummockManagerServiceClient; use risingwave_pb::hummock::rise_ctl_update_compaction_config_request::mutable_config::MutableConfig; use risingwave_pb::hummock::subscribe_compaction_event_request::Register; @@ -332,12 +333,10 @@ impl MetaClient { &self, table: PbTable, graph: StreamFragmentGraph, - stream_job_execution_mode: StreamJobExecutionMode, ) -> Result<(TableId, CatalogVersion)> { let request = CreateMaterializedViewRequest { materialized_view: Some(table), fragment_graph: Some(graph), - stream_job_execution_mode: stream_job_execution_mode as i32, }; let resp = self.inner.create_materialized_view(request).await?; // TODO: handle error in `resp.status` here @@ -666,7 +665,7 @@ impl MetaClient { extra_info.push(info); } } - tracing::trace!(target: "events::meta::client_heartbeat", "heartbeat"); + tracing::debug!(target: "events::meta::client_heartbeat", "heartbeat"); match tokio::time::timeout( // TODO: decide better min_interval for timeout min_interval * 3, @@ -938,10 +937,10 @@ impl MetaClient { Ok(resp.job_id) } - pub async fn get_backup_job_status(&self, job_id: u64) -> Result { + pub async fn get_backup_job_status(&self, job_id: u64) -> Result<(BackupJobStatus, String)> { let req = GetBackupJobStatusRequest { job_id }; let resp = self.inner.get_backup_job_status(req).await?; - Ok(resp.job_status()) + Ok((resp.job_status(), resp.message)) } pub async fn delete_meta_snapshot(&self, snapshot_ids: &[u64]) -> Result<()> { @@ -1047,6 +1046,23 @@ impl MetaClient { )) } + pub async fn get_compaction_score( + &self, + compaction_group_id: CompactionGroupId, + ) -> Result> { + let req = GetCompactionScoreRequest { + compaction_group_id, + }; + let resp = self.inner.get_compaction_score(req).await?; + Ok(resp.scores) + } + + pub async fn risectl_rebuild_table_stats(&self) -> Result<()> { + let req = RiseCtlRebuildTableStatsRequest {}; + let _resp = self.inner.rise_ctl_rebuild_table_stats(req).await?; + Ok(()) + } + pub async fn list_branched_object(&self) -> Result> { let req = ListBranchedObjectRequest {}; let resp = self.inner.list_branched_object(req).await?; @@ -1730,6 +1746,8 @@ macro_rules! 
for_all_meta_rpc { ,{ hummock_client, init_metadata_for_replay, InitMetadataForReplayRequest, InitMetadataForReplayResponse } ,{ hummock_client, split_compaction_group, SplitCompactionGroupRequest, SplitCompactionGroupResponse } ,{ hummock_client, rise_ctl_list_compaction_status, RiseCtlListCompactionStatusRequest, RiseCtlListCompactionStatusResponse } + ,{ hummock_client, get_compaction_score, GetCompactionScoreRequest, GetCompactionScoreResponse } + ,{ hummock_client, rise_ctl_rebuild_table_stats, RiseCtlRebuildTableStatsRequest, RiseCtlRebuildTableStatsResponse } ,{ hummock_client, subscribe_compaction_event, impl tonic::IntoStreamingRequest, Streaming } ,{ hummock_client, list_branched_object, ListBranchedObjectRequest, ListBranchedObjectResponse } ,{ hummock_client, list_active_write_limit, ListActiveWriteLimitRequest, ListActiveWriteLimitResponse } diff --git a/src/source/Cargo.toml b/src/source/Cargo.toml index bf60bc45f7395..aedb0b9158908 100644 --- a/src/source/Cargo.toml +++ b/src/source/Cargo.toml @@ -15,7 +15,6 @@ normal = ["workspace-hack"] [dependencies] anyhow = "1" -easy-ext = "1" futures = { version = "0.3", default-features = false, features = ["alloc"] } futures-async-stream = { workspace = true } itertools = "0.11" diff --git a/src/source/benches/json_parser.rs b/src/source/benches/json_parser.rs index 70df93b902f57..e54a51befa9f1 100644 --- a/src/source/benches/json_parser.rs +++ b/src/source/benches/json_parser.rs @@ -85,11 +85,11 @@ fn generate_json_row(rng: &mut impl Rng) -> String { ) } -fn generate_json_rows() -> Vec>> { +fn generate_json_rows() -> Vec> { let mut rng = rand::thread_rng(); let mut records = Vec::with_capacity(NUM_RECORDS); for _ in 0..NUM_RECORDS { - records.push(Some(generate_json_row(&mut rng).into_bytes())); + records.push(generate_json_row(&mut rng).into_bytes()); } records } diff --git a/src/source/src/connector_source.rs b/src/source/src/connector_source.rs index 445bf0f6dbb90..733ba6a8c4a83 100644 --- a/src/source/src/connector_source.rs +++ b/src/source/src/connector_source.rs @@ -18,17 +18,22 @@ use std::sync::Arc; use futures::future::try_join_all; use futures::stream::pending; use futures::StreamExt; +use futures_async_stream::try_stream; use itertools::Itertools; use risingwave_common::catalog::ColumnId; use risingwave_common::error::ErrorCode::ConnectorError; -use risingwave_common::error::{internal_error, Result}; +use risingwave_common::error::{internal_error, Result, RwError}; use risingwave_common::util::select_all; use risingwave_connector::dispatch_source_prop; use risingwave_connector::parser::{CommonParserConfig, ParserConfig, SpecificParserConfig}; +use risingwave_connector::source::filesystem::{FsPage, FsPageItem, S3SplitEnumerator}; use risingwave_connector::source::{ - create_split_reader, BoxSourceWithStateStream, Column, ConnectorProperties, ConnectorState, - SourceColumnDesc, SourceContext, SplitReader, + create_split_reader, BoxSourceWithStateStream, BoxTryStream, Column, ConnectorProperties, + ConnectorState, FsFilterCtrlCtx, FsListInner, SourceColumnDesc, SourceContext, + SourceEnumeratorContext, SplitEnumerator, SplitReader, }; +use tokio::time; +use tokio::time::{Duration, MissedTickBehavior}; #[derive(Clone, Debug)] pub struct ConnectorSource { @@ -38,6 +43,15 @@ pub struct ConnectorSource { pub connector_message_buffer_size: usize, } +#[derive(Clone, Debug)] +pub struct FsListCtrlContext { + pub interval: Duration, + pub last_tick: Option, + + pub filter_ctx: FsFilterCtrlCtx, +} +pub type FsListCtrlContextRef = 
Arc; + impl ConnectorSource { pub fn new( properties: HashMap, @@ -74,6 +88,25 @@ impl ConnectorSource { .collect::>>() } + pub async fn get_source_list(&self) -> Result> { + let config = self.config.clone(); + let lister = match config { + ConnectorProperties::S3(prop) => { + S3SplitEnumerator::new(*prop, Arc::new(SourceEnumeratorContext::default())).await? + } + other => return Err(internal_error(format!("Unsupported source: {:?}", other))), + }; + + Ok(build_fs_list_stream( + FsListCtrlContext { + interval: Duration::from_secs(60), + last_tick: None, + filter_ctx: FsFilterCtrlCtx, + }, + lister, + )) + } + pub async fn stream_reader( &self, state: ConnectorState, @@ -147,3 +180,35 @@ impl ConnectorSource { }) } } + +#[try_stream(boxed, ok = FsPage, error = RwError)] +async fn build_fs_list_stream( + mut ctrl_ctx: FsListCtrlContext, + mut list_op: impl FsListInner + Send + 'static, +) { + let mut interval = time::interval(ctrl_ctx.interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + + // controlling whether request for next page + fn page_ctrl_logic(_ctx: &FsListCtrlContext, has_finished: bool, _page_num: usize) -> bool { + !has_finished + } + + loop { + let mut page_num = 0; + ctrl_ctx.last_tick = Some(time::Instant::now()); + 'inner: loop { + let (fs_page, has_finished) = list_op.get_next_page::().await?; + let matched_items = fs_page + .into_iter() + .filter(|item| list_op.filter_policy(&ctrl_ctx.filter_ctx, page_num, item)) + .collect_vec(); + yield matched_items; + page_num += 1; + if !page_ctrl_logic(&ctrl_ctx, has_finished, page_num) { + break 'inner; + } + } + interval.tick().await; + } +} diff --git a/src/source/src/fs_connector_source.rs b/src/source/src/fs_connector_source.rs index 974f0561e0f2d..671f5b99c5bae 100644 --- a/src/source/src/fs_connector_source.rs +++ b/src/source/src/fs_connector_source.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
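The `build_fs_list_stream` added above boils down to: list every page once per tick, filter each page, and skip ticks that were missed while a listing round was still running. A stripped-down sketch of that control flow, assuming a placeholder `PageLister` trait rather than the connector's real `FsListInner`:

```rust
use tokio::time::{interval, Duration, MissedTickBehavior};

/// Placeholder for the real lister; returns (items, has_finished) per page.
trait PageLister {
    fn next_page(&mut self) -> (Vec<String>, bool);
}

async fn run_list_loop(mut lister: impl PageLister) {
    let mut ticker = interval(Duration::from_secs(60));
    // If one listing round takes longer than the interval, skip the missed
    // ticks instead of firing them back-to-back.
    ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);

    loop {
        // Drain every page of the current listing round.
        loop {
            let (items, has_finished) = lister.next_page();
            // ... filter `items` and forward them downstream ...
            drop(items);
            if has_finished {
                break;
            }
        }
        // Wait for the next round.
        ticker.tick().await;
    }
}
```

Using `MissedTickBehavior::Skip` keeps a slow listing round from triggering a burst of immediate re-lists once it finally finishes.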
+// *** NOTICE: TO BE DEPRECATED *** // + use std::collections::HashMap; use std::sync::Arc; diff --git a/src/source/src/lib.rs b/src/source/src/lib.rs index 30c7d90cfe771..1a32888cdf651 100644 --- a/src/source/src/lib.rs +++ b/src/source/src/lib.rs @@ -14,13 +14,13 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] -#![feature(binary_heap_drain_sorted)] #![feature(lint_reasons)] #![feature(result_option_inspect)] #![feature(generators)] #![feature(hash_extract_if)] #![feature(type_alias_impl_trait)] #![feature(box_patterns)] +#![feature(stmt_expr_attributes)] pub use table::*; diff --git a/src/source/src/source_desc.rs b/src/source/src/source_desc.rs index 4d4b9f9cb5b80..161bbc41ceb63 100644 --- a/src/source/src/source_desc.rs +++ b/src/source/src/source_desc.rs @@ -18,14 +18,12 @@ use std::sync::Arc; use risingwave_common::catalog::ColumnDesc; use risingwave_common::error::ErrorCode::ProtocolError; use risingwave_common::error::{Result, RwError}; -use risingwave_connector::parser::SpecificParserConfig; +use risingwave_connector::parser::{EncodingProperties, ProtocolProperties, SpecificParserConfig}; use risingwave_connector::source::monitor::SourceMetrics; -use risingwave_connector::source::{ - SourceColumnDesc, SourceColumnType, SourceEncode, SourceFormat, SourceStruct, -}; +use risingwave_connector::source::{ConnectorProperties, SourceColumnDesc, SourceColumnType}; use risingwave_connector::ConnectorParams; use risingwave_pb::catalog::PbStreamSourceInfo; -use risingwave_pb::plan_common::{PbColumnCatalog, PbEncodeType, PbFormatType, RowFormatType}; +use risingwave_pb::plan_common::PbColumnCatalog; use crate::connector_source::ConnectorSource; use crate::fs_connector_source::FsConnectorSource; @@ -33,19 +31,19 @@ use crate::fs_connector_source::FsConnectorSource; pub const DEFAULT_CONNECTOR_MESSAGE_BUFFER_SIZE: usize = 16; /// `SourceDesc` describes a stream source. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct SourceDesc { pub source: ConnectorSource, - pub source_struct: SourceStruct, pub columns: Vec, pub metrics: Arc, + + pub is_new_fs_source: bool, } /// `FsSourceDesc` describes a stream source. 
#[derive(Debug)] pub struct FsSourceDesc { pub source: FsConnectorSource, - pub source_struct: SourceStruct, pub columns: Vec, pub metrics: Arc, } @@ -101,12 +99,16 @@ impl SourceDescBuilder { columns } - pub fn build(self) -> Result { + pub fn build(mut self) -> Result { let columns = self.column_catalogs_to_source_column_descs(); - let source_struct = extract_source_struct(&self.source_info)?; - let psrser_config = - SpecificParserConfig::new(source_struct, &self.source_info, &self.properties)?; + let psrser_config = SpecificParserConfig::new(&self.source_info, &self.properties)?; + + let is_new_fs_source = ConnectorProperties::is_new_fs_connector_hash_map(&self.properties); + if is_new_fs_source { + // new fs source requires `connector='s3_v2' but we simply reuse S3 connector` + ConnectorProperties::rewrite_upstream_source_key_hash_map(&mut self.properties); + } let source = ConnectorSource::new( self.properties, @@ -117,9 +119,9 @@ impl SourceDescBuilder { Ok(SourceDesc { source, - source_struct, columns, metrics: self.metrics, + is_new_fs_source, }) } @@ -128,9 +130,16 @@ impl SourceDescBuilder { } pub fn build_fs_source_desc(&self) -> Result { - let source_struct = extract_source_struct(&self.source_info)?; - match (source_struct.format, source_struct.encode) { - (SourceFormat::Plain, SourceEncode::Csv | SourceEncode::Json) => {} + let parser_config = SpecificParserConfig::new(&self.source_info, &self.properties)?; + + match ( + &parser_config.protocol_config, + &parser_config.encoding_config, + ) { + ( + ProtocolProperties::Plain, + EncodingProperties::Csv(_) | EncodingProperties::Json(_), + ) => {} (format, encode) => { return Err(RwError::from(ProtocolError(format!( "Unsupported combination of format {:?} and encode {:?}", @@ -141,9 +150,6 @@ impl SourceDescBuilder { let columns = self.column_catalogs_to_source_column_descs(); - let parser_config = - SpecificParserConfig::new(source_struct, &self.source_info, &self.properties)?; - let source = FsConnectorSource::new( self.properties.clone(), columns.clone(), @@ -156,71 +162,12 @@ impl SourceDescBuilder { Ok(FsSourceDesc { source, - source_struct, columns, metrics: self.metrics.clone(), }) } } -// Only return valid (format, encode) -pub fn extract_source_struct(info: &PbStreamSourceInfo) -> Result { - // old version meta. 
- if let Ok(format) = info.get_row_format() { - let (format, encode) = match format { - RowFormatType::Json => (SourceFormat::Plain, SourceEncode::Json), - RowFormatType::Protobuf => (SourceFormat::Plain, SourceEncode::Protobuf), - RowFormatType::DebeziumJson => (SourceFormat::Debezium, SourceEncode::Json), - RowFormatType::Avro => (SourceFormat::Plain, SourceEncode::Avro), - RowFormatType::Maxwell => (SourceFormat::Maxwell, SourceEncode::Json), - RowFormatType::CanalJson => (SourceFormat::Canal, SourceEncode::Json), - RowFormatType::Csv => (SourceFormat::Plain, SourceEncode::Csv), - RowFormatType::Native => (SourceFormat::Native, SourceEncode::Native), - RowFormatType::DebeziumAvro => (SourceFormat::Debezium, SourceEncode::Avro), - RowFormatType::UpsertJson => (SourceFormat::Upsert, SourceEncode::Json), - RowFormatType::UpsertAvro => (SourceFormat::Upsert, SourceEncode::Avro), - RowFormatType::DebeziumMongoJson => (SourceFormat::DebeziumMongo, SourceEncode::Json), - RowFormatType::Bytes => (SourceFormat::Plain, SourceEncode::Bytes), - RowFormatType::RowUnspecified => unreachable!(), - }; - return Ok(SourceStruct::new(format, encode)); - } - let source_format = info.get_format()?; - let source_encode = info.get_row_encode()?; - let (format, encode) = match (source_format, source_encode) { - (PbFormatType::Plain, PbEncodeType::Json) => (SourceFormat::Plain, SourceEncode::Json), - (PbFormatType::Plain, PbEncodeType::Protobuf) => { - (SourceFormat::Plain, SourceEncode::Protobuf) - } - (PbFormatType::Debezium, PbEncodeType::Json) => { - (SourceFormat::Debezium, SourceEncode::Json) - } - (PbFormatType::Plain, PbEncodeType::Avro) => (SourceFormat::Plain, SourceEncode::Avro), - (PbFormatType::Maxwell, PbEncodeType::Json) => (SourceFormat::Maxwell, SourceEncode::Json), - (PbFormatType::Canal, PbEncodeType::Json) => (SourceFormat::Canal, SourceEncode::Json), - (PbFormatType::Plain, PbEncodeType::Csv) => (SourceFormat::Plain, SourceEncode::Csv), - (PbFormatType::Native, PbEncodeType::Native) => { - (SourceFormat::Native, SourceEncode::Native) - } - (PbFormatType::Debezium, PbEncodeType::Avro) => { - (SourceFormat::Debezium, SourceEncode::Avro) - } - (PbFormatType::Upsert, PbEncodeType::Json) => (SourceFormat::Upsert, SourceEncode::Json), - (PbFormatType::Upsert, PbEncodeType::Avro) => (SourceFormat::Upsert, SourceEncode::Avro), - (PbFormatType::DebeziumMongo, PbEncodeType::Json) => { - (SourceFormat::DebeziumMongo, SourceEncode::Json) - } - (PbFormatType::Plain, PbEncodeType::Bytes) => (SourceFormat::Plain, SourceEncode::Bytes), - (format, encode) => { - return Err(RwError::from(ProtocolError(format!( - "Unsupported combination of format {:?} and encode {:?}", - format, encode - )))); - } - }; - Ok(SourceStruct::new(format, encode)) -} - pub mod test_utils { use std::collections::HashMap; diff --git a/src/sqlparser/src/ast/statement.rs b/src/sqlparser/src/ast/statement.rs index d2120c23b07fd..76de970a919a9 100644 --- a/src/sqlparser/src/ast/statement.rs +++ b/src/sqlparser/src/ast/statement.rs @@ -497,6 +497,30 @@ impl Parser { Ok(parse_source_schema(self)?) } } + + /// Parse `FORMAT ... ENCODE ... (...)` in `CREATE SINK`. + /// + /// TODO: After [`SourceSchemaV2`] and [`SinkSchema`] merge, call this in [`parse_source_schema`]. 
+ pub fn parse_schema(&mut self) -> Result, ParserError> { + if !self.parse_keyword(Keyword::FORMAT) { + return Ok(None); + } + + let id = self.parse_identifier()?; + let s = id.value.to_ascii_uppercase(); + let format = Format::from_keyword(&s)?; + self.expect_keyword(Keyword::ENCODE)?; + let id = self.parse_identifier()?; + let s = id.value.to_ascii_uppercase(); + let row_encode = Encode::from_keyword(&s)?; + let row_options = self.parse_options()?; + + Ok(Some(SinkSchema { + format, + row_encode, + row_options, + })) + } } impl SourceSchemaV2 { @@ -798,6 +822,27 @@ impl fmt::Display for CreateSink { } } +/// Same as [`SourceSchemaV2`]. Will be merged in a dedicated rename PR. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct SinkSchema { + pub format: Format, + pub row_encode: Encode, + pub row_options: Vec, +} + +impl fmt::Display for SinkSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "FORMAT {} ENCODE {}", self.format, self.row_encode)?; + + if !self.row_options.is_empty() { + write!(f, " ({})", display_comma_separated(&self.row_options)) + } else { + Ok(()) + } + } +} + // sql_grammar!(CreateSinkStatement { // if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], // sink_name: Ident, @@ -814,6 +859,7 @@ pub struct CreateSinkStatement { pub sink_from: CreateSink, pub columns: Vec, pub emit_mode: Option, + pub sink_schema: Option, } impl ParseTo for CreateSinkStatement { @@ -842,6 +888,8 @@ impl ParseTo for CreateSinkStatement { )); } + let sink_schema = p.parse_schema()?; + Ok(Self { if_not_exists, sink_name, @@ -849,6 +897,7 @@ impl ParseTo for CreateSinkStatement { sink_from, columns, emit_mode, + sink_schema, }) } } @@ -863,6 +912,9 @@ impl fmt::Display for CreateSinkStatement { v.push(format!("EMIT {}", emit_mode)); } impl_fmt_display!(with_properties, v, self); + if let Some(schema) = &self.sink_schema { + v.push(format!("{}", schema)); + } v.iter().join(" ").fmt(f) } } diff --git a/src/sqlparser/tests/testdata/create.yaml b/src/sqlparser/tests/testdata/create.yaml index 92bdabc83048c..5509ccad53a04 100644 --- a/src/sqlparser/tests/testdata/create.yaml +++ b/src/sqlparser/tests/testdata/create.yaml @@ -48,6 +48,24 @@ formatted_sql: CREATE SINK IF NOT EXISTS snk FROM mv WITH (connector = 'mysql', mysql.endpoint = '127.0.0.1:3306', mysql.table = '', mysql.database = '', mysql.user = '', mysql.password = '') - input: CREATE SINK IF NOT EXISTS snk AS SELECT count(*) AS cnt FROM mv WITH (connector = 'mysql', mysql.endpoint = '127.0.0.1:3306', mysql.table = '', mysql.database = '', mysql.user = '', mysql.password = '') formatted_sql: CREATE SINK IF NOT EXISTS snk AS SELECT count(*) AS cnt FROM mv WITH (connector = 'mysql', mysql.endpoint = '127.0.0.1:3306', mysql.table = '', mysql.database = '', mysql.user = '', mysql.password = '') +- input: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic', type = 'append-only'); + formatted_sql: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic', type = 'append-only') +- input: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') format plain encode json; + formatted_sql: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') FORMAT PLAIN ENCODE JSON +- input: CREATE 
SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') format upsert encode protobuf (schema.location = 'location', message = 'main_message'); + formatted_sql: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') FORMAT UPSERT ENCODE PROTOBUF (schema.location = 'location', message = 'main_message') +- input: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') format; + error_msg: |- + sql parser error: Expected identifier, found: ; at line:1, column:123 + Near " topic = 'test_topic') format;" +- input: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') format debezium; + error_msg: |- + sql parser error: Expected ENCODE, found: ; at line:1, column:132 + Near "topic = 'test_topic') format debezium" +- input: CREATE SINK snk FROM mv WITH (connector = 'kafka', properties.bootstrap.server = '127.0.0.1:9092', topic = 'test_topic') format debezium encode; + error_msg: |- + sql parser error: Expected identifier, found: ; at line:1, column:139 + Near " 'test_topic') format debezium encode;" - input: create user tmp createdb nocreatedb error_msg: 'sql parser error: conflicting or redundant options' - input: create user tmp createdb createdb diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 8c03464e34ae8..f1022ab2fd935 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -14,7 +14,6 @@ ignored = ["workspace-hack"] normal = ["workspace-hack"] [dependencies] -anyhow = "1" arc-swap = "1" async-trait = "0.1" auto_enums = { version = "0.8", features = ["futures03"] } @@ -26,7 +25,7 @@ dyn-clone = "1.0.14" either = "1" enum-as-inner = "0.6" fail = "0.5" -foyer = { git = "https://github.com/mrcroxx/foyer", rev = "41b1d39" } +foyer = { git = "https://github.com/mrcroxx/foyer", rev = "438eec8" } futures = { version = "0.3", default-features = false, features = ["alloc"] } futures-async-stream = { workspace = true } hex = "0.4" @@ -38,7 +37,7 @@ more-asserts = "0.3" num-integer = "0.1" parking_lot = "0.12" prometheus = { version = "0.13", features = ["process"] } -prost = "0.11" +prost = { workspace = true } rand = "0.8" risingwave_backup = { workspace = true } risingwave_common = { workspace = true } @@ -66,11 +65,12 @@ tokio = { version = "0.2", package = "madsim-tokio", features = [ "signal", ] } tokio-retry = "0.3" +tonic = { workspace = true } tracing = "0.1" tracing-futures = { version = "0.2", features = ["futures-03"] } -xorf = "0.8.1" +xorf = "0.10.2" xxhash-rust = { version = "0.8.7", features = ["xxh32", "xxh64"] } -zstd = { version = "0.12", default-features = false } +zstd = { version = "0.13", default-features = false } [target.'cfg(target_os = "linux")'.dependencies] procfs = { version = "0.15", default-features = false } diff --git a/src/storage/backup/Cargo.toml b/src/storage/backup/Cargo.toml index c36dd17f364ca..f4f66927c33d4 100644 --- a/src/storage/backup/Cargo.toml +++ b/src/storage/backup/Cargo.toml @@ -19,7 +19,7 @@ async-trait = "0.1" bytes = { version = "1", features = ["serde"] } itertools = "0.11" parking_lot = { version = "0.12", features = ["arc_lock"] } -prost = "0.11" +prost = { workspace = true } risingwave_common = { workspace = true } risingwave_hummock_sdk = { workspace = true } risingwave_object_store = { workspace = true } diff --git 
a/src/storage/backup/integration_tests/common.sh b/src/storage/backup/integration_tests/common.sh index 5ee314ba6482c..638c2b923776e 100644 --- a/src/storage/backup/integration_tests/common.sh +++ b/src/storage/backup/integration_tests/common.sh @@ -51,7 +51,7 @@ function drop_mvs() { function backup() { local job_id - job_id=$(${BACKUP_TEST_RW_ALL_IN_ONE} risectl meta backup-meta 2>&1 | grep "backup job succeeded" | awk '{print $(NF)}') + job_id=$(${BACKUP_TEST_RW_ALL_IN_ONE} risectl meta backup-meta 2>&1 | grep "backup job succeeded" | awk -F ',' '{print $(NF-1)}'| awk '{print $(NF)}') [ -n "${job_id}" ] echo "${job_id}" } diff --git a/src/storage/backup/src/lib.rs b/src/storage/backup/src/lib.rs index 330dfbc4de44c..3e0549db188a2 100644 --- a/src/storage/backup/src/lib.rs +++ b/src/storage/backup/src/lib.rs @@ -14,7 +14,6 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![feature(trait_alias)] -#![feature(binary_heap_drain_sorted)] #![feature(type_alias_impl_trait)] #![feature(extract_if)] #![feature(custom_test_frameworks)] diff --git a/src/storage/benches/bench_compactor.rs b/src/storage/benches/bench_compactor.rs index df455cf1000ee..41a3649adc5cf 100644 --- a/src/storage/benches/bench_compactor.rs +++ b/src/storage/benches/bench_compactor.rs @@ -56,6 +56,7 @@ pub fn mock_sstable_store() -> SstableStoreRef { 0, FileCache::none(), FileCache::none(), + None, )) } diff --git a/src/storage/benches/bench_multi_builder.rs b/src/storage/benches/bench_multi_builder.rs index a295864060866..9bf0e0a9546ec 100644 --- a/src/storage/benches/bench_multi_builder.rs +++ b/src/storage/benches/bench_multi_builder.rs @@ -144,6 +144,7 @@ fn bench_builder( 0, FileCache::none(), FileCache::none(), + None, )); let mut group = c.benchmark_group("bench_multi_builder"); diff --git a/src/storage/compactor/Cargo.toml b/src/storage/compactor/Cargo.toml index f4118ff639b5d..e6e985b2ba424 100644 --- a/src/storage/compactor/Cargo.toml +++ b/src/storage/compactor/Cargo.toml @@ -15,20 +15,18 @@ ignored = ["workspace-hack"] normal = ["workspace-hack"] [dependencies] -anyhow = "1" async-trait = "0.1" await-tree = { workspace = true } clap = { version = "4", features = ["derive"] } parking_lot = "0.12" -prometheus = { version = "0.13" } risingwave_common = { workspace = true } +risingwave_common_heap_profiling = { workspace = true } risingwave_common_service = { workspace = true } risingwave_object_store = { workspace = true } risingwave_pb = { workspace = true } risingwave_rpc_client = { workspace = true } risingwave_storage = { workspace = true } serde = { version = "1", features = ["derive"] } -serde_json = "1" tokio = { version = "0.2", package = "madsim-tokio", features = [ "fs", "rt", diff --git a/src/storage/compactor/src/lib.rs b/src/storage/compactor/src/lib.rs index 8043cb5d2214d..b269b2aec73d8 100644 --- a/src/storage/compactor/src/lib.rs +++ b/src/storage/compactor/src/lib.rs @@ -14,13 +14,15 @@ mod compactor_observer; mod rpc; -mod server; +pub mod server; mod telemetry; use clap::Parser; -use risingwave_common::config::{AsyncStackTraceOption, MetricLevel, OverrideConfig}; +use risingwave_common::config::{ + AsyncStackTraceOption, CompactorMode, MetricLevel, OverrideConfig, +}; -use crate::server::compactor_serve; +use crate::server::{compactor_serve, shared_compactor_serve}; /// Command-line arguments for compactor-node. 
#[derive(Parser, Clone, Debug, OverrideConfig)] @@ -75,6 +77,11 @@ pub struct CompactorOpts { #[override_opts(path = streaming.async_stack_trace)] pub async_stack_trace: Option, + /// Enable heap profile dump when memory usage is high. + #[clap(long, env = "RW_HEAP_PROFILING_DIR")] + #[override_opts(path = server.heap_profiling.dir)] + pub heap_profiling_dir: Option, + #[clap(long, env = "RW_OBJECT_STORE_STREAMING_READ_TIMEOUT_MS", value_enum)] #[override_opts(path = storage.object_store_streaming_read_timeout_ms)] pub object_store_streaming_read_timeout_ms: Option, @@ -87,6 +94,12 @@ pub struct CompactorOpts { #[clap(long, env = "RW_OBJECT_STORE_READ_TIMEOUT_MS", value_enum)] #[override_opts(path = storage.object_store_read_timeout_ms)] pub object_store_read_timeout_ms: Option, + + #[clap(long, env = "RW_COMPACTOR_MODE", value_enum)] + pub compactor_mode: Option, + + #[clap(long, env = "RW_PROXY_RPC_ENDPOINT", default_value = "")] + pub proxy_rpc_endpoint: String, } use std::future::Future; @@ -95,28 +108,42 @@ use std::pin::Pin; pub fn start(opts: CompactorOpts) -> Pin + Send>> { // WARNING: don't change the function signature. Making it `async fn` will cause // slow compile in release mode. - Box::pin(async move { - tracing::info!("Compactor node options: {:?}", opts); - tracing::info!("meta address: {}", opts.meta_address.clone()); - - let listen_addr = opts.listen_addr.parse().unwrap(); - tracing::info!("Server Listening at {}", listen_addr); - - let advertise_addr = opts - .advertise_addr - .as_ref() - .unwrap_or_else(|| { - tracing::warn!("advertise addr is not specified, defaulting to listen address"); - &opts.listen_addr - }) - .parse() - .unwrap(); - tracing::info!(" address is {}", advertise_addr); - - let (join_handle, observer_join_handle, _shutdown_sender) = - compactor_serve(listen_addr, advertise_addr, opts).await; - - join_handle.await.unwrap(); - observer_join_handle.abort(); - }) + match opts.compactor_mode { + Some(CompactorMode::Shared) => Box::pin(async move { + tracing::info!("Shared compactor pod options: {:?}", opts); + tracing::info!("Proxy rpc endpoint: {}", opts.proxy_rpc_endpoint.clone()); + + let listen_addr = opts.listen_addr.parse().unwrap(); + + let (join_handle, _shutdown_sender) = shared_compactor_serve(listen_addr, opts).await; + + tracing::info!("Server listening at {}", listen_addr); + + join_handle.await.unwrap(); + }), + None | Some(CompactorMode::Dedicated) => Box::pin(async move { + tracing::info!("Compactor node options: {:?}", opts); + tracing::info!("meta address: {}", opts.meta_address.clone()); + + let listen_addr = opts.listen_addr.parse().unwrap(); + + let advertise_addr = opts + .advertise_addr + .as_ref() + .unwrap_or_else(|| { + tracing::warn!("advertise addr is not specified, defaulting to listen address"); + &opts.listen_addr + }) + .parse() + .unwrap(); + tracing::info!(" address is {}", advertise_addr); + let (join_handle, observer_join_handle, _shutdown_sender) = + compactor_serve(listen_addr, advertise_addr, opts).await; + + tracing::info!("Server listening at {}", listen_addr); + + join_handle.await.unwrap(); + observer_join_handle.abort(); + }), + } } diff --git a/src/storage/compactor/src/rpc.rs b/src/storage/compactor/src/rpc.rs index d7f01115610f1..2182d47af8642 100644 --- a/src/storage/compactor/src/rpc.rs +++ b/src/storage/compactor/src/rpc.rs @@ -12,28 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. 
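The new `compactor_mode` and `proxy_rpc_endpoint` options above drive the shared-vs-dedicated dispatch in `start`. A hedged sketch of that flag handling with clap's derive API (assumes clap with the `derive` and `env` features enabled; the `Mode` enum and printed messages stand in for `CompactorMode` and the real serve functions):

```rust
use clap::{Parser, ValueEnum};

#[derive(Clone, Copy, Debug, ValueEnum)]
enum Mode {
    Dedicated,
    Shared,
}

#[derive(Parser, Debug)]
struct Opts {
    /// Selects which serve loop to run; defaults to the dedicated compactor.
    #[clap(long, env = "RW_COMPACTOR_MODE", value_enum)]
    compactor_mode: Option<Mode>,

    /// Only meaningful in shared mode.
    #[clap(long, env = "RW_PROXY_RPC_ENDPOINT", default_value = "")]
    proxy_rpc_endpoint: String,
}

fn main() {
    let opts = Opts::parse();
    match opts.compactor_mode {
        Some(Mode::Shared) => {
            println!("would start the shared compactor, proxy = {}", opts.proxy_rpc_endpoint)
        }
        None | Some(Mode::Dedicated) => println!("would start the dedicated compactor"),
    }
}
```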
-use std::collections::HashMap; use std::sync::Arc; use parking_lot::RwLock; use risingwave_pb::compactor::compactor_service_server::CompactorService; -use risingwave_pb::compactor::{EchoRequest, EchoResponse}; +use risingwave_pb::compactor::{ + DispatchCompactionTaskRequest, DispatchCompactionTaskResponse, EchoRequest, EchoResponse, +}; use risingwave_pb::monitor_service::monitor_service_server::MonitorService; use risingwave_pb::monitor_service::{ AnalyzeHeapRequest, AnalyzeHeapResponse, HeapProfilingRequest, HeapProfilingResponse, ListHeapProfilingRequest, ListHeapProfilingResponse, ProfilingRequest, ProfilingResponse, StackTraceRequest, StackTraceResponse, }; +use tokio::sync::mpsc; use tonic::{Request, Response, Status}; #[derive(Default)] -pub struct CompactorServiceImpl {} - +pub struct CompactorServiceImpl { + sender: Option>>, +} +impl CompactorServiceImpl { + pub fn new(sender: mpsc::UnboundedSender>) -> Self { + Self { + sender: Some(sender), + } + } +} #[async_trait::async_trait] impl CompactorService for CompactorServiceImpl { async fn echo(&self, _request: Request) -> Result, Status> { Ok(Response::new(EchoResponse {})) } + + async fn dispatch_compaction_task( + &self, + request: Request, + ) -> Result, Status> { + match &self.sender.as_ref() { + Some(sender) => { + sender + .send(request) + .expect("DispatchCompactionTaskRequest should be able to send"); + } + None => { + tracing::error!( + "fail to send DispatchCompactionTaskRequest, sender has not been initialized." + ); + } + } + Ok(Response::new(DispatchCompactionTaskResponse { + status: None, + })) + } } pub struct MonitorServiceImpl { @@ -53,7 +84,7 @@ impl MonitorService for MonitorServiceImpl { _request: Request, ) -> Result, Status> { let compaction_task_traces = match &self.await_tree_reg { - None => HashMap::default(), + None => Default::default(), Some(await_tree_reg) => await_tree_reg .read() .iter() diff --git a/src/storage/compactor/src/server.rs b/src/storage/compactor/src/server.rs index dd953b87c7af9..3ad23bf68cc3b 100644 --- a/src/storage/compactor/src/server.rs +++ b/src/storage/compactor/src/server.rs @@ -19,15 +19,17 @@ use std::time::Duration; use parking_lot::RwLock; use risingwave_common::config::{ - extract_storage_memory_config, load_config, AsyncStackTraceOption, MetricLevel, + extract_storage_memory_config, load_config, AsyncStackTraceOption, MetricLevel, RwConfig, }; use risingwave_common::monitor::connection::{RouterExt, TcpConfig}; use risingwave_common::system_param::local_manager::LocalSystemParamsManager; +use risingwave_common::system_param::reader::SystemParamsReader; use risingwave_common::telemetry::manager::TelemetryManager; use risingwave_common::telemetry::telemetry_env_enabled; use risingwave_common::util::addr::HostAddr; use risingwave_common::util::resource_util; use risingwave_common::{GIT_SHA, RW_VERSION}; +use risingwave_common_heap_profiling::HeapProfiler; use risingwave_common_service::metrics_manager::MetricsManager; use risingwave_common_service::observer_manager::ObserverManager; use risingwave_object_store::object::object_metrics::GLOBAL_OBJECT_STORE_METRICS; @@ -35,7 +37,7 @@ use risingwave_object_store::object::parse_remote_object_store; use risingwave_pb::common::WorkerType; use risingwave_pb::compactor::compactor_service_server::CompactorServiceServer; use risingwave_pb::monitor_service::monitor_service_server::MonitorServiceServer; -use risingwave_rpc_client::MetaClient; +use risingwave_rpc_client::{GrpcCompactorProxyClient, MetaClient}; use 
risingwave_storage::filter_key_extractor::{ FilterKeyExtractorManager, RemoteTableAccessor, RpcFilterKeyExtractorManager, }; @@ -45,11 +47,13 @@ use risingwave_storage::hummock::{ HummockMemoryCollector, MemoryLimiter, SstableObjectIdManager, SstableStore, }; use risingwave_storage::monitor::{ - monitor_cache, GLOBAL_COMPACTOR_METRICS, GLOBAL_HUMMOCK_METRICS, + monitor_cache, CompactorMetrics, GLOBAL_COMPACTOR_METRICS, GLOBAL_HUMMOCK_METRICS, }; use risingwave_storage::opts::StorageOpts; +use tokio::sync::mpsc; use tokio::sync::oneshot::Sender; use tokio::task::JoinHandle; +use tonic::transport::Endpoint; use tracing::info; use super::compactor_observer::observer_manager::CompactorObserverNode; @@ -57,47 +61,24 @@ use crate::rpc::{CompactorServiceImpl, MonitorServiceImpl}; use crate::telemetry::CompactorTelemetryCreator; use crate::CompactorOpts; -/// Fetches and runs compaction tasks. -pub async fn compactor_serve( - listen_addr: SocketAddr, - advertise_addr: HostAddr, - opts: CompactorOpts, -) -> (JoinHandle<()>, JoinHandle<()>, Sender<()>) { - type CompactorMemoryCollector = HummockMemoryCollector; - - let config = load_config(&opts.config_path, &opts); - info!("Starting compactor node",); - info!("> config: {:?}", config); - info!( - "> debug assertions: {}", - if cfg!(debug_assertions) { "on" } else { "off" } - ); - info!("> version: {} ({})", RW_VERSION, GIT_SHA); - - // Register to the cluster. - let (meta_client, system_params_reader) = MetaClient::register_new( - &opts.meta_address, - WorkerType::Compactor, - &advertise_addr, - Default::default(), - &config.meta, - ) - .await - .unwrap(); - - info!("Assigned compactor id {}", meta_client.worker_id()); - meta_client.activate(&advertise_addr).await.unwrap(); - +const ENDPOINT_KEEP_ALIVE_INTERVAL_SEC: u64 = 60; +// See `Endpoint::keep_alive_timeout` +const ENDPOINT_KEEP_ALIVE_TIMEOUT_SEC: u64 = 60; +pub async fn prepare_start_parameters( + config: RwConfig, + system_params_reader: SystemParamsReader, +) -> ( + Arc, + Arc, + HeapProfiler, + Option>>>, + Arc, + Arc, +) { // Boot compactor let object_metrics = Arc::new(GLOBAL_OBJECT_STORE_METRICS.clone()); - let hummock_metrics = Arc::new(GLOBAL_HUMMOCK_METRICS.clone()); let compactor_metrics = Arc::new(GLOBAL_COMPACTOR_METRICS.clone()); - let hummock_meta_client = Arc::new(MonitoredHummockMetaClient::new( - meta_client.clone(), - hummock_metrics.clone(), - )); - let state_store_url = system_params_reader.state_store(); let storage_memory_config = extract_storage_memory_config(&config); @@ -106,9 +87,8 @@ pub async fn compactor_serve( &system_params_reader, &storage_memory_config, ))); - let total_memory_available_bytes = - (resource_util::memory::total_memory_available_bytes() as f64 + (resource_util::memory::system_memory_available_bytes() as f64 * config.storage.compactor_memory_available_proportion) as usize; let meta_cache_capacity_bytes = storage_opts.meta_cache_capacity_mb * (1 << 20); let compactor_memory_limit_bytes = match config.storage.compactor_memory_limit_mb { @@ -156,6 +136,86 @@ pub async fn compactor_serve( meta_cache_capacity_bytes, )); + let memory_limiter = Arc::new(MemoryLimiter::new(compactor_memory_limit_bytes)); + let storage_memory_config = extract_storage_memory_config(&config); + let memory_collector: Arc = Arc::new(HummockMemoryCollector::new( + sstable_store.clone(), + memory_limiter.clone(), + storage_memory_config, + )); + + let heap_profiler = HeapProfiler::new( + total_memory_available_bytes, + config.server.heap_profiling.clone(), + ); + + 
monitor_cache(memory_collector); + + let await_tree_config = match &config.streaming.async_stack_trace { + AsyncStackTraceOption::Off => None, + c => await_tree::ConfigBuilder::default() + .verbose(c.is_verbose().unwrap()) + .build() + .ok(), + }; + let await_tree_reg = + await_tree_config.map(|c| Arc::new(RwLock::new(await_tree::Registry::new(c)))); + + ( + sstable_store, + memory_limiter, + heap_profiler, + await_tree_reg, + storage_opts, + compactor_metrics, + ) +} + +/// Fetches and runs compaction tasks. +pub async fn compactor_serve( + listen_addr: SocketAddr, + advertise_addr: HostAddr, + opts: CompactorOpts, +) -> (JoinHandle<()>, JoinHandle<()>, Sender<()>) { + let config = load_config(&opts.config_path, &opts); + info!("Starting compactor node",); + info!("> config: {:?}", config); + info!( + "> debug assertions: {}", + if cfg!(debug_assertions) { "on" } else { "off" } + ); + info!("> version: {} ({})", RW_VERSION, GIT_SHA); + + // Register to the cluster. + let (meta_client, system_params_reader) = MetaClient::register_new( + &opts.meta_address, + WorkerType::Compactor, + &advertise_addr, + Default::default(), + &config.meta, + ) + .await + .unwrap(); + + info!("Assigned compactor id {}", meta_client.worker_id()); + meta_client.activate(&advertise_addr).await.unwrap(); + + let hummock_metrics = Arc::new(GLOBAL_HUMMOCK_METRICS.clone()); + + let hummock_meta_client = Arc::new(MonitoredHummockMetaClient::new( + meta_client.clone(), + hummock_metrics.clone(), + )); + + let ( + sstable_store, + memory_limiter, + heap_profiler, + await_tree_reg, + storage_opts, + compactor_metrics, + ) = prepare_start_parameters(config.clone(), system_params_reader.clone()).await; + let filter_key_extractor_manager = Arc::new(RpcFilterKeyExtractorManager::new(Box::new( RemoteTableAccessor::new(meta_client.clone()), ))); @@ -167,31 +227,20 @@ pub async fn compactor_serve( let observer_manager = ObserverManager::new_with_meta_client(meta_client.clone(), compactor_observer_node).await; + // Run a background heap profiler + heap_profiler.start(); + // use half of limit because any memory which would hold in meta-cache will be allocate by // limited at first. 
let observer_join_handle = observer_manager.start().await; - let memory_limiter = Arc::new(MemoryLimiter::new(compactor_memory_limit_bytes)); - let memory_collector = Arc::new(CompactorMemoryCollector::new( - sstable_store.clone(), - memory_limiter.clone(), - storage_memory_config, - )); - - monitor_cache(memory_collector); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), storage_opts.sstable_id_remote_fetch_number, )); - let await_tree_config = match &config.streaming.async_stack_trace { - AsyncStackTraceOption::Off => None, - c => await_tree::ConfigBuilder::default() - .verbose(c.is_verbose().unwrap()) - .build() - .ok(), - }; - let await_tree_reg = - await_tree_config.map(|c| Arc::new(RwLock::new(await_tree::Registry::new(c)))); + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + filter_key_extractor_manager.clone(), + ); let compactor_context = CompactorContext { storage_opts, sstable_store: sstable_store.clone(), @@ -200,9 +249,6 @@ pub async fn compactor_serve( compaction_executor: Arc::new(CompactionExecutor::new( opts.compaction_worker_threads_number, )), - filter_key_extractor_manager: FilterKeyExtractorManager::RpcFilterKeyExtractorManager( - filter_key_extractor_manager.clone(), - ), memory_limiter, task_progress_manager: Default::default(), @@ -219,6 +265,7 @@ pub async fn compactor_serve( compactor_context.clone(), hummock_meta_client.clone(), sstable_object_id_manager.clone(), + filter_key_extractor_manager.clone(), ), ]; @@ -275,3 +322,105 @@ pub async fn compactor_serve( (join_handle, observer_join_handle, shutdown_send) } + +pub async fn shared_compactor_serve( + listen_addr: SocketAddr, + opts: CompactorOpts, +) -> (JoinHandle<()>, Sender<()>) { + let config = load_config(&opts.config_path, &opts); + info!("Starting shared compactor node",); + info!("> config: {:?}", config); + info!( + "> debug assertions: {}", + if cfg!(debug_assertions) { "on" } else { "off" } + ); + info!("> version: {} ({})", RW_VERSION, GIT_SHA); + + let endpoint_str = opts.proxy_rpc_endpoint.clone().to_string(); + let endpoint = + Endpoint::from_shared(opts.proxy_rpc_endpoint).expect("Fail to construct tonic Endpoint"); + let channel = endpoint + .http2_keep_alive_interval(Duration::from_secs(ENDPOINT_KEEP_ALIVE_INTERVAL_SEC)) + .keep_alive_timeout(Duration::from_secs(ENDPOINT_KEEP_ALIVE_TIMEOUT_SEC)) + .connect_timeout(Duration::from_secs(5)) + .connect() + .await + .expect("Failed to create channel via proxy rpc endpoint."); + let grpc_proxy_client = GrpcCompactorProxyClient::new(channel, endpoint_str); + let system_params_response = grpc_proxy_client + .get_system_params() + .await + .expect("Fail to get system params, the compactor pod cannot be started."); + let system_params = system_params_response.into_inner().params.unwrap(); + + let ( + sstable_store, + memory_limiter, + heap_profiler, + await_tree_reg, + storage_opts, + compactor_metrics, + ) = prepare_start_parameters(config.clone(), system_params.into()).await; + let (sender, receiver) = mpsc::unbounded_channel(); + let compactor_srv: CompactorServiceImpl = CompactorServiceImpl::new(sender); + + let monitor_srv = MonitorServiceImpl::new(await_tree_reg.clone()); + + // Run a background heap profiler + heap_profiler.start(); + + let (shutdown_send, mut shutdown_recv) = tokio::sync::oneshot::channel(); + let compactor_context = CompactorContext { + storage_opts, + sstable_store, + compactor_metrics, + is_share_buffer_compact: false, + 
compaction_executor: Arc::new(CompactionExecutor::new( + opts.compaction_worker_threads_number, + )), + memory_limiter, + task_progress_manager: Default::default(), + await_tree_reg, + running_task_count: Arc::new(AtomicU32::new(0)), + }; + let join_handle = tokio::spawn(async move { + tonic::transport::Server::builder() + .add_service(CompactorServiceServer::new(compactor_srv)) + .add_service(MonitorServiceServer::new(monitor_srv)) + .monitored_serve_with_shutdown( + listen_addr, + "grpc-compactor-node-service", + TcpConfig { + tcp_nodelay: true, + keepalive_duration: None, + }, + async move { + let (join_handle, shutdown_sender) = + risingwave_storage::hummock::compactor::start_shared_compactor( + grpc_proxy_client, + receiver, + compactor_context, + ); + tokio::select! { + _ = tokio::signal::ctrl_c() => {}, + _ = &mut shutdown_recv => { + if let Err(err) = shutdown_sender.send(()) { + tracing::warn!("Failed to send shutdown: {:?}", err); + } + if let Err(err) = join_handle.await { + tracing::warn!("Failed to join shutdown: {:?}", err); + } + }, + } + }, + ) + .await + }); + + // Boot metrics service. + if config.server.metrics_level > MetricLevel::Disabled { + MetricsManager::boot_metrics_service(opts.prometheus_listener_addr.clone()); + } + + (join_handle, shutdown_send) +} diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index 1193877a14c9b..3e4286eb856bc 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -937,7 +937,15 @@ pub fn add_ssts_to_sub_level( let b = sst2.key_range.as_ref().unwrap(); a.compare(b) }); - assert!(can_concat(&l0.sub_levels[sub_level_idx].table_infos)); + assert!( + can_concat(&l0.sub_levels[sub_level_idx].table_infos), + "sstable ids: {:?}", + l0.sub_levels[sub_level_idx] + .table_infos + .iter() + .map(|sst| sst.sst_id) + .collect_vec() + ); } } @@ -1039,7 +1047,15 @@ fn level_insert_ssts(operand: &mut Level, insert_table_infos: Vec) if operand.level_type == LevelType::Overlapping as i32 { operand.level_type = LevelType::Nonoverlapping as i32; } - assert!(can_concat(&operand.table_infos)); + assert!( + can_concat(&operand.table_infos), + "sstable ids: {:?}", + operand + .table_infos + .iter() + .map(|sst| sst.sst_id) + .collect_vec() + ); } pub fn object_size_map(version: &HummockVersion) -> HashMap { diff --git a/src/storage/hummock_test/src/bin/replay/main.rs b/src/storage/hummock_test/src/bin/replay/main.rs index 7a000c914e3a9..1e9c9591bc864 100644 --- a/src/storage/hummock_test/src/bin/replay/main.rs +++ b/src/storage/hummock_test/src/bin/replay/main.rs @@ -111,6 +111,7 @@ async fn create_replay_hummock(r: Record, args: &Args) -> Result CompactorContext { - get_compactor_context_with_filter_key_extractor_manager_impl( - storage.storage_opts().clone(), - storage.sstable_store(), - filter_key_extractor_manager, - ) + fn get_compactor_context(storage: &HummockStorage) -> CompactorContext { + get_compactor_context_impl(storage.storage_opts().clone(), storage.sstable_store()) } - fn get_compactor_context_with_filter_key_extractor_manager_impl( + fn get_compactor_context_impl( options: Arc, sstable_store: SstableStoreRef, - filter_key_extractor_manager: FilterKeyExtractorManagerRef, ) -> CompactorContext { CompactorContext { storage_opts: options, @@ -200,9 +195,6 @@ pub(crate) mod tests { is_share_buffer_compact: false, compaction_executor: 
Arc::new(CompactionExecutor::new(Some(1))), memory_limiter: MemoryLimiter::unlimit(), - filter_key_extractor_manager: FilterKeyExtractorManager::RpcFilterKeyExtractorManager( - filter_key_extractor_manager, - ), task_progress_manager: Default::default(), await_tree_reg: None, running_task_count: Arc::new(AtomicU32::new(0)), @@ -234,7 +226,6 @@ pub(crate) mod tests { Default::default(), ) .await; - let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() { FilterKeyExtractorManager::RpcFilterKeyExtractorManager( @@ -242,10 +233,10 @@ pub(crate) mod tests { ) => rpc_filter_key_extractor_manager, FilterKeyExtractorManager::StaticFilterKeyExtractorManager(_) => unreachable!(), }; - let compact_ctx = get_compactor_context_with_filter_key_extractor_manager( - &storage, + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( rpc_filter_key_extractor_manager, ); + let compact_ctx = get_compactor_context(&storage); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), storage @@ -266,7 +257,7 @@ pub(crate) mod tests { .pin_snapshot(worker_node2.id) .await .unwrap(); - let key = TableKey(key.freeze()); + let key = key.freeze(); const SST_COUNT: u64 = 32; const TEST_WATERMARK: u64 = 8; prepare_test_put_data( @@ -281,7 +272,7 @@ pub(crate) mod tests { while let Some(mut compact_task) = hummock_manager_ref .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() @@ -298,16 +289,23 @@ pub(crate) mod tests { compact_task.current_epoch_time = 0; let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx.clone(), compact_task.clone(), rx, Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager.clone(), ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task_for_test( + result_task.task_id, + Some(compact_task), + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); } @@ -356,7 +354,7 @@ pub(crate) mod tests { let get_ret = storage .get( - key.clone(), + TableKey(key.clone()), read_epoch, ReadOptions { cache_policy: CachePolicy::Fill(CachePriority::High), @@ -369,10 +367,10 @@ pub(crate) mod tests { assert_eq!(get_val, val); let ret = storage .get( - key.clone(), + TableKey(key.clone()), ((TEST_WATERMARK - 1) * 1000) << 16, ReadOptions { - prefix_hint: Some(key.clone().0), + prefix_hint: Some(key.clone()), cache_policy: CachePolicy::Fill(CachePriority::High), ..Default::default() }, @@ -405,10 +403,10 @@ pub(crate) mod tests { ) => rpc_filter_key_extractor_manager, FilterKeyExtractorManager::StaticFilterKeyExtractorManager(_) => unreachable!(), }; - let compact_ctx = get_compactor_context_with_filter_key_extractor_manager( - &storage, + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( rpc_filter_key_extractor_manager, ); + let compact_ctx = get_compactor_context(&storage); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), storage @@ -420,7 +418,7 @@ pub(crate) mod tests { let mut key = BytesMut::default(); key.put_u16(0); key.put_slice(b"same_key"); - let key = TableKey(key.freeze()); + let key = key.freeze(); const SST_COUNT: u64 = 16; let 
mut val = b"0"[..].repeat(1 << 20); @@ -440,23 +438,29 @@ pub(crate) mod tests { while let Some(compact_task) = hummock_manager_ref .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap() { // 3. compact let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx.clone(), compact_task.clone(), rx, Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager.clone(), ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task( + result_task.task_id, + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); } @@ -483,12 +487,11 @@ pub(crate) mod tests { target_table_size ); } - // 5. storage get back the correct kv after compaction storage.wait_version(version).await; let get_val = storage .get( - key.clone(), + TableKey(key.clone()), SST_COUNT + 1, ReadOptions { cache_policy: CachePolicy::Fill(CachePriority::High), @@ -552,10 +555,10 @@ pub(crate) mod tests { } } - pub(crate) fn prepare_compactor_and_filter( + pub fn prepare_compactor_and_filter( storage: &HummockStorage, existing_table_id: u32, - ) -> CompactorContext { + ) -> (CompactorContext, FilterKeyExtractorManager) { let rpc_filter_key_extractor_manager = match storage.filter_key_extractor_manager().clone() { FilterKeyExtractorManager::RpcFilterKeyExtractorManager( @@ -568,10 +571,11 @@ pub(crate) mod tests { Arc::new(FilterKeyExtractorImpl::FullKey(FullKeyFilterKeyExtractor)), ); - get_compactor_context_with_filter_key_extractor_manager( - storage, + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( rpc_filter_key_extractor_manager, - ) + ); + + (get_compactor_context(storage), filter_key_extractor_manager) } #[tokio::test] @@ -616,6 +620,7 @@ pub(crate) mod tests { ) .await .unwrap(); + assert!(compact_task.is_none()); // 3. get the latest version and check @@ -631,7 +636,7 @@ pub(crate) mod tests { let compact_task = hummock_manager_ref .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap(); @@ -679,11 +684,12 @@ pub(crate) mod tests { 2, Arc::new(FilterKeyExtractorImpl::FullKey(FullKeyFilterKeyExtractor)), ); - - let compact_ctx = get_compactor_context_with_filter_key_extractor_manager_impl( + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + rpc_filter_key_extractor_manager, + ); + let compact_ctx = get_compactor_context_impl( global_storage.storage_opts().clone(), global_storage.sstable_store(), - rpc_filter_key_extractor_manager, ); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), @@ -771,16 +777,22 @@ pub(crate) mod tests { // 4. 
compact let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx, compact_task.clone(), rx, Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager, ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task( + result_task.task_id, + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); @@ -809,7 +821,7 @@ pub(crate) mod tests { let compact_task = hummock_manager_ref .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap(); @@ -869,10 +881,7 @@ pub(crate) mod tests { FilterKeyExtractorManager::StaticFilterKeyExtractorManager(_) => unreachable!(), }; - let compact_ctx = get_compactor_context_with_filter_key_extractor_manager( - &storage, - rpc_filter_key_extractor_manager.clone(), - ); + let compact_ctx = get_compactor_context(&storage); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), storage @@ -884,7 +893,9 @@ pub(crate) mod tests { 2, Arc::new(FilterKeyExtractorImpl::FullKey(FullKeyFilterKeyExtractor)), ); - + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + rpc_filter_key_extractor_manager, + ); // 1. add sstables let val = Bytes::from(b"0"[..].to_vec()); // 1 Byte value @@ -961,16 +972,22 @@ pub(crate) mod tests { // 3. compact let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx, compact_task.clone(), rx, Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager, ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task( + result_task.task_id, + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); @@ -1000,7 +1017,7 @@ pub(crate) mod tests { let compact_task = hummock_manager_ref .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap(); @@ -1070,10 +1087,10 @@ pub(crate) mod tests { FixedLengthFilterKeyExtractor::new(TABLE_PREFIX_LEN + key_prefix.len()), )), ); - let compact_ctx = get_compactor_context_with_filter_key_extractor_manager( - &storage, + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( rpc_filter_key_extractor_manager, ); + let compact_ctx = get_compactor_context(&storage); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), storage @@ -1146,16 +1163,22 @@ pub(crate) mod tests { // 3. 
compact let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx, compact_task.clone(), rx, Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager, ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task( + result_task.task_id, + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); @@ -1186,7 +1209,7 @@ pub(crate) mod tests { let compact_task = hummock_manager_ref .get_compact_task( StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), + &mut default_compaction_selector(), ) .await .unwrap(); @@ -1248,7 +1271,8 @@ pub(crate) mod tests { TableId::from(existing_table_id), ) .await; - let compact_ctx = prepare_compactor_and_filter(&storage, existing_table_id); + let (compact_ctx, filter_key_extractor_manager) = + prepare_compactor_and_filter(&storage, existing_table_id); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), storage @@ -1304,16 +1328,22 @@ pub(crate) mod tests { // 3. compact let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx, compact_task.clone(), rx, Box::new(sstable_object_id_manager.clone()), + filter_key_extractor_manager, ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task( + result_task.task_id, + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); @@ -1351,7 +1381,7 @@ pub(crate) mod tests { ) .await; hummock_manager_ref.get_new_sst_ids(10).await.unwrap(); - let compact_ctx = prepare_compactor_and_filter(&storage, existing_table_id); + let (compact_ctx, _) = prepare_compactor_and_filter(&storage, existing_table_id); let sstable_store = compact_ctx.sstable_store.clone(); let capacity = 256 * 1024; @@ -1401,17 +1431,17 @@ pub(crate) mod tests { 0, compact_ctx.clone(), task.clone(), - Box::new(SharedComapctorObjectIdManager::new(VecDeque::from_iter([ - 5, 6, 7, 8, 9, - ]))), + Box::new(SharedComapctorObjectIdManager::for_test( + VecDeque::from_iter([5, 6, 7, 8, 9]), + )), ); let fast_compact_runner = FastCompactorRunner::new( compact_ctx.clone(), task.clone(), multi_filter_key_extractor.clone(), - Box::new(SharedComapctorObjectIdManager::new(VecDeque::from_iter([ - 10, 11, 12, 13, 14, - ]))), + Box::new(SharedComapctorObjectIdManager::for_test( + VecDeque::from_iter([10, 11, 12, 13, 14]), + )), Arc::new(TaskProgress::default()), ); let (_, ret1, _) = slow_compact_runner diff --git a/src/storage/hummock_test/src/sync_point_tests.rs b/src/storage/hummock_test/src/sync_point_tests.rs index d28db261785c0..1b7d6d80029a5 100644 --- a/src/storage/hummock_test/src/sync_point_tests.rs +++ b/src/storage/hummock_test/src/sync_point_tests.rs @@ -28,15 +28,11 @@ use risingwave_hummock_sdk::key::{next_key, user_key}; use risingwave_hummock_sdk::table_stats::to_prost_table_stats_map; use risingwave_hummock_sdk::HummockVersionId; use risingwave_meta::hummock::compaction::compaction_config::CompactionConfigBuilder; -use risingwave_meta::hummock::compaction::{default_level_selector, ManualCompactionOption}; -use risingwave_meta::hummock::test_utils::{ - add_ssts, 
register_table_ids_to_compaction_group, setup_compute_env, - setup_compute_env_with_config, -}; +use risingwave_meta::hummock::compaction::selector::ManualCompactionOption; +use risingwave_meta::hummock::test_utils::{setup_compute_env, setup_compute_env_with_config}; use risingwave_meta::hummock::{HummockManagerRef, MockHummockMetaClient}; -use risingwave_meta::manager::LocalNotification; -use risingwave_pb::hummock::compact_task::TaskStatus; use risingwave_rpc_client::HummockMetaClient; +use risingwave_storage::filter_key_extractor::FilterKeyExtractorManager; use risingwave_storage::hummock::compactor::compactor_runner::compact; use risingwave_storage::hummock::compactor::CompactorContext; use risingwave_storage::hummock::{CachePolicy, GetObjectId, SstableObjectIdManager}; @@ -44,9 +40,8 @@ use risingwave_storage::store::{LocalStateStore, NewLocalOptions, ReadOptions, S use risingwave_storage::StateStore; use serial_test::serial; -use super::compactor_tests::tests::{ - flush_and_commit, get_hummock_storage, prepare_compactor_and_filter, -}; +use super::compactor_tests::tests::{get_hummock_storage, prepare_compactor_and_filter}; +use crate::compactor_tests::tests::flush_and_commit; use crate::get_notification_client_for_test; use crate::local_state_store_test_utils::LocalStateStoreTestExt; use crate::test_utils::gen_key_from_bytes; @@ -181,57 +176,10 @@ async fn test_syncpoints_test_failpoints_fetch_ids() { } } -#[tokio::test] -#[cfg(feature = "sync_point")] -#[serial] -async fn test_syncpoints_test_local_notification_receiver() { - let (env, hummock_manager, _cluster_manager, worker_node) = setup_compute_env(80).await; - let context_id = worker_node.id; - - register_table_ids_to_compaction_group( - hummock_manager.as_ref(), - &[1], - StaticCompactionGroupId::StateDefault.into(), - ) - .await; - // Test cancel compaction task - let _sst_infos = add_ssts(1, hummock_manager.as_ref(), context_id).await; - let mut task = hummock_manager - .get_compact_task( - StaticCompactionGroupId::StateDefault.into(), - &mut default_level_selector(), - ) - .await - .unwrap() - .unwrap(); - task.task_status = TaskStatus::ManualCanceled as i32; - assert_eq!(hummock_manager.list_all_tasks_ids().await.len(), 1); - env.notification_manager() - .notify_local_subscribers(LocalNotification::CompactionTaskNeedCancel(task)) - .await; - sync_point::wait_timeout( - "AFTER_CANCEL_COMPACTION_TASK_ASYNC", - Duration::from_secs(10), - ) - .await - .unwrap(); - assert_eq!(hummock_manager.list_all_tasks_ids().await.len(), 0); - - // Test release hummock contexts - env.notification_manager() - .notify_local_subscribers(LocalNotification::WorkerNodeDeleted(worker_node)) - .await; - sync_point::wait_timeout( - "AFTER_RELEASE_HUMMOCK_CONTEXTS_ASYNC", - Duration::from_secs(10), - ) - .await - .unwrap(); -} - pub async fn compact_once( hummock_manager_ref: HummockManagerRef, compact_ctx: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, sstable_object_id_manager: Arc, ) { // 2. get compact task @@ -254,16 +202,22 @@ pub async fn compact_once( compact_task.compaction_filter_mask = compaction_filter_flag.bits(); // 3. 
compact let (_tx, rx) = tokio::sync::oneshot::channel(); - let (mut result_task, task_stats) = compact( + let (result_task, task_stats) = compact( compact_ctx, compact_task.clone(), rx, Box::new(sstable_object_id_manager), + filter_key_extractor_manager.clone(), ) .await; hummock_manager_ref - .report_compact_task(&mut result_task, Some(to_prost_table_stats_map(task_stats))) + .report_compact_task( + result_task.task_id, + result_task.task_status(), + result_task.sorted_output_ssts, + Some(to_prost_table_stats_map(task_stats)), + ) .await .unwrap(); } @@ -291,7 +245,8 @@ async fn test_syncpoints_get_in_delete_range_boundary() { TableId::from(existing_table_id), ) .await; - let compact_ctx = prepare_compactor_and_filter(&storage, existing_table_id); + let (compact_ctx, filter_key_extractor_manager) = + prepare_compactor_and_filter(&storage, existing_table_id); let sstable_object_id_manager = Arc::new(SstableObjectIdManager::new( hummock_meta_client.clone(), @@ -348,6 +303,7 @@ async fn test_syncpoints_get_in_delete_range_boundary() { compact_once( hummock_manager_ref.clone(), compact_ctx.clone(), + filter_key_extractor_manager.clone(), sstable_object_id_manager.clone(), ) .await; @@ -378,6 +334,7 @@ async fn test_syncpoints_get_in_delete_range_boundary() { compact_once( hummock_manager_ref.clone(), compact_ctx.clone(), + filter_key_extractor_manager.clone(), sstable_object_id_manager.clone(), ) .await; @@ -409,6 +366,7 @@ async fn test_syncpoints_get_in_delete_range_boundary() { compact_once( hummock_manager_ref.clone(), compact_ctx.clone(), + filter_key_extractor_manager.clone(), sstable_object_id_manager.clone(), ) .await; @@ -434,6 +392,7 @@ async fn test_syncpoints_get_in_delete_range_boundary() { compact_once( hummock_manager_ref.clone(), compact_ctx.clone(), + filter_key_extractor_manager.clone(), sstable_object_id_manager.clone(), ) .await; diff --git a/src/storage/hummock_trace/Cargo.toml b/src/storage/hummock_trace/Cargo.toml index f9b8fa85bc101..46eabf17835e4 100644 --- a/src/storage/hummock_trace/Cargo.toml +++ b/src/storage/hummock_trace/Cargo.toml @@ -16,7 +16,7 @@ bytes = { version = "1", features = ["serde"] } futures = { version = "0.3", default-features = false, features = ["alloc"] } futures-async-stream = "0.2" parking_lot = "0.12" -prost = "0.11" +prost = { workspace = true } risingwave_common = { workspace = true } risingwave_hummock_sdk = { workspace = true } risingwave_pb = { workspace = true } diff --git a/src/storage/src/filter_key_extractor.rs b/src/storage/src/filter_key_extractor.rs index c3b65bc26992d..b5a79a6f6b42f 100644 --- a/src/storage/src/filter_key_extractor.rs +++ b/src/storage/src/filter_key_extractor.rs @@ -448,7 +448,7 @@ mod tests { use risingwave_common::util::sort_util::OrderType; use risingwave_hummock_sdk::key::TABLE_PREFIX_LEN; use risingwave_pb::catalog::table::TableType; - use risingwave_pb::catalog::{PbStreamJobStatus, PbTable}; + use risingwave_pb::catalog::{PbCreateType, PbStreamJobStatus, PbTable}; use risingwave_pb::common::{PbColumnOrder, PbDirection, PbNullsAre, PbOrderType}; use risingwave_pb::plan_common::PbColumnCatalog; @@ -550,6 +550,7 @@ mod tests { created_at_epoch: None, cleaned_by_watermark: false, stream_job_status: PbStreamJobStatus::Created.into(), + create_type: PbCreateType::Foreground.into(), } } diff --git a/src/storage/src/hummock/block_cache.rs b/src/storage/src/hummock/block_cache.rs index a9162f037f55a..1260b3c881486 100644 --- a/src/storage/src/hummock/block_cache.rs +++ b/src/storage/src/hummock/block_cache.rs @@ 
-17,6 +17,7 @@ use std::hash::{Hash, Hasher}; use std::ops::Deref; use std::sync::Arc; +use await_tree::InstrumentAwait; use futures::Future; use risingwave_common::cache::{ CachePriority, CacheableEntry, LookupResponse, LruCache, LruCacheEventListener, @@ -99,10 +100,12 @@ impl BlockResponse { match self { BlockResponse::Block(block_holder) => Ok(block_holder), BlockResponse::WaitPendingRequest(receiver) => receiver + .verbose_instrument_await("wait_pending_fetch_block") .await .map_err(|recv_error| recv_error.into()) .map(BlockHolder::from_cached_block), BlockResponse::Miss(join_handle) => join_handle + .verbose_instrument_await("fetch_block") .await .unwrap() .map(BlockHolder::from_cached_block), diff --git a/src/storage/src/hummock/compactor/compaction_utils.rs b/src/storage/src/hummock/compactor/compaction_utils.rs index 245673f7656a8..3c1332d09317c 100644 --- a/src/storage/src/hummock/compactor/compaction_utils.rs +++ b/src/storage/src/hummock/compactor/compaction_utils.rs @@ -158,6 +158,63 @@ pub fn build_multi_compaction_filter(compact_task: &CompactTask) -> MultiCompact multi_filter } +const MAX_FILE_COUNT: usize = 32; + +fn generate_splits_fast( + sstable_infos: &Vec, + compaction_size: u64, + context: CompactorContext, +) -> HummockResult> { + let worker_num = context.compaction_executor.worker_num(); + let parallel_compact_size = (context.storage_opts.parallel_compact_size_mb as u64) << 20; + + let parallelism = (compaction_size + parallel_compact_size - 1) / parallel_compact_size; + + let parallelism = std::cmp::min( + worker_num, + std::cmp::min( + parallelism as usize, + context.storage_opts.max_sub_compaction as usize, + ), + ); + let mut indexes = vec![]; + for sst in sstable_infos { + let key_range = sst.key_range.as_ref().unwrap(); + indexes.push( + FullKey { + user_key: FullKey::decode(&key_range.left).user_key, + epoch: HummockEpoch::MAX, + } + .encode(), + ); + indexes.push( + FullKey { + user_key: FullKey::decode(&key_range.right).user_key, + epoch: HummockEpoch::MAX, + } + .encode(), + ); + } + indexes.sort_by(|a, b| KeyComparator::compare_encoded_full_key(a.as_ref(), b.as_ref())); + indexes.dedup(); + if indexes.len() <= parallelism { + return Ok(vec![]); + } + let mut splits = vec![]; + splits.push(KeyRange_vec::new(vec![], vec![])); + let parallel_key_count = indexes.len() / parallelism; + let mut last_split_key_count = 0; + for key in indexes { + if last_split_key_count >= parallel_key_count { + splits.last_mut().unwrap().right = key.clone(); + splits.push(KeyRange_vec::new(key.clone(), vec![])); + last_split_key_count = 0; + } + last_split_key_count += 1; + } + Ok(splits) +} + pub async fn generate_splits( sstable_infos: &Vec, compaction_size: u64, @@ -165,6 +222,9 @@ pub async fn generate_splits( ) -> HummockResult> { let parallel_compact_size = (context.storage_opts.parallel_compact_size_mb as u64) << 20; if compaction_size > parallel_compact_size { + if sstable_infos.len() > MAX_FILE_COUNT { + return generate_splits_fast(sstable_infos, compaction_size, context); + } let mut indexes = vec![]; // preload the meta and get the smallest key to split sub_compaction for sstable_info in sstable_infos { @@ -193,6 +253,7 @@ pub async fn generate_splits( indexes.sort_by(|a, b| KeyComparator::compare_encoded_full_key(a.1.as_ref(), b.1.as_ref())); let mut splits = vec![]; splits.push(KeyRange_vec::new(vec![], vec![])); + let worker_num = context.compaction_executor.worker_num(); let parallelism = std::cmp::min( diff --git 
a/src/storage/src/hummock/compactor/compactor_runner.rs b/src/storage/src/hummock/compactor/compactor_runner.rs index 85bfb7235b1c9..583bab3d10b3c 100644 --- a/src/storage/src/hummock/compactor/compactor_runner.rs +++ b/src/storage/src/hummock/compactor/compactor_runner.rs @@ -32,7 +32,7 @@ use tokio::sync::oneshot::Receiver; use super::task_progress::TaskProgress; use super::{CompactionStatistics, TaskConfig}; -use crate::filter_key_extractor::FilterKeyExtractorImpl; +use crate::filter_key_extractor::{FilterKeyExtractorImpl, FilterKeyExtractorManager}; use crate::hummock::compactor::compaction_utils::{ build_multi_compaction_filter, estimate_task_output_capacity, generate_splits, }; @@ -152,17 +152,20 @@ impl CompactorRunner { let mut local_stats = StoreLocalStatistic::default(); for table_info in sstable_infos { - let table = sstable_store.sstable(table_info, &mut local_stats).await?; - let mut range_tombstone_list = table.value().meta.monotonic_tombstone_events.clone(); - range_tombstone_list.iter_mut().for_each(|tombstone| { - if filter.should_delete(FullKey::from_user_key( - tombstone.event_key.left_user_key.as_ref(), - tombstone.new_epoch, - )) { - tombstone.new_epoch = HummockEpoch::MAX; - } - }); - builder.add_delete_events(range_tombstone_list); + if table_info.range_tombstone_count > 0 { + let table = sstable_store.sstable(table_info, &mut local_stats).await?; + let mut range_tombstone_list = + table.value().meta.monotonic_tombstone_events.clone(); + range_tombstone_list.iter_mut().for_each(|tombstone| { + if filter.should_delete(FullKey::from_user_key( + tombstone.event_key.left_user_key.as_ref(), + tombstone.new_epoch, + )) { + tombstone.new_epoch = HummockEpoch::MAX; + } + }); + builder.add_delete_events(range_tombstone_list); + } } let aggregator = builder.build_for_compaction(); @@ -244,6 +247,7 @@ pub async fn compact( mut compact_task: CompactTask, mut shutdown_rx: Receiver<()>, object_id_getter: Box, + filter_key_extractor_manager: FilterKeyExtractorManager, ) -> (CompactTask, HashMap) { let context = compactor_context.clone(); let group_label = compact_task.compaction_group_id.to_string(); @@ -316,8 +320,7 @@ pub async fn compact( .into_iter() .filter(|table_id| existing_table_ids.contains(table_id)), ); - let multi_filter_key_extractor = match compactor_context - .filter_key_extractor_manager + let multi_filter_key_extractor = match filter_key_extractor_manager .acquire(compact_table_ids.clone()) .await { @@ -433,17 +436,17 @@ pub async fn compact( ) * compact_task.splits.len() as u64; tracing::info!( - "Ready to handle compaction group {} task: {} compact_task_statistics {:?} target_level {} compression_algorithm {:?} table_ids {:?} parallelism {} task_memory_capacity_with_parallelism {}, enable fast runner: {}", - compact_task.compaction_group_id, - compact_task.task_id, - compact_task_statistics, - compact_task.target_level, - compact_task.compression_algorithm, - compact_task.existing_table_ids, - parallelism, - task_memory_capacity_with_parallelism, - optimize_by_copy_block - ); + "Ready to handle compaction group {} task: {} compact_task_statistics {:?} target_level {} compression_algorithm {:?} table_ids {:?} parallelism {} task_memory_capacity_with_parallelism {}, enable fast runner: {}", + compact_task.compaction_group_id, + compact_task.task_id, + compact_task_statistics, + compact_task.target_level, + compact_task.compression_algorithm, + compact_task.existing_table_ids, + parallelism, + task_memory_capacity_with_parallelism, + optimize_by_copy_block + ); // 
If the task does not have enough memory, it should cancel the task and let the meta // reschedule it, so that it does not occupy the compactor's resources. @@ -702,9 +705,9 @@ where progress_key_num += 1; if let Some(task_progress) = task_progress.as_ref() && progress_key_num >= PROGRESS_KEY_INTERVAL { - task_progress.inc_progress_key(progress_key_num); - progress_key_num = 0; - } + task_progress.inc_progress_key(progress_key_num); + progress_key_num = 0; + } let mut iter_key = iter.key(); compaction_statistics.iter_total_key_counts += 1; @@ -750,7 +753,13 @@ where .await?; } del_iter.next(); + progress_key_num += 1; + if let Some(task_progress) = task_progress.as_ref() && progress_key_num >= PROGRESS_KEY_INTERVAL { + task_progress.inc_progress_key(progress_key_num); + progress_key_num = 0; + } } + let earliest_range_delete_which_can_see_iter_key = del_iter.earliest_delete_since(epoch); // Among keys with same user key, only retain keys which satisfy `epoch` >= `watermark`. @@ -851,13 +860,18 @@ where }) .await?; del_iter.next(); + progress_key_num += 1; + if let Some(task_progress) = task_progress.as_ref() && progress_key_num >= PROGRESS_KEY_INTERVAL { + task_progress.inc_progress_key(progress_key_num); + progress_key_num = 0; + } } } if let Some(task_progress) = task_progress.as_ref() && progress_key_num > 0 { - // Avoid losing the progress_key_num in the last Interval - task_progress.inc_progress_key(progress_key_num); - } + // Avoid losing the progress_key_num in the last Interval + task_progress.inc_progress_key(progress_key_num); + } if let Some(last_table_id) = last_table_id.take() { table_stats_drop.insert(last_table_id, std::mem::take(&mut last_table_stats)); @@ -868,6 +882,7 @@ where Ok(compaction_statistics) } + #[cfg(test)] mod tests { use std::collections::HashSet; @@ -879,10 +894,8 @@ mod tests { use super::*; use crate::hummock::compactor::StateCleanUpCompactionFilter; use crate::hummock::iterator::test_utils::mock_sstable_store; - use crate::hummock::test_utils::{ - default_builder_opt_for_test, gen_test_sstable_with_range_tombstone, - }; - use crate::hummock::{create_monotonic_events, DeleteRangeTombstone}; + use crate::hummock::test_utils::{default_builder_opt_for_test, gen_test_sstable_impl}; + use crate::hummock::{create_monotonic_events, DeleteRangeTombstone, Xor16FilterBuilder}; #[tokio::test] async fn test_delete_range_aggregator_with_filter() { @@ -902,26 +915,26 @@ mod tests { 1, ), ]; - let mut sstable_info_1 = gen_test_sstable_with_range_tombstone( + let mut sstable_info_1 = gen_test_sstable_impl::( default_builder_opt_for_test(), 1, kv_pairs.clone().into_iter(), range_tombstones.clone(), sstable_store.clone(), + CachePolicy::NotFill, ) - .await - .get_sstable_info(); + .await; sstable_info_1.table_ids = vec![1]; - let mut sstable_info_2 = gen_test_sstable_with_range_tombstone( + let mut sstable_info_2 = gen_test_sstable_impl::( default_builder_opt_for_test(), 2, kv_pairs.into_iter(), range_tombstones.clone(), sstable_store.clone(), + CachePolicy::NotFill, ) - .await - .get_sstable_info(); + .await; sstable_info_2.table_ids = vec![2]; let compact_task = CompactTask { diff --git a/src/storage/src/hummock/compactor/context.rs b/src/storage/src/hummock/compactor/context.rs index ad3d5ffcc2dd6..ef015f26cded7 100644 --- a/src/storage/src/hummock/compactor/context.rs +++ b/src/storage/src/hummock/compactor/context.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use parking_lot::RwLock; use super::task_progress::TaskProgressManagerRef; -use 
crate::filter_key_extractor::FilterKeyExtractorManager; use crate::hummock::compactor::CompactionExecutor; use crate::hummock::sstable_store::SstableStoreRef; use crate::hummock::MemoryLimiter; @@ -42,8 +41,6 @@ pub struct CompactorContext { pub compaction_executor: Arc, - pub filter_key_extractor_manager: FilterKeyExtractorManager, - pub memory_limiter: Arc, pub task_progress_manager: TaskProgressManagerRef, @@ -58,7 +55,6 @@ impl CompactorContext { storage_opts: Arc, sstable_store: SstableStoreRef, compactor_metrics: Arc, - filter_key_extractor_manager: FilterKeyExtractorManager, ) -> Self { let compaction_executor = if storage_opts.share_buffer_compaction_worker_threads_number == 0 { @@ -76,7 +72,6 @@ impl CompactorContext { compactor_metrics, is_share_buffer_compact: true, compaction_executor, - filter_key_extractor_manager, memory_limiter: MemoryLimiter::unlimit(), task_progress_manager: Default::default(), await_tree_reg: None, diff --git a/src/storage/src/hummock/compactor/fast_compactor_runner.rs b/src/storage/src/hummock/compactor/fast_compactor_runner.rs index 787b12bde9c32..6dcfb0e2392cf 100644 --- a/src/storage/src/hummock/compactor/fast_compactor_runner.rs +++ b/src/storage/src/hummock/compactor/fast_compactor_runner.rs @@ -482,7 +482,7 @@ impl CompactorRunner { total_read_bytes += sst.file_size; } self.metrics - .write_build_l0_bytes + .compact_fast_runner_bytes .inc_by(skip_raw_block_size); tracing::info!( "OPTIMIZATION: skip {} blocks for task-{}, optimize {}% data compression", diff --git a/src/storage/src/hummock/compactor/mod.rs b/src/storage/src/hummock/compactor/mod.rs index 8442bf39c124e..d2f36167675e7 100644 --- a/src/storage/src/hummock/compactor/mod.rs +++ b/src/storage/src/hummock/compactor/mod.rs @@ -15,6 +15,16 @@ mod compaction_executor; mod compaction_filter; pub mod compaction_utils; +use risingwave_pb::compactor::{dispatch_compaction_task_request, DispatchCompactionTaskRequest}; +use risingwave_pb::hummock::report_compaction_task_request::{ + Event as ReportCompactionTaskEvent, HeartBeat as SharedHeartBeat, + ReportTask as ReportSharedTask, +}; +use risingwave_pb::hummock::{ReportFullScanTaskRequest, ReportVacuumTaskRequest}; +use risingwave_rpc_client::GrpcCompactorProxyClient; +use tokio::sync::mpsc; +use tonic::Request; + pub mod compactor_runner; mod context; pub mod fast_compactor_runner; @@ -22,7 +32,7 @@ mod iterator; mod shared_buffer_compact; pub(super) mod task_progress; -use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::marker::PhantomData; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; @@ -46,7 +56,8 @@ use risingwave_pb::hummock::subscribe_compaction_event_request::{ }; use risingwave_pb::hummock::subscribe_compaction_event_response::Event as ResponseEvent; use risingwave_pb::hummock::{ - CompactTaskProgress, SubscribeCompactionEventRequest, SubscribeCompactionEventResponse, + CompactTaskProgress, ReportCompactionTaskRequest, SubscribeCompactionEventRequest, + SubscribeCompactionEventResponse, }; use risingwave_rpc_client::HummockMetaClient; pub use shared_buffer_compact::{compact, merge_imms_in_memory}; @@ -61,14 +72,17 @@ use super::{ CompactionDeleteRanges, GetObjectId, HummockResult, SstableBuilderOptions, SstableObjectIdManager, Xor16FilterBuilder, }; -use crate::filter_key_extractor::FilterKeyExtractorImpl; +use crate::filter_key_extractor::{ + FilterKeyExtractorImpl, FilterKeyExtractorManager, StaticFilterKeyExtractorManager, +}; use 
crate::hummock::compactor::compactor_runner::compact_and_build_sst; use crate::hummock::iterator::{Forward, HummockIterator}; use crate::hummock::multi_builder::SplitTableOutput; use crate::hummock::vacuum::Vacuum; use crate::hummock::{ validate_ssts, BatchSstableWriterFactory, BlockedXor16FilterBuilder, FilterBuilder, - HummockError, SstableWriterFactory, StreamingSstableWriterFactory, + HummockError, SharedComapctorObjectIdManager, SstableWriterFactory, + StreamingSstableWriterFactory, }; use crate::monitor::CompactorMetrics; @@ -316,6 +330,7 @@ pub fn start_compactor( compactor_context: CompactorContext, hummock_meta_client: Arc, sstable_object_id_manager: Arc, + filter_key_extractor_manager: FilterKeyExtractorManager, ) -> (JoinHandle<()>, Sender<()>) { type CompactionShutdownMap = Arc>>>; let (shutdown_tx, mut shutdown_rx) = tokio::sync::oneshot::channel(); @@ -391,17 +406,7 @@ pub fn start_compactor( let request_sender = request_sender.clone(); let event: Option> = tokio::select! { _ = periodic_event_interval.tick() => { - let mut progress_list = Vec::new(); - for (&task_id, progress) in &*task_progress.lock() { - progress_list.push(CompactTaskProgress { - task_id, - num_ssts_sealed: progress.num_ssts_sealed.load(Ordering::Relaxed), - num_ssts_uploaded: progress.num_ssts_uploaded.load(Ordering::Relaxed), - num_progress_key: progress.num_progress_key.load(Ordering::Relaxed), - num_pending_read_io: progress.num_pending_read_io.load(Ordering::Relaxed) as u64, - num_pending_write_io: progress.num_pending_write_io.load(Ordering::Relaxed) as u64, - }); - } + let progress_list = get_task_progress(task_progress.clone()); if let Err(e) = request_sender.send(SubscribeCompactionEventRequest { event: Some(RequestEvent::HeartBeat( @@ -458,7 +463,6 @@ pub fn start_compactor( continue; } - event = response_event_stream.next() => { event } @@ -489,6 +493,7 @@ pub fn start_compactor( let meta_client = hummock_meta_client.clone(); let sstable_object_id_manager = sstable_object_id_manager.clone(); + let filter_key_extractor_manager = filter_key_extractor_manager.clone(); executor.spawn(async move { let running_task_count = running_task_count.clone(); match event { @@ -507,7 +512,7 @@ pub fn start_compactor( sstable_object_id_manager.remove_watermark_object_id(tracker_id); }, ); - compactor_runner::compact(context, compact_task, rx, Box::new(sstable_object_id_manager.clone())).await + compactor_runner::compact(context, compact_task, rx, Box::new(sstable_object_id_manager.clone()), filter_key_extractor_manager.clone()).await }, Err(err) => { tracing::warn!("Failed to track pending SST object id. {:#?}", err); @@ -523,7 +528,9 @@ pub fn start_compactor( if let Err(e) = request_sender.send(SubscribeCompactionEventRequest { event: Some(RequestEvent::ReportTask( ReportTask { - compact_task: Some(compact_task), + task_id: compact_task.task_id, + task_status: compact_task.task_status, + sorted_output_ssts: compact_task.sorted_output_ssts, table_stats_change:to_prost_table_stats_map(table_stats), } )), @@ -608,3 +615,204 @@ pub fn start_compactor( (join_handle, shutdown_tx) } + +/// The background compaction thread that receives compaction tasks from hummock compaction +/// manager and runs compaction tasks. 
+#[cfg_attr(coverage, no_coverage)] +pub fn start_shared_compactor( + grpc_proxy_client: GrpcCompactorProxyClient, + mut receiver: mpsc::UnboundedReceiver>, + context: CompactorContext, +) -> (JoinHandle<()>, Sender<()>) { + type CompactionShutdownMap = Arc>>>; + let task_progress = context.task_progress_manager.clone(); + let (shutdown_tx, mut shutdown_rx) = tokio::sync::oneshot::channel(); + let periodic_event_update_interval = Duration::from_millis(1000); + + let join_handle = tokio::spawn(async move { + let shutdown_map = CompactionShutdownMap::default(); + + let mut periodic_event_interval = tokio::time::interval(periodic_event_update_interval); + let executor = context.compaction_executor.clone(); + let report_heartbeat_client = grpc_proxy_client.clone(); + 'consume_stream: loop { + let request: Option> = tokio::select! { + _ = periodic_event_interval.tick() => { + let progress_list = get_task_progress(task_progress.clone()); + let report_compaction_task_request = ReportCompactionTaskRequest{ + event: Some(ReportCompactionTaskEvent::HeartBeat( + SharedHeartBeat { + progress: progress_list + } + )), + }; + if let Err(e) = report_heartbeat_client.report_compaction_task(report_compaction_task_request).await{ + tracing::warn!("Failed to report heartbeat {:#?}", e); + } + continue + } + + + _ = &mut shutdown_rx => { + tracing::info!("Compactor is shutting down"); + return + } + + request = receiver.recv() => { + request + } + + }; + match request { + Some(request) => { + let context = context.clone(); + let shutdown = shutdown_map.clone(); + + let cloned_grpc_proxy_client = grpc_proxy_client.clone(); + executor.spawn(async move { + let DispatchCompactionTaskRequest { + tables, + output_object_ids, + task: dispatch_task, + } = request.into_inner(); + let id_to_tables = tables.into_iter().fold(HashMap::new(), |mut acc, table| { + acc.insert(table.id, table); + acc + }); + let static_filter_key_extractor_manager: Arc = + Arc::new(StaticFilterKeyExtractorManager::new(id_to_tables)); + let filter_key_extractor_manager = + FilterKeyExtractorManager::StaticFilterKeyExtractorManager( + static_filter_key_extractor_manager, + ); + + let mut output_object_ids_deque: VecDeque<_> = VecDeque::new(); + output_object_ids_deque.extend(output_object_ids); + let shared_compactor_object_id_manager = + SharedComapctorObjectIdManager::new(output_object_ids_deque, cloned_grpc_proxy_client.clone(), context.storage_opts.sstable_id_remote_fetch_number); + match dispatch_task.unwrap() { + dispatch_compaction_task_request::Task::CompactTask(compact_task) => { + context.running_task_count.fetch_add(1, Ordering::SeqCst); + let (tx, rx) = tokio::sync::oneshot::channel(); + let task_id = compact_task.task_id; + shutdown.lock().unwrap().insert(task_id, tx); + + let (compact_task, table_stats) = compactor_runner::compact( + context.clone(), + compact_task, + rx, + Box::new(shared_compactor_object_id_manager), + filter_key_extractor_manager.clone(), + ) + .await; + shutdown.lock().unwrap().remove(&task_id); + context.running_task_count.fetch_sub(1, Ordering::SeqCst); + let report_compaction_task_request = ReportCompactionTaskRequest { + event: Some(ReportCompactionTaskEvent::ReportTask(ReportSharedTask { + compact_task: Some(compact_task), + table_stats_change: to_prost_table_stats_map(table_stats), + })), + }; + + match cloned_grpc_proxy_client + .report_compaction_task(report_compaction_task_request) + .await + { + Ok(_) => {} + Err(e) => tracing::warn!("Failed to report task {task_id:?} . 
{e:?}"), + } + } + dispatch_compaction_task_request::Task::VacuumTask(vacuum_task) => { + match Vacuum::handle_vacuum_task( + context.sstable_store.clone(), + &vacuum_task.sstable_object_ids, + ) + .await + { + Ok(_) => { + let report_vacuum_task_request = ReportVacuumTaskRequest { + vacuum_task: Some(vacuum_task), + }; + match cloned_grpc_proxy_client.report_vacuum_task(report_vacuum_task_request).await { + Ok(_) => tracing::info!("Finished vacuuming SSTs"), + Err(e) => tracing::warn!("Failed to report vacuum task: {:#?}", e), + } + } + Err(e) => { + tracing::warn!("Failed to vacuum task: {:#?}", e) + } + } + } + dispatch_compaction_task_request::Task::FullScanTask(full_scan_task) => { + match Vacuum::handle_full_scan_task(full_scan_task, context.sstable_store.clone()) + .await + { + Ok((object_ids, total_object_count, total_object_size)) => { + let report_full_scan_task_request = ReportFullScanTaskRequest { + object_ids, + total_object_count, + total_object_size, + }; + match cloned_grpc_proxy_client + .report_full_scan_task(report_full_scan_task_request) + .await + { + Ok(_) => tracing::info!("Finished full scan SSTs"), + Err(e) => tracing::warn!("Failed to report full scan task: {:#?}", e), + } + } + Err(e) => { + tracing::warn!("Failed to iter object: {:#?}", e); + } + } + } + dispatch_compaction_task_request::Task::ValidationTask(validation_task) => { + validate_ssts(validation_task, context.sstable_store.clone()).await; + } + dispatch_compaction_task_request::Task::CancelCompactTask(cancel_compact_task) => { + if let Some(tx) = shutdown + .lock() + .unwrap() + .remove(&cancel_compact_task.task_id) + { + if tx.send(()).is_err() { + tracing::warn!( + "Cancellation of compaction task failed. task_id: {}", + cancel_compact_task.task_id + ); + } + } else { + tracing::warn!( + "Attempting to cancel non-existent compaction task. 
task_id: {}", + cancel_compact_task.task_id + ); + } + } + } + }); + } + None => continue 'consume_stream, + } + } + }); + (join_handle, shutdown_tx) +} + +fn get_task_progress( + task_progress: Arc< + parking_lot::lock_api::Mutex>>, + >, +) -> Vec { + let mut progress_list = Vec::new(); + for (&task_id, progress) in &*task_progress.lock() { + progress_list.push(CompactTaskProgress { + task_id, + num_ssts_sealed: progress.num_ssts_sealed.load(Ordering::Relaxed), + num_ssts_uploaded: progress.num_ssts_uploaded.load(Ordering::Relaxed), + num_progress_key: progress.num_progress_key.load(Ordering::Relaxed), + num_pending_read_io: progress.num_pending_read_io.load(Ordering::Relaxed) as u64, + num_pending_write_io: progress.num_pending_write_io.load(Ordering::Relaxed) as u64, + }); + } + progress_list +} diff --git a/src/storage/src/hummock/compactor/shared_buffer_compact.rs b/src/storage/src/hummock/compactor/shared_buffer_compact.rs index 0eca74f1dcaba..428361237c0ac 100644 --- a/src/storage/src/hummock/compactor/shared_buffer_compact.rs +++ b/src/storage/src/hummock/compactor/shared_buffer_compact.rs @@ -30,7 +30,7 @@ use risingwave_hummock_sdk::{CompactionGroupId, HummockEpoch, LocalSstableInfo}; use risingwave_pb::hummock::compact_task; use tracing::error; -use crate::filter_key_extractor::FilterKeyExtractorImpl; +use crate::filter_key_extractor::{FilterKeyExtractorImpl, FilterKeyExtractorManager}; use crate::hummock::compactor::compaction_filter::DummyCompactionFilter; use crate::hummock::compactor::context::CompactorContext; use crate::hummock::compactor::{CompactOutput, Compactor}; @@ -59,6 +59,7 @@ pub async fn compact( sstable_object_id_manager: SstableObjectIdManagerRef, payload: UploadTaskPayload, compaction_group_index: Arc>, + filter_key_extractor_manager: FilterKeyExtractorManager, ) -> HummockResult> { let mut grouped_payload: HashMap = HashMap::new(); for imm in payload { @@ -86,6 +87,7 @@ pub async fn compact( compact_shared_buffer( context.clone(), sstable_object_id_manager.clone(), + filter_key_extractor_manager.clone(), group_payload, ) .map_ok(move |results| { @@ -112,6 +114,7 @@ pub async fn compact( async fn compact_shared_buffer( context: CompactorContext, sstable_object_id_manager: SstableObjectIdManagerRef, + filter_key_extractor_manager: FilterKeyExtractorManager, mut payload: UploadTaskPayload, ) -> HummockResult> { // Local memory compaction looks at all key ranges. 
@@ -124,8 +127,7 @@ async fn compact_shared_buffer( assert!(!existing_table_ids.is_empty()); - let multi_filter_key_extractor = context - .filter_key_extractor_manager + let multi_filter_key_extractor = filter_key_extractor_manager .acquire(existing_table_ids.clone()) .await?; if let FilterKeyExtractorImpl::Multi(multi) = &multi_filter_key_extractor { diff --git a/src/storage/src/hummock/event_handler/hummock_event_handler.rs b/src/storage/src/hummock/event_handler/hummock_event_handler.rs index d9e25ebe46555..c55b73e6af6b0 100644 --- a/src/storage/src/hummock/event_handler/hummock_event_handler.rs +++ b/src/storage/src/hummock/event_handler/hummock_event_handler.rs @@ -30,6 +30,7 @@ use tracing::{error, info, trace, warn}; use super::refiller::{CacheRefillConfig, CacheRefiller}; use super::{LocalInstanceGuard, LocalInstanceId, ReadVersionMappingType}; +use crate::filter_key_extractor::FilterKeyExtractorManager; use crate::hummock::compactor::{compact, CompactorContext}; use crate::hummock::conflict_detector::ConflictDetector; use crate::hummock::event_handler::refiller::CacheRefillerEvent; @@ -133,6 +134,7 @@ async fn flush_imms( payload: UploadTaskPayload, task_info: UploadTaskInfo, compactor_context: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, sstable_object_id_manager: Arc, ) -> HummockResult> { for epoch in &task_info.epochs { @@ -148,6 +150,7 @@ async fn flush_imms( sstable_object_id_manager, payload, task_info.compaction_group_index, + filter_key_extractor_manager, ) .verbose_instrument_await("shared_buffer_compact") .await @@ -159,6 +162,7 @@ impl HummockEventHandler { hummock_event_rx: mpsc::UnboundedReceiver, pinned_version: PinnedVersion, compactor_context: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, sstable_object_id_manager: Arc, state_store_metrics: Arc, cache_refill_config: CacheRefillConfig, @@ -184,6 +188,7 @@ impl HummockEventHandler { payload, task_info, upload_compactor_context.clone(), + filter_key_extractor_manager.clone(), cloned_sstable_object_id_manager.clone(), )) }), diff --git a/src/storage/src/hummock/event_handler/refiller.rs b/src/storage/src/hummock/event_handler/refiller.rs index 131fec93c774e..e9eabb952d2d2 100644 --- a/src/storage/src/hummock/event_handler/refiller.rs +++ b/src/storage/src/hummock/event_handler/refiller.rs @@ -19,6 +19,7 @@ use std::sync::{Arc, LazyLock}; use std::task::{ready, Context, Poll}; use std::time::{Duration, Instant}; +use foyer::common::code::Key; use futures::future::{join_all, try_join_all}; use futures::{Future, FutureExt}; use itertools::Itertools; @@ -28,12 +29,16 @@ use prometheus::{ register_int_gauge_with_registry, Histogram, HistogramVec, IntGauge, Registry, }; use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY; +use risingwave_common::util::iter_util::ZipEqFast; use risingwave_hummock_sdk::compaction_group::hummock_version_ext::SstDeltaInfo; use tokio::sync::Semaphore; use tokio::task::JoinHandle; +use crate::hummock::file_cache::preclude::*; use crate::hummock::local_version::pinned_version::PinnedVersion; -use crate::hummock::{HummockResult, SstableStoreRef, TableHolder}; +use crate::hummock::{ + Block, HummockError, HummockResult, Sstable, SstableBlockIndex, SstableStoreRef, TableHolder, +}; use crate::monitor::StoreLocalStatistic; pub static GLOBAL_CACHE_REFILL_METRICS: LazyLock = @@ -42,6 +47,7 @@ pub static GLOBAL_CACHE_REFILL_METRICS: LazyLock = pub struct CacheRefillMetrics { pub refill_duration: HistogramVec, pub refill_total: 
GenericCounterVec, + pub refill_bytes: GenericCounterVec, pub data_refill_success_duration: Histogram, pub meta_refill_success_duration: Histogram, @@ -51,6 +57,9 @@ pub struct CacheRefillMetrics { pub data_refill_started_total: GenericCounter, pub meta_refill_attempts_total: GenericCounter, + pub data_refill_ideal_bytes: GenericCounter, + pub data_refill_success_bytes: GenericCounter, + pub refill_queue_total: IntGauge, } @@ -70,6 +79,13 @@ impl CacheRefillMetrics { registry, ) .unwrap(); + let refill_bytes = register_int_counter_vec_with_registry!( + "refill_bytes", + "refill bytes", + &["type", "op"], + registry, + ) + .unwrap(); let data_refill_success_duration = refill_duration .get_metric_with_label_values(&["data", "success"]) @@ -91,6 +107,13 @@ impl CacheRefillMetrics { .get_metric_with_label_values(&["meta", "attempts"]) .unwrap(); + let data_refill_ideal_bytes = refill_bytes + .get_metric_with_label_values(&["data", "ideal"]) + .unwrap(); + let data_refill_success_bytes = refill_bytes + .get_metric_with_label_values(&["data", "success"]) + .unwrap(); + let refill_queue_total = register_int_gauge_with_registry!( "refill_queue_total", "refill queue total", @@ -101,6 +124,7 @@ impl CacheRefillMetrics { Self { refill_duration, refill_total, + refill_bytes, data_refill_success_duration, meta_refill_success_duration, @@ -108,6 +132,10 @@ impl CacheRefillMetrics { data_refill_attempts_total, data_refill_started_total, meta_refill_attempts_total, + + data_refill_ideal_bytes, + data_refill_success_bytes, + refill_queue_total, } } @@ -115,9 +143,22 @@ impl CacheRefillMetrics { #[derive(Debug)] pub struct CacheRefillConfig { + /// Cache refill timeout. pub timeout: Duration, + + /// Data file cache refill levels. pub data_refill_levels: HashSet, + + /// Data file cache refill concurrency. pub concurrency: usize, + + /// Data file cache refill unit (blocks). + pub unit: usize, + + /// Data file cache reill unit threshold. + /// + /// Only units whose admit rate > threshold will be refilled. + pub threshold: f64, } struct Item { @@ -265,10 +306,8 @@ impl CacheRefillTask { delta: &SstDeltaInfo, holders: Vec, ) { - let now = Instant::now(); - // return if data file cache is disabled - let Some(filter) = context.sstable_store.data_file_cache_refill_filter() else { + let Some(filter) = context.sstable_store.data_recent_filter() else { return; }; @@ -294,29 +333,126 @@ impl CacheRefillTask { let mut tasks = vec![]; for sst_info in &holders { let task = async move { - GLOBAL_CACHE_REFILL_METRICS.data_refill_attempts_total.inc(); + if let Err(e) = Self::data_file_cache_refill_impl(context, sst_info.value()).await { + tracing::warn!("data cache refill error: {:?}", e); + } + }; + tasks.push(task); + } + + join_all(tasks).await; + } + + async fn data_file_cache_refill_impl( + context: &CacheRefillContext, + sst: &Sstable, + ) -> HummockResult<()> { + let sstable_store = &context.sstable_store; + let object_id = sst.id; + let unit = context.config.unit; + let threshold = context.config.threshold; + + if let Some(filter) = sstable_store.data_recent_filter() { + filter.insert(object_id); + } + + let mut tasks = vec![]; + + // unit-level refill: + // + // Although file cache receivces item by block, a larger range of data is still recommended to reduce + // S3 iops and per request base latency waste. + // + // To decide which unit to refill, we calculate the ratio that the block of a unit will be received by + // file cache. 
If the ratio is higher than a threshold, we fetich and refill the whole unit by block. + + for block_index_start in (0..sst.block_count()).step_by(unit) { + let block_index_end = std::cmp::min(block_index_start + unit, sst.block_count()); + + let (range_first, _) = sst.calculate_block_info(block_index_start); + let (range_last, _) = sst.calculate_block_info(block_index_end - 1); + let range = range_first.start..range_last.end; + + GLOBAL_CACHE_REFILL_METRICS + .data_refill_ideal_bytes + .inc_by((range.end - range.start) as u64); + + let mut writers = Vec::with_capacity(block_index_end - block_index_start); + let mut ranges = Vec::with_capacity(block_index_end - block_index_start); + let mut admits = 0; + + for block_index in block_index_start..block_index_end { + let (range, uncompressed_capacity) = sst.calculate_block_info(block_index); + let key = SstableBlockIndex { + sst_id: object_id, + block_idx: block_index as u64, + }; + let mut writer = sstable_store + .data_file_cache() + .writer(key, key.serialized_len() + uncompressed_capacity); + + if writer.judge() { + admits += 1; + } + + writers.push(writer); + ranges.push(range); + } + + if admits as f64 / writers.len() as f64 >= threshold { + let task = async move { + GLOBAL_CACHE_REFILL_METRICS.data_refill_attempts_total.inc(); - let permit = context.concurrency.acquire().await.unwrap(); + let permit = context.concurrency.acquire().await.unwrap(); - GLOBAL_CACHE_REFILL_METRICS.data_refill_started_total.inc(); + GLOBAL_CACHE_REFILL_METRICS.data_refill_started_total.inc(); - match context - .sstable_store - .fill_data_file_cache(sst_info.value()) - .await - { - Ok(()) => GLOBAL_CACHE_REFILL_METRICS + let timer = GLOBAL_CACHE_REFILL_METRICS .data_refill_success_duration - .observe(now.elapsed().as_secs_f64()), - Err(e) => { - tracing::warn!("data cache refill error: {:?}", e); + .start_timer(); + + let data = sstable_store + .store() + .read(&sstable_store.get_sst_data_path(object_id), range.clone()) + .await?; + let mut futures = vec![]; + for (mut writer, r) in writers.into_iter().zip_eq_fast(ranges) { + let offset = r.start - range.start; + let len = r.end - r.start; + let bytes = data.slice(offset..offset + len); + + let future = async move { + let block = Block::decode( + bytes, + writer.weight() - writer.key().serialized_len(), + )?; + let block = Box::new(block); + writer.force(); + let res = writer.finish(block).await.map_err(HummockError::file_cache); + if matches!(res, Ok(true)) { + GLOBAL_CACHE_REFILL_METRICS + .data_refill_success_bytes + .inc_by(len as u64); + } + res + }; + futures.push(future); } - } - drop(permit); - }; - tasks.push(task); + try_join_all(futures) + .await + .map_err(HummockError::file_cache)?; + + drop(permit); + drop(timer); + + Ok::<_, HummockError>(()) + }; + tasks.push(task); + } } - join_all(tasks).await; + try_join_all(tasks).await?; + + Ok(()) } } diff --git a/src/storage/src/hummock/file_cache/store.rs b/src/storage/src/hummock/file_cache/store.rs index fd549cbc2a96c..9de54552ae077 100644 --- a/src/storage/src/hummock/file_cache/store.rs +++ b/src/storage/src/hummock/file_cache/store.rs @@ -20,29 +20,25 @@ use std::time::Duration; use bytes::{Buf, BufMut, Bytes}; use foyer::common::code::{Key, Value}; -use foyer::storage::admission::rated_random::RatedRandomAdmissionPolicy; +use foyer::intrusive::eviction::lfu::LfuConfig; +use foyer::storage::admission::rated_ticket::RatedTicketAdmissionPolicy; use foyer::storage::admission::AdmissionPolicy; -use foyer::storage::event::EventListener; +use 
foyer::storage::device::fs::FsDeviceConfig; pub use foyer::storage::metrics::set_metrics_registry as set_foyer_metrics_registry; -use foyer::storage::store::FetchValueFuture; -use foyer::storage::LfuFsStoreConfig; -use risingwave_common::util::runtime::BackgroundShutdownRuntime; +use foyer::storage::reinsertion::ReinsertionPolicy; +use foyer::storage::runtime::{ + RuntimeConfig, RuntimeLazyStore, RuntimeLazyStoreConfig, RuntimeLazyStoreWriter, +}; +use foyer::storage::storage::{Storage, StorageWriter}; +use foyer::storage::store::{LfuFsStoreConfig, NoneStore, NoneStoreWriter}; use risingwave_hummock_sdk::HummockSstableObjectId; use crate::hummock::{Block, Sstable, SstableMeta}; -#[derive(thiserror::Error, Debug)] -pub enum FileCacheError { - #[error("foyer error: {0}")] - Foyer(#[from] foyer::storage::error::Error), - #[error("other {0}")] - Other(#[from] Box), -} - -impl FileCacheError { - fn foyer(e: foyer::storage::error::Error) -> Self { - Self::Foyer(e) - } +pub mod preclude { + pub use foyer::storage::storage::{ + AsyncStorageExt, ForceStorageExt, Storage, StorageExt, StorageWriter, + }; } pub type Result = core::result::Result; @@ -50,11 +46,11 @@ pub type Result = core::result::Result; pub type EvictionConfig = foyer::intrusive::eviction::lfu::LfuConfig; pub type DeviceConfig = foyer::storage::device::fs::FsDeviceConfig; -pub type FoyerStore = foyer::storage::LfuFsStore; -pub type FoyerStoreResult = foyer::storage::error::Result; -pub type FoyerStoreError = foyer::storage::error::Error; +pub type FileCacheResult = foyer::storage::error::Result; +pub type FileCacheError = foyer::storage::error::Error; -pub struct FoyerStoreConfig +#[derive(Debug)] +pub struct FileCacheConfig where K: Key, V: Value, @@ -73,331 +69,320 @@ where pub recover_concurrency: usize, pub lfu_window_to_cache_size_ratio: usize, pub lfu_tiny_lru_capacity_ratio: f64, - pub rated_random_rate: usize, - pub event_listener: Vec>>, - pub enable_filter: bool, + pub insert_rate_limit: usize, + pub allocator_bits: usize, + pub allocation_timeout: Duration, + pub admissions: Vec>>, + pub reinsertions: Vec>>, } -pub struct FoyerRuntimeConfig +impl Clone for FileCacheConfig where K: Key, V: Value, { - pub foyer_store_config: FoyerStoreConfig, - pub runtime_worker_threads: Option, + fn clone(&self) -> Self { + Self { + name: self.name.clone(), + dir: self.dir.clone(), + capacity: self.capacity, + file_capacity: self.file_capacity, + buffer_pool_size: self.buffer_pool_size, + device_align: self.device_align, + device_io_size: self.device_io_size, + flushers: self.flushers, + flush_rate_limit: self.flush_rate_limit, + reclaimers: self.reclaimers, + reclaim_rate_limit: self.reclaim_rate_limit, + recover_concurrency: self.recover_concurrency, + lfu_window_to_cache_size_ratio: self.lfu_window_to_cache_size_ratio, + lfu_tiny_lru_capacity_ratio: self.lfu_tiny_lru_capacity_ratio, + insert_rate_limit: self.insert_rate_limit, + allocator_bits: self.allocator_bits, + allocation_timeout: self.allocation_timeout, + admissions: self.admissions.clone(), + reinsertions: self.reinsertions.clone(), + } + } } -#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord, Hash)] -pub struct SstableBlockIndex { - pub sst_id: HummockSstableObjectId, - pub block_idx: u64, +#[derive(Debug)] +pub enum FileCacheWriter +where + K: Key, + V: Value, +{ + Foyer { + writer: RuntimeLazyStoreWriter, + }, + None { + writer: NoneStoreWriter, + }, } -impl Key for SstableBlockIndex { - fn serialized_len(&self) -> usize { - 8 + 8 // sst_id (8B) + block_idx (8B) - 
} - - fn write(&self, mut buf: &mut [u8]) { - buf.put_u64(self.sst_id); - buf.put_u64(self.block_idx); +impl From> for FileCacheWriter +where + K: Key, + V: Value, +{ + fn from(writer: RuntimeLazyStoreWriter) -> Self { + Self::Foyer { writer } } +} - fn read(mut buf: &[u8]) -> Self { - let sst_id = buf.get_u64(); - let block_idx = buf.get_u64(); - Self { sst_id, block_idx } +impl From> for FileCacheWriter +where + K: Key, + V: Value, +{ + fn from(writer: NoneStoreWriter) -> Self { + Self::None { writer } } } -impl Value for Box { - fn serialized_len(&self) -> usize { - self.raw_data().len() - } +impl StorageWriter for FileCacheWriter +where + K: Key, + V: Value, +{ + type Key = K; + type Value = V; - fn write(&self, mut buf: &mut [u8]) { - buf.put_slice(self.raw_data()) + fn key(&self) -> &Self::Key { + match self { + FileCacheWriter::Foyer { writer } => writer.key(), + FileCacheWriter::None { writer } => writer.key(), + } } - fn read(buf: &[u8]) -> Self { - let data = Bytes::copy_from_slice(buf); - let block = Block::decode_from_raw(data); - Box::new(block) + fn weight(&self) -> usize { + match self { + FileCacheWriter::Foyer { writer } => writer.weight(), + FileCacheWriter::None { writer } => writer.weight(), + } } -} -impl Value for Box { - fn serialized_len(&self) -> usize { - 8 + self.meta.encoded_size() // id (8B) + meta size + fn judge(&mut self) -> bool { + match self { + FileCacheWriter::Foyer { writer } => writer.judge(), + FileCacheWriter::None { writer } => writer.judge(), + } } - fn write(&self, mut buf: &mut [u8]) { - buf.put_u64(self.id); - // TODO(MrCroxx): avoid buffer copy - let mut buffer = vec![]; - self.meta.encode_to(&mut buffer); - buf.put_slice(&buffer[..]) + fn force(&mut self) { + match self { + FileCacheWriter::Foyer { writer } => writer.force(), + FileCacheWriter::None { writer } => writer.force(), + } } - fn read(mut buf: &[u8]) -> Self { - let id = buf.get_u64(); - let meta = SstableMeta::decode(buf).unwrap(); - Box::new(Sstable::new(id, meta)) + async fn finish(self, value: Self::Value) -> FileCacheResult { + match self { + FileCacheWriter::Foyer { writer } => writer.finish(value).await, + FileCacheWriter::None { writer } => writer.finish(value).await, + } } } -#[derive(Clone)] +#[derive(Debug)] pub enum FileCache where - K: Key + Copy, + K: Key, V: Value, { - None, - FoyerRuntime { - runtime: Arc, - store: Arc>, - enable_filter: bool, - }, + Foyer { store: RuntimeLazyStore }, + None { store: NoneStore }, +} + +impl Clone for FileCache +where + K: Key, + V: Value, +{ + fn clone(&self) -> Self { + match self { + Self::Foyer { store } => Self::Foyer { + store: store.clone(), + }, + Self::None { store } => Self::None { + store: store.clone(), + }, + } + } } impl FileCache where - K: Key + Copy, + K: Key, V: Value, { pub fn none() -> Self { - Self::None + Self::None { + store: NoneStore::default(), + } } +} - pub async fn foyer(config: FoyerRuntimeConfig) -> Result { - let mut builder = tokio::runtime::Builder::new_multi_thread(); - if let Some(runtime_worker_threads) = config.runtime_worker_threads { - builder.worker_threads(runtime_worker_threads); +impl Storage for FileCache +where + K: Key, + V: Value, +{ + type Config = FileCacheConfig; + type Key = K; + type Value = V; + type Writer = FileCacheWriter; + + async fn open(config: Self::Config) -> FileCacheResult { + let mut admissions = config.admissions; + if config.insert_rate_limit > 0 { + admissions.push(Arc::new(RatedTicketAdmissionPolicy::new( + config.insert_rate_limit, + ))); } - let runtime = builder 
- .thread_name("risingwave-foyer-storage") - .enable_all() - .build() - .map_err(|e| FileCacheError::Other(e.into()))?; - - let enable_filter = config.foyer_store_config.enable_filter; - - let store = runtime - .spawn(async move { - let foyer_store_config = config.foyer_store_config; - - let file_capacity = foyer_store_config.file_capacity; - let capacity = foyer_store_config.capacity; - let capacity = capacity - (capacity % file_capacity); - - let mut admissions: Vec>> = vec![]; - if foyer_store_config.rated_random_rate > 0 { - let rr = RatedRandomAdmissionPolicy::new( - foyer_store_config.rated_random_rate, - Duration::from_millis(100), - ); - admissions.push(Arc::new(rr)); - } - - let c = LfuFsStoreConfig { - name: foyer_store_config.name, - eviction_config: EvictionConfig { - window_to_cache_size_ratio: foyer_store_config - .lfu_window_to_cache_size_ratio, - tiny_lru_capacity_ratio: foyer_store_config.lfu_tiny_lru_capacity_ratio, - }, - device_config: DeviceConfig { - dir: foyer_store_config.dir.clone(), - capacity, - file_capacity, - align: foyer_store_config.device_align, - io_size: foyer_store_config.device_io_size, - }, - admissions, - reinsertions: vec![], - buffer_pool_size: foyer_store_config.buffer_pool_size, - flushers: foyer_store_config.flushers, - flush_rate_limit: foyer_store_config.flush_rate_limit, - reclaimers: foyer_store_config.reclaimers, - reclaim_rate_limit: foyer_store_config.reclaim_rate_limit, - recover_concurrency: foyer_store_config.recover_concurrency, - event_listeners: foyer_store_config.event_listener, - clean_region_threshold: foyer_store_config.reclaimers - + foyer_store_config.reclaimers / 2, - }; - - FoyerStore::open(c).await.map_err(FileCacheError::foyer) - }) - .await - .unwrap()?; - - Ok(Self::FoyerRuntime { - runtime: Arc::new(runtime.into()), - store, - enable_filter, - }) - } - #[tracing::instrument(skip(self, value))] - pub async fn insert(&self, key: K, value: V) -> Result { - match self { - FileCache::None => Ok(false), - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - runtime - .spawn(async move { store.insert_if_not_exists(key, value).await }) - .await - .unwrap() - .map_err(FileCacheError::foyer) + let c = RuntimeLazyStoreConfig { + store: LfuFsStoreConfig { + name: config.name.clone(), + eviction_config: LfuConfig { + window_to_cache_size_ratio: config.lfu_window_to_cache_size_ratio, + tiny_lru_capacity_ratio: config.lfu_tiny_lru_capacity_ratio, + }, + device_config: FsDeviceConfig { + dir: config.dir, + capacity: config.capacity, + file_capacity: config.file_capacity, + align: config.device_align, + io_size: config.device_io_size, + }, + allocator_bits: config.allocator_bits, + admissions, + reinsertions: config.reinsertions, + buffer_pool_size: config.buffer_pool_size, + flushers: config.flushers, + flush_rate_limit: config.flush_rate_limit, + reclaimers: config.reclaimers, + reclaim_rate_limit: config.reclaim_rate_limit, + allocation_timeout: config.allocation_timeout, + clean_region_threshold: config.reclaimers + config.reclaimers / 2, + recover_concurrency: config.recover_concurrency, } - } + .into(), + runtime: RuntimeConfig { + worker_threads: None, + thread_name: Some(config.name), + }, + }; + let store = RuntimeLazyStore::open(c).await?; + Ok(Self::Foyer { store }) } - #[tracing::instrument(skip(self))] - pub fn insert_without_wait(&self, key: K, value: V) { + fn is_ready(&self) -> bool { match self { - FileCache::None => {} - FileCache::FoyerRuntime { runtime, store, .. 
} => { - let store = store.clone(); - runtime.spawn(async move { store.insert_if_not_exists(key, value).await }); - } + FileCache::Foyer { store } => store.is_ready(), + FileCache::None { store } => store.is_ready(), } } - #[tracing::instrument(skip(self, value))] - pub async fn insert_force(&self, key: K, value: V) -> Result { + async fn close(&self) -> FileCacheResult<()> { match self { - FileCache::None => Ok(false), - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - runtime - .spawn(async move { store.insert_force(key, value).await }) - .await - .unwrap() - .map_err(FileCacheError::foyer) - } + FileCache::Foyer { store } => store.close().await, + FileCache::None { store } => store.close().await, } } - /// only fetch value if judge pass - #[tracing::instrument(skip(self, fetch_value))] - pub async fn insert_with( - &self, - key: K, - fetch_value: F, - value_serialized_len: usize, - ) -> Result - where - F: FnOnce() -> FU, - FU: FetchValueFuture, - { + fn writer(&self, key: Self::Key, weight: usize) -> Self::Writer { match self { - FileCache::None => Ok(false), - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - let future = fetch_value(); - runtime - .spawn(async move { - store - .insert_if_not_exists_with_future( - key, - || future, - key.serialized_len() + value_serialized_len, - ) - .await - }) - .await - .unwrap() - .map_err(FileCacheError::foyer) - } + FileCache::Foyer { store } => store.writer(key, weight).into(), + FileCache::None { store } => store.writer(key, weight).into(), } } - #[tracing::instrument(skip(self))] - pub async fn remove(&self, key: &K) -> Result { + fn exists(&self, key: &Self::Key) -> FileCacheResult { match self { - FileCache::None => Ok(false), - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - let key = *key; - runtime - .spawn(async move { store.remove(&key).await }) - .await - .unwrap() - .map_err(FileCacheError::foyer) - } + FileCache::Foyer { store } => store.exists(key), + FileCache::None { store } => store.exists(key), } } - #[tracing::instrument(skip(self))] - pub fn remove_without_wait(&self, key: &K) { + async fn lookup(&self, key: &Self::Key) -> FileCacheResult> { match self { - FileCache::None => {} - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - let key = *key; - runtime.spawn(async move { store.remove(&key).await }); - } + FileCache::Foyer { store } => store.lookup(key).await, + FileCache::None { store } => store.lookup(key).await, } } - #[tracing::instrument(skip(self))] - pub async fn clear(&self) -> Result<()> { + fn remove(&self, key: &Self::Key) -> FileCacheResult { match self { - FileCache::None => Ok(()), - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - runtime - .spawn(async move { store.clear().await }) - .await - .unwrap() - .map_err(FileCacheError::foyer) - } + FileCache::Foyer { store } => store.remove(key), + FileCache::None { store } => store.remove(key), } } - #[tracing::instrument(skip(self))] - pub fn clear_without_wait(&self) { + fn clear(&self) -> FileCacheResult<()> { match self { - FileCache::None => {} - FileCache::FoyerRuntime { runtime, store, .. 
} => { - let store = store.clone(); - runtime.spawn(async move { store.clear().await }); - } + FileCache::Foyer { store } => store.clear(), + FileCache::None { store } => store.clear(), } } +} - #[tracing::instrument(skip(self))] - pub async fn lookup(&self, key: &K) -> Result> { - match self { - FileCache::None => Ok(None), - FileCache::FoyerRuntime { runtime, store, .. } => { - let store = store.clone(); - let key = *key; - runtime - .spawn(async move { store.lookup(&key).await }) - .await - .unwrap() - .map_err(FileCacheError::foyer) - } - } +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord, Hash)] +pub struct SstableBlockIndex { + pub sst_id: HummockSstableObjectId, + pub block_idx: u64, +} + +impl Key for SstableBlockIndex { + fn serialized_len(&self) -> usize { + 8 + 8 // sst_id (8B) + block_idx (8B) } - #[tracing::instrument(skip(self))] - pub async fn exists(&self, key: &K) -> Result { - match self { - FileCache::None => Ok(false), - FileCache::FoyerRuntime { store, .. } => { - store.exists(key).map_err(FileCacheError::foyer) - } - } + fn write(&self, mut buf: &mut [u8]) { + buf.put_u64(self.sst_id); + buf.put_u64(self.block_idx); } - pub fn is_filter_enabled(&self) -> bool { - match self { - FileCache::None => false, - FileCache::FoyerRuntime { enable_filter, .. } => *enable_filter, - } + fn read(mut buf: &[u8]) -> Self { + let sst_id = buf.get_u64(); + let block_idx = buf.get_u64(); + Self { sst_id, block_idx } + } +} + +impl Value for Box { + fn serialized_len(&self) -> usize { + self.raw_data().len() + } + + fn write(&self, mut buf: &mut [u8]) { + buf.put_slice(self.raw_data()) + } + + fn read(buf: &[u8]) -> Self { + let data = Bytes::copy_from_slice(buf); + let block = Block::decode_from_raw(data); + Box::new(block) + } +} + +impl Value for Box { + fn serialized_len(&self) -> usize { + 8 + self.meta.encoded_size() // id (8B) + meta size + } + + fn write(&self, mut buf: &mut [u8]) { + buf.put_u64(self.id); + // TODO(MrCroxx): avoid buffer copy + let mut buffer = vec![]; + self.meta.encode_to(&mut buffer); + buf.put_slice(&buffer[..]) + } + + fn read(mut buf: &[u8]) -> Self { + let id = buf.get_u64(); + let meta = SstableMeta::decode(buf).unwrap(); + Box::new(Sstable::new(id, meta)) } } diff --git a/src/storage/src/hummock/iterator/test_utils.rs b/src/storage/src/hummock/iterator/test_utils.rs index a11e46e879dda..4845d7b43a0e4 100644 --- a/src/storage/src/hummock/iterator/test_utils.rs +++ b/src/storage/src/hummock/iterator/test_utils.rs @@ -67,6 +67,7 @@ pub fn mock_sstable_store_with_object_store(store: ObjectStoreRef) -> SstableSto 0, FileCache::none(), FileCache::none(), + None, )) } diff --git a/src/storage/src/hummock/mod.rs b/src/storage/src/hummock/mod.rs index 5634fbd56086a..60553b5aa09a3 100644 --- a/src/storage/src/hummock/mod.rs +++ b/src/storage/src/hummock/mod.rs @@ -25,7 +25,7 @@ use risingwave_pb::hummock::SstableInfo; mod block_cache; pub use block_cache::*; -mod file_cache; +pub mod file_cache; pub use file_cache::*; pub mod sstable; diff --git a/src/storage/src/hummock/sstable/block.rs b/src/storage/src/hummock/sstable/block.rs index 59ea0a6805b35..809f797bb11e8 100644 --- a/src/storage/src/hummock/sstable/block.rs +++ b/src/storage/src/hummock/sstable/block.rs @@ -563,8 +563,8 @@ impl BlockBuilder { /// # Format /// /// ```plain - /// compressed: | entries | restart point 0 (4B) | ... | restart point N-1 (4B) | N (4B) | restart point index 0 (5B)| ... 
| restart point index N-1 (5B) | N (4B) - /// uncompressed: | compression method (1B) | crc32sum (4B) | + /// compressed: | entries | restart point 0 (4B) | ... | restart point N-1 (4B) | N (4B) | restart point index 0 (5B)| ... | restart point index N-1 (5B) | N (4B) | table id (4B) + /// uncompressed: | compression method (1B) | xxhash64 checksum (8B) | /// ``` /// /// # Panics diff --git a/src/storage/src/hummock/sstable/builder.rs b/src/storage/src/hummock/sstable/builder.rs index 99af1e281cfe6..0cf7c2fd850a7 100644 --- a/src/storage/src/hummock/sstable/builder.rs +++ b/src/storage/src/hummock/sstable/builder.rs @@ -364,9 +364,6 @@ impl SstableBuilder { /// Finish building sst. /// - /// Unlike most LSM-Tree implementations, sstable meta and data are encoded separately. - /// Both meta and data has its own object (file). - /// /// # Format /// /// data: diff --git a/src/storage/src/hummock/sstable/forward_sstable_iterator.rs b/src/storage/src/hummock/sstable/forward_sstable_iterator.rs index edb6c372ba31c..3988d082177f8 100644 --- a/src/storage/src/hummock/sstable/forward_sstable_iterator.rs +++ b/src/storage/src/hummock/sstable/forward_sstable_iterator.rs @@ -235,7 +235,7 @@ impl SstableIterator { idx: usize, seek_key: Option>, ) -> HummockResult<()> { - tracing::trace!( + tracing::debug!( target: "events::storage::sstable::block_seek", "table iterator seek: sstable_object_id = {}, block_id = {}", self.sst.value().id, diff --git a/src/storage/src/hummock/sstable/sstable_object_id_manager.rs b/src/storage/src/hummock/sstable/sstable_object_id_manager.rs index 6ae7ddad4a7ea..69ca3712eb379 100644 --- a/src/storage/src/hummock/sstable/sstable_object_id_manager.rs +++ b/src/storage/src/hummock/sstable/sstable_object_id_manager.rs @@ -22,8 +22,9 @@ use std::sync::Arc; use itertools::Itertools; use parking_lot::Mutex; use risingwave_hummock_sdk::{HummockEpoch, HummockSstableObjectId, SstObjectIdRange}; +use risingwave_pb::hummock::GetNewSstIdsRequest; use risingwave_pb::meta::heartbeat_request::extra_info::Info; -use risingwave_rpc_client::{ExtraInfoSource, HummockMetaClient}; +use risingwave_rpc_client::{ExtraInfoSource, GrpcCompactorProxyClient, HummockMetaClient}; use sync_point::sync_point; use tokio::sync::oneshot; @@ -198,25 +199,95 @@ impl GetObjectId for Arc { } } +struct SharedComapctorObjectIdManagerCore { + output_object_ids: VecDeque, + client: Option, + sstable_id_remote_fetch_number: u32, +} +impl SharedComapctorObjectIdManagerCore { + pub fn new( + output_object_ids: VecDeque, + client: GrpcCompactorProxyClient, + sstable_id_remote_fetch_number: u32, + ) -> Self { + Self { + output_object_ids, + client: Some(client), + sstable_id_remote_fetch_number, + } + } + + pub fn for_test(output_object_ids: VecDeque) -> Self { + Self { + output_object_ids, + client: None, + sstable_id_remote_fetch_number: 0, + } + } +} /// `SharedComapctorObjectIdManager` is used to get output sst id for serverless compaction. 
#[derive(Clone)] pub struct SharedComapctorObjectIdManager { - output_object_ids: VecDeque, + core: Arc>, } impl SharedComapctorObjectIdManager { - pub fn new(output_object_ids: VecDeque) -> Self { - Self { output_object_ids } + pub fn new( + output_object_ids: VecDeque, + client: GrpcCompactorProxyClient, + sstable_id_remote_fetch_number: u32, + ) -> Self { + Self { + core: Arc::new(tokio::sync::Mutex::new( + SharedComapctorObjectIdManagerCore::new( + output_object_ids, + client, + sstable_id_remote_fetch_number, + ), + )), + } + } + + pub fn for_test(output_object_ids: VecDeque) -> Self { + Self { + core: Arc::new(tokio::sync::Mutex::new( + SharedComapctorObjectIdManagerCore::for_test(output_object_ids), + )), + } } } #[async_trait::async_trait] impl GetObjectId for SharedComapctorObjectIdManager { async fn get_new_sst_object_id(&mut self) -> HummockResult { - if let Some(first_element) = self.output_object_ids.pop_front() { + let mut guard = self.core.lock().await; + let core = guard.deref_mut(); + + if let Some(first_element) = core.output_object_ids.pop_front() { Ok(first_element) } else { - return Err(HummockError::other("Output object id runs out")); + tracing::warn!("The pre-allocated object ids are used up, and new object ids will be obtained through RPC."); + let request = GetNewSstIdsRequest { + number: core.sstable_id_remote_fetch_number, + }; + match core + .client + .as_mut() + .expect("GrpcCompactorProxyClient is None") + .get_new_sst_ids(request) + .await + { + Ok(response) => { + let resp = response.into_inner(); + let start_id = resp.start_id; + core.output_object_ids.extend((start_id + 1)..resp.end_id); + Ok(start_id) + } + Err(e) => Err(HummockError::other(format!( + "Failed to get new sst id: {}", + e + ))), + } } } } @@ -313,14 +384,10 @@ impl SstObjectIdTrackerInner { #[cfg(test)] mod test { - use std::collections::VecDeque; - use risingwave_common::try_match_expand; use crate::hummock::sstable::sstable_object_id_manager::AutoTrackerId; - use crate::hummock::{ - GetObjectId, SharedComapctorObjectIdManager, SstObjectIdTracker, TrackerId, - }; + use crate::hummock::{SstObjectIdTracker, TrackerId}; #[tokio::test] async fn test_object_id_tracker_basic() { @@ -390,18 +457,4 @@ mod test { object_id_tacker.remove_tracker(auto_id_3); assert!(object_id_tacker.tracking_object_ids().is_empty()); } - - #[tokio::test] - async fn test_shared_comapctor_object_id_manager() { - let mut pre_allocated_object_ids: VecDeque<_> = VecDeque::new(); - pre_allocated_object_ids.extend(vec![1, 3, 5]); - let mut object_id_manager = SharedComapctorObjectIdManager::new(pre_allocated_object_ids); - assert_eq!(object_id_manager.get_new_sst_object_id().await.unwrap(), 1); - - assert_eq!(object_id_manager.get_new_sst_object_id().await.unwrap(), 3); - - assert_eq!(object_id_manager.get_new_sst_object_id().await.unwrap(), 5); - - assert!(object_id_manager.get_new_sst_object_id().await.is_err()); - } } diff --git a/src/storage/src/hummock/sstable_store.rs b/src/storage/src/hummock/sstable_store.rs index 9d835409a6e28..73d6110cacd29 100644 --- a/src/storage/src/hummock/sstable_store.rs +++ b/src/storage/src/hummock/sstable_store.rs @@ -15,12 +15,10 @@ use std::clone::Clone; use std::future::Future; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::time::Duration; use await_tree::InstrumentAwait; use bytes::Bytes; use fail::fail_point; -use futures::future::try_join_all; use futures::{future, StreamExt}; use itertools::Itertools; use risingwave_common::cache::{CachePriority, LookupResponse, 
LruCacheEventListener}; @@ -41,6 +39,7 @@ use super::{ Block, BlockCache, BlockMeta, BlockResponse, FileCache, RecentFilter, Sstable, SstableBlockIndex, SstableMeta, SstableWriter, }; +use crate::hummock::file_cache::preclude::*; use crate::hummock::multi_builder::UploadJoinHandle; use crate::hummock::{ BlockHolder, CacheableEntry, HummockError, HummockResult, LruCache, MemoryLimiter, @@ -107,7 +106,7 @@ impl LruCacheEventListener for BlockCacheEventListener { sst_id: key.0, block_idx: key.1, }; - self.data_file_cache.insert_without_wait(key, value); + self.data_file_cache.insert_async(key, value); } } @@ -118,7 +117,7 @@ impl LruCacheEventListener for MetaCacheEventListener { type T = Box; fn on_release(&self, key: Self::K, value: Self::T) { - self.0.insert_without_wait(key, value); + self.0.insert_async(key, value); } } @@ -131,7 +130,7 @@ pub struct SstableStore { data_file_cache: FileCache>, meta_file_cache: FileCache>, - data_file_cache_refill_filter: Option>>, + recent_filter: Option>>, } impl SstableStore { @@ -143,6 +142,7 @@ impl SstableStore { high_priority_ratio: usize, data_file_cache: FileCache>, meta_file_cache: FileCache>, + recent_filter: Option>>, ) -> Self { // TODO: We should validate path early. Otherwise object store won't report invalid path // error until first write attempt. @@ -154,11 +154,6 @@ impl SstableStore { data_file_cache: data_file_cache.clone(), }); let meta_cache_listener = Arc::new(MetaCacheEventListener(meta_file_cache.clone())); - let data_file_cache_refill_filter = if data_file_cache.is_filter_enabled() { - Some(Arc::new(RecentFilter::new(6, Duration::from_secs(10)))) - } else { - None - }; Self { path, @@ -179,7 +174,7 @@ impl SstableStore { data_file_cache, meta_file_cache, - data_file_cache_refill_filter, + recent_filter, } } @@ -200,7 +195,7 @@ impl SstableStore { data_file_cache: FileCache::none(), meta_file_cache: FileCache::none(), - data_file_cache_refill_filter: None, + recent_filter: None, } } @@ -210,7 +205,9 @@ impl SstableStore { .delete(self.get_sst_data_path(object_id).as_str()) .await?; self.meta_cache.erase(object_id, &object_id); - self.meta_file_cache.remove_without_wait(&object_id); + self.meta_file_cache + .remove(&object_id) + .map_err(HummockError::file_cache)?; Ok(()) } @@ -230,7 +227,9 @@ impl SstableStore { // Delete from cache. 
for &object_id in object_id_list { self.meta_cache.erase(object_id, &object_id); - self.meta_file_cache.remove_without_wait(&object_id); + self.meta_file_cache + .remove(&object_id) + .map_err(HummockError::file_cache)?; } Ok(()) @@ -238,7 +237,9 @@ impl SstableStore { pub fn delete_cache(&self, object_id: HummockSstableObjectId) { self.meta_cache.erase(object_id, &object_id); - self.meta_file_cache.remove_without_wait(&object_id); + if let Err(e) = self.meta_file_cache.remove(&object_id) { + tracing::warn!("meta file cache remove error: {}", e); + } } async fn put_sst_data( @@ -304,7 +305,7 @@ impl SstableStore { policy }; - if let Some(filter) = self.data_file_cache_refill_filter.as_ref() { + if let Some(filter) = self.recent_filter.as_ref() { filter.insert(object_id); } @@ -317,7 +318,7 @@ impl SstableStore { )), CachePolicy::FillFileCache => { let block = fetch_block().await?; - self.data_file_cache.insert_without_wait( + self.data_file_cache.insert_async( SstableBlockIndex { sst_id: object_id, block_idx: block_index as u64, @@ -380,13 +381,17 @@ impl SstableStore { #[cfg(any(test, feature = "test"))] pub fn clear_block_cache(&self) { self.block_cache.clear(); - self.data_file_cache.clear_without_wait(); + if let Err(e) = self.data_file_cache.clear() { + tracing::warn!("data file cache clear error: {}", e); + } } #[cfg(any(test, feature = "test"))] pub fn clear_meta_cache(&self) { self.meta_cache.clear(); - self.meta_file_cache.clear_without_wait(); + if let Err(e) = self.meta_file_cache.clear() { + tracing::warn!("meta file cache clear error: {}", e); + } } /// Returns `table_holder` @@ -480,7 +485,7 @@ impl SstableStore { block_index: u64, block: Box, ) { - if let Some(filter) = self.data_file_cache_refill_filter.as_ref() { + if let Some(filter) = self.recent_filter.as_ref() { filter.insert(object_id); } self.block_cache @@ -515,54 +520,13 @@ impl SstableStore { )) } - pub fn data_file_cache_refill_filter( - &self, - ) -> Option<&Arc>> { - self.data_file_cache_refill_filter.as_ref() + pub fn data_recent_filter(&self) -> Option<&Arc>> { + self.recent_filter.as_ref() } pub fn data_file_cache(&self) -> &FileCache> { &self.data_file_cache } - - pub async fn fill_data_file_cache(&self, sst: &Sstable) -> HummockResult<()> { - let object_id = sst.id; - - if let Some(filter) = self.data_file_cache_refill_filter.as_ref() { - filter.insert(object_id); - } - - let data = self - .store - .read(&self.get_sst_data_path(object_id), ..) 
- .await?; - - let mut tasks = vec![]; - for block_index in 0..sst.block_count() { - let (range, uncompressed_capacity) = sst.calculate_block_info(block_index); - let bytes = data.slice(range); - let block = Block::decode(bytes, uncompressed_capacity)?; - let block = Box::new(block); - - let key = SstableBlockIndex { - sst_id: object_id, - block_idx: block_index as u64, - }; - - let cache = self.data_file_cache.clone(); - let task = async move { - cache - .insert_force(key, block) - .await - .map_err(HummockError::file_cache) - }; - tasks.push(task); - } - - try_join_all(tasks).await?; - - Ok(()) - } } pub type SstableStoreRef = Arc; @@ -742,7 +706,7 @@ impl SstableWriter for BatchUploadWriter { .await?; self.sstable_store.insert_meta_cache(self.object_id, meta); - if let Some(filter) = self.sstable_store.data_file_cache_refill_filter.as_ref() { + if let Some(filter) = self.sstable_store.recent_filter.as_ref() { filter.insert(self.object_id); } diff --git a/src/storage/src/hummock/store/hummock_storage.rs b/src/storage/src/hummock/store/hummock_storage.rs index a4bafcdb99c07..5e51fa1170b12 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -82,6 +82,8 @@ pub struct HummockStorage { context: CompactorContext, + filter_key_extractor_manager: FilterKeyExtractorManager, + sstable_object_id_manager: SstableObjectIdManagerRef, buffer_tracker: BufferTracker, @@ -154,14 +156,13 @@ impl HummockStorage { pin_version_rx, hummock_meta_client.clone(), )); - + let filter_key_extractor_manager = FilterKeyExtractorManager::RpcFilterKeyExtractorManager( + filter_key_extractor_manager.clone(), + ); let compactor_context = CompactorContext::new_local_compact_context( options.clone(), sstable_store.clone(), compactor_metrics.clone(), - FilterKeyExtractorManager::RpcFilterKeyExtractorManager( - filter_key_extractor_manager.clone(), - ), ); let seal_epoch = Arc::new(AtomicU64::new(pinned_version.max_committed_epoch())); @@ -171,6 +172,7 @@ impl HummockStorage { event_rx, pinned_version, compactor_context.clone(), + filter_key_extractor_manager.clone(), sstable_object_id_manager.clone(), state_store_metrics.clone(), CacheRefillConfig { @@ -181,11 +183,14 @@ impl HummockStorage { .copied() .collect(), concurrency: options.cache_refill_concurrency, + unit: options.cache_refill_unit, + threshold: options.cache_refill_threshold, }, ); let instance = Self { context: compactor_context, + filter_key_extractor_manager: filter_key_extractor_manager.clone(), sstable_object_id_manager, buffer_tracker: hummock_event_handler.buffer_tracker().clone(), version_update_notifier_tx: hummock_event_handler.version_update_notifier_tx(), @@ -348,7 +353,7 @@ impl HummockStorage { } pub fn filter_key_extractor_manager(&self) -> &FilterKeyExtractorManager { - &self.context.filter_key_extractor_manager + &self.filter_key_extractor_manager } pub fn get_memory_limiter(&self) -> Arc { diff --git a/src/storage/src/hummock/store/local_hummock_storage.rs b/src/storage/src/hummock/store/local_hummock_storage.rs index 18ae72300cd67..94b536bf6919f 100644 --- a/src/storage/src/hummock/store/local_hummock_storage.rs +++ b/src/storage/src/hummock/store/local_hummock_storage.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use await_tree::InstrumentAwait; use bytes::Bytes; use parking_lot::RwLock; +use prometheus::IntGauge; use risingwave_common::catalog::{TableId, TableOption}; use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; use risingwave_hummock_sdk::HummockEpoch; @@ -88,6 
+89,10 @@ pub struct LocalHummockStorage { write_limiter: WriteLimiterRef, version_update_notifier_tx: Arc>, + + mem_table_size: IntGauge, + + mem_table_item_count: IntGauge, } impl LocalHummockStorage { @@ -242,11 +247,20 @@ impl LocalStateStore for LocalHummockStorage { None => self.mem_table.insert(key, new_val)?, Some(old_val) => self.mem_table.update(key, old_val, new_val)?, }; + + self.mem_table_size + .set(self.mem_table.kv_size.size() as i64); + self.mem_table_item_count + .set(self.mem_table.buffer.len() as i64); Ok(()) } fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()> { self.mem_table.delete(key, old_val)?; + self.mem_table_size + .set(self.mem_table.kv_size.size() as i64); + self.mem_table_item_count + .set(self.mem_table.buffer.len() as i64); Ok(()) } @@ -254,6 +268,8 @@ impl LocalStateStore for LocalHummockStorage { &mut self, delete_ranges: Vec<(Bound, Bound)>, ) -> StorageResult { + self.mem_table_size.set(0); + self.mem_table_item_count.set(0); debug_assert!(delete_ranges .iter() .map(|(key, _)| key) @@ -454,6 +470,14 @@ impl LocalHummockStorage { version_update_notifier_tx: Arc>, ) -> Self { let stats = hummock_version_reader.stats().clone(); + let mem_table_size = stats.mem_table_memory_size.with_label_values(&[ + &option.table_id.to_string(), + &instance_guard.instance_id.to_string(), + ]); + let mem_table_item_count = stats.mem_table_item_count.with_label_values(&[ + &option.table_id.to_string(), + &instance_guard.instance_id.to_string(), + ]); Self { mem_table: MemTable::new(option.is_consistent_op), epoch: None, @@ -469,6 +493,8 @@ impl LocalHummockStorage { stats, write_limiter, version_update_notifier_tx, + mem_table_size, + mem_table_item_count, } } diff --git a/src/storage/src/hummock/store/version.rs b/src/storage/src/hummock/store/version.rs index 41273e51657c8..f2d5eca81b171 100644 --- a/src/storage/src/hummock/store/version.rs +++ b/src/storage/src/hummock/store/version.rs @@ -18,6 +18,7 @@ use std::collections::HashSet; use std::iter::once; use std::sync::Arc; +use await_tree::InstrumentAwait; use bytes::Bytes; use itertools::Itertools; use parking_lot::RwLock; @@ -924,7 +925,7 @@ impl HummockVersionReader { ); user_iter .rewind() - .instrument(tracing::trace_span!("rewind")) + .verbose_instrument_await("rewind") .await?; local_stats.found_key = user_iter.is_valid(); local_stats.sub_iter_count = local_stats.staging_imm_iter_count diff --git a/src/storage/src/lib.rs b/src/storage/src/lib.rs index 72b925170a6ef..c5ffe656ab893 100644 --- a/src/storage/src/lib.rs +++ b/src/storage/src/lib.rs @@ -14,7 +14,6 @@ #![feature(allocator_api)] #![feature(arc_unwrap_or_clone)] -#![feature(binary_heap_drain_sorted)] #![feature(bound_as_ref)] #![feature(bound_map)] #![feature(custom_test_frameworks)] diff --git a/src/storage/src/monitor/hummock_state_store_metrics.rs b/src/storage/src/monitor/hummock_state_store_metrics.rs index 77043b32ab455..1b4894256f11c 100644 --- a/src/storage/src/monitor/hummock_state_store_metrics.rs +++ b/src/storage/src/monitor/hummock_state_store_metrics.rs @@ -17,12 +17,18 @@ use std::sync::{Arc, OnceLock}; use prometheus::core::{AtomicU64, Collector, Desc, GenericCounter, GenericGauge}; use prometheus::{ exponential_buckets, histogram_opts, proto, register_histogram_vec_with_registry, - register_int_counter_vec_with_registry, register_int_gauge_with_registry, Gauge, IntGauge, - Opts, Registry, + register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry, + register_int_gauge_with_registry, Gauge, 
IntGauge, IntGaugeVec, Opts, Registry, }; use risingwave_common::config::MetricLevel; -use risingwave_common::metrics::{RelabeledCounterVec, RelabeledHistogramVec}; +use risingwave_common::metrics::{ + RelabeledCounterVec, RelabeledGuardedHistogramVec, RelabeledGuardedIntCounterVec, + RelabeledHistogramVec, +}; use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY; +use risingwave_common::{ + register_guarded_histogram_vec_with_registry, register_guarded_int_counter_vec_with_registry, +}; use tracing::warn; /// [`HummockStateStoreMetrics`] stores the performance and IO metrics of `XXXStore` such as @@ -35,17 +41,17 @@ pub struct HummockStateStoreMetrics { pub bloom_filter_true_negative_counts: RelabeledCounterVec, pub bloom_filter_check_counts: RelabeledCounterVec, pub iter_merge_sstable_counts: RelabeledHistogramVec, - pub sst_store_block_request_counts: RelabeledCounterVec, - pub iter_scan_key_counts: RelabeledCounterVec, + pub sst_store_block_request_counts: RelabeledGuardedIntCounterVec<2>, + pub iter_scan_key_counts: RelabeledGuardedIntCounterVec<2>, pub get_shared_buffer_hit_counts: RelabeledCounterVec, pub remote_read_time: RelabeledHistogramVec, - pub iter_fetch_meta_duration: RelabeledHistogramVec, + pub iter_fetch_meta_duration: RelabeledGuardedHistogramVec<1>, pub iter_fetch_meta_cache_unhits: IntGauge, pub iter_slow_fetch_meta_cache_unhits: IntGauge, - pub read_req_bloom_filter_positive_counts: RelabeledCounterVec, - pub read_req_positive_but_non_exist_counts: RelabeledCounterVec, - pub read_req_check_bloom_filter_counts: RelabeledCounterVec, + pub read_req_bloom_filter_positive_counts: RelabeledGuardedIntCounterVec<2>, + pub read_req_positive_but_non_exist_counts: RelabeledGuardedIntCounterVec<2>, + pub read_req_check_bloom_filter_counts: RelabeledGuardedIntCounterVec<2>, pub write_batch_tuple_counts: RelabeledCounterVec, pub write_batch_duration: RelabeledHistogramVec, @@ -67,6 +73,10 @@ pub struct HummockStateStoreMetrics { // uploading task pub uploader_uploading_task_size: GenericGauge, + + // memory + pub mem_table_memory_size: IntGaugeVec, + pub mem_table_item_count: IntGaugeVec, } pub static GLOBAL_HUMMOCK_STATE_STORE_METRICS: OnceLock = OnceLock::new(); @@ -81,6 +91,10 @@ impl HummockStateStoreMetrics { pub fn new(registry: &Registry, metric_level: MetricLevel) -> Self { // 10ms ~ max 2.7h let time_buckets = exponential_buckets(0.01, 10.0, 7).unwrap(); + + // 1ms - 100s + let state_store_read_time_buckets = exponential_buckets(0.001, 10.0, 5).unwrap(); + let bloom_filter_true_negative_counts = register_int_counter_vec_with_registry!( "state_store_bloom_filter_true_negative_counts", "Total number of sstables that have been considered true negative by bloom filters", @@ -122,27 +136,27 @@ impl HummockStateStoreMetrics { ); // ----- sst store ----- - let sst_store_block_request_counts = register_int_counter_vec_with_registry!( + let sst_store_block_request_counts = register_guarded_int_counter_vec_with_registry!( "state_store_sst_store_block_request_counts", "Total number of sst block requests that have been issued to sst store", &["table_id", "type"], registry ) .unwrap(); - let sst_store_block_request_counts = RelabeledCounterVec::with_metric_level( + let sst_store_block_request_counts = RelabeledGuardedIntCounterVec::with_metric_level( MetricLevel::Critical, sst_store_block_request_counts, metric_level, ); - let iter_scan_key_counts = register_int_counter_vec_with_registry!( + let iter_scan_key_counts = register_guarded_int_counter_vec_with_registry!( 
"state_store_iter_scan_key_counts", "Total number of keys read by iterator", &["table_id", "type"], registry ) .unwrap(); - let iter_scan_key_counts = RelabeledCounterVec::with_metric_level( + let iter_scan_key_counts = RelabeledGuardedIntCounterVec::with_metric_level( MetricLevel::Info, iter_scan_key_counts, metric_level, @@ -177,11 +191,11 @@ impl HummockStateStoreMetrics { let opts = histogram_opts!( "state_store_iter_fetch_meta_duration", "Histogram of iterator fetch SST meta time that have been issued to state store", - time_buckets.clone(), + state_store_read_time_buckets.clone(), ); let iter_fetch_meta_duration = - register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap(); - let iter_fetch_meta_duration = RelabeledHistogramVec::with_metric_level( + register_guarded_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap(); + let iter_fetch_meta_duration = RelabeledGuardedHistogramVec::with_metric_level( MetricLevel::Info, iter_fetch_meta_duration, metric_level, @@ -302,45 +316,64 @@ impl HummockStateStoreMetrics { .register(Box::new(uploader_uploading_task_size.clone())) .unwrap(); - let read_req_bloom_filter_positive_counts = register_int_counter_vec_with_registry!( + let read_req_bloom_filter_positive_counts = register_guarded_int_counter_vec_with_registry!( "state_store_read_req_bloom_filter_positive_counts", "Total number of read request with at least one SST bloom filter check returns positive", &["table_id", "type"], registry ) .unwrap(); - let read_req_bloom_filter_positive_counts = RelabeledCounterVec::with_metric_level( - MetricLevel::Info, - read_req_bloom_filter_positive_counts, - metric_level, - ); + let read_req_bloom_filter_positive_counts = + RelabeledGuardedIntCounterVec::with_metric_level( + MetricLevel::Info, + read_req_bloom_filter_positive_counts, + metric_level, + ); - let read_req_positive_but_non_exist_counts = register_int_counter_vec_with_registry!( + let read_req_positive_but_non_exist_counts = register_guarded_int_counter_vec_with_registry!( "state_store_read_req_positive_but_non_exist_counts", "Total number of read request on non-existent key/prefix with at least one SST bloom filter check returns positive", &["table_id", "type"], registry ) .unwrap(); - let read_req_positive_but_non_exist_counts = RelabeledCounterVec::with_metric_level( - MetricLevel::Info, - read_req_positive_but_non_exist_counts, - metric_level, - ); + let read_req_positive_but_non_exist_counts = + RelabeledGuardedIntCounterVec::with_metric_level( + MetricLevel::Info, + read_req_positive_but_non_exist_counts, + metric_level, + ); - let read_req_check_bloom_filter_counts = register_int_counter_vec_with_registry!( + let read_req_check_bloom_filter_counts = register_guarded_int_counter_vec_with_registry!( "state_store_read_req_check_bloom_filter_counts", "Total number of read request that checks bloom filter with a prefix hint", &["table_id", "type"], registry ) .unwrap(); - let read_req_check_bloom_filter_counts = RelabeledCounterVec::with_metric_level( + + let read_req_check_bloom_filter_counts = RelabeledGuardedIntCounterVec::with_metric_level( MetricLevel::Info, read_req_check_bloom_filter_counts, metric_level, ); + let mem_table_memory_size = register_int_gauge_vec_with_registry!( + "state_store_mem_table_memory_size", + "Memory usage of mem_table", + &["table_id", "instance_id"], + registry + ) + .unwrap(); + + let mem_table_item_count = register_int_gauge_vec_with_registry!( + "state_store_mem_table_item_count", + "Item counts in mem_table", + 
&["table_id", "instance_id"], + registry + ) + .unwrap(); + Self { bloom_filter_true_negative_counts, bloom_filter_check_counts, @@ -365,6 +398,8 @@ impl HummockStateStoreMetrics { spill_task_size_from_sealed: spill_task_size.with_label_values(&["sealed"]), spill_task_size_from_unsealed: spill_task_size.with_label_values(&["unsealed"]), uploader_uploading_task_size, + mem_table_memory_size, + mem_table_item_count, } } diff --git a/src/storage/src/monitor/monitored_storage_metrics.rs b/src/storage/src/monitor/monitored_storage_metrics.rs index a1517d98918ac..1a33a8bcb6ac1 100644 --- a/src/storage/src/monitor/monitored_storage_metrics.rs +++ b/src/storage/src/monitor/monitored_storage_metrics.rs @@ -19,20 +19,23 @@ use prometheus::{ register_histogram_with_registry, register_int_counter_vec_with_registry, Histogram, Registry, }; use risingwave_common::config::MetricLevel; -use risingwave_common::metrics::{RelabeledCounterVec, RelabeledHistogramVec}; +use risingwave_common::metrics::{ + RelabeledCounterVec, RelabeledGuardedHistogramVec, RelabeledHistogramVec, +}; use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY; +use risingwave_common::register_guarded_histogram_vec_with_registry; /// [`MonitoredStorageMetrics`] stores the performance and IO metrics of Storage. #[derive(Debug, Clone)] pub struct MonitoredStorageMetrics { - pub get_duration: RelabeledHistogramVec, + pub get_duration: RelabeledGuardedHistogramVec<1>, pub get_key_size: RelabeledHistogramVec, pub get_value_size: RelabeledHistogramVec, pub iter_size: RelabeledHistogramVec, pub iter_item: RelabeledHistogramVec, - pub iter_init_duration: RelabeledHistogramVec, - pub iter_scan_duration: RelabeledHistogramVec, + pub iter_init_duration: RelabeledGuardedHistogramVec<1>, + pub iter_scan_duration: RelabeledGuardedHistogramVec<1>, pub may_exist_duration: RelabeledHistogramVec, pub iter_in_process_counts: RelabeledCounterVec, @@ -88,15 +91,22 @@ impl MonitoredStorageMetrics { buckets.extend(exponential_buckets(0.001, 2.0, 5).unwrap()); // 1 ~ 16ms. buckets.extend(exponential_buckets(0.05, 4.0, 5).unwrap()); // 0.05 ~ 1.28s. 
buckets.push(16.0); // 16s + + // 1ms - 100s + let state_store_read_time_buckets = exponential_buckets(0.001, 10.0, 5).unwrap(); + let get_duration_opts = histogram_opts!( "state_store_get_duration", "Total latency of get that have been issued to state store", - buckets.clone(), + state_store_read_time_buckets.clone(), ); - let get_duration = - register_histogram_vec_with_registry!(get_duration_opts, &["table_id"], registry) - .unwrap(); - let get_duration = RelabeledHistogramVec::with_metric_level( + let get_duration = register_guarded_histogram_vec_with_registry!( + get_duration_opts, + &["table_id"], + registry + ) + .unwrap(); + let get_duration = RelabeledGuardedHistogramVec::with_metric_level( MetricLevel::Critical, get_duration, metric_level, @@ -125,11 +135,11 @@ impl MonitoredStorageMetrics { let opts = histogram_opts!( "state_store_iter_init_duration", "Histogram of the time spent on iterator initialization.", - buckets.clone(), + state_store_read_time_buckets.clone(), ); let iter_init_duration = - register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap(); - let iter_init_duration = RelabeledHistogramVec::with_metric_level( + register_guarded_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap(); + let iter_init_duration = RelabeledGuardedHistogramVec::with_metric_level( MetricLevel::Critical, iter_init_duration, metric_level, @@ -138,11 +148,11 @@ impl MonitoredStorageMetrics { let opts = histogram_opts!( "state_store_iter_scan_duration", "Histogram of the time spent on iterator scanning.", - buckets.clone(), + state_store_read_time_buckets.clone(), ); let iter_scan_duration = - register_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap(); - let iter_scan_duration = RelabeledHistogramVec::with_metric_level( + register_guarded_histogram_vec_with_registry!(opts, &["table_id"], registry).unwrap(); + let iter_scan_duration = RelabeledGuardedHistogramVec::with_metric_level( MetricLevel::Critical, iter_scan_duration, metric_level, diff --git a/src/storage/src/opts.rs b/src/storage/src/opts.rs index a26bd6b96467a..a3243cbc8c465 100644 --- a/src/storage/src/opts.rs +++ b/src/storage/src/opts.rs @@ -75,13 +75,19 @@ pub struct StorageOpts { pub data_file_cache_recover_concurrency: usize, pub data_file_cache_lfu_window_to_cache_size_ratio: usize, pub data_file_cache_lfu_tiny_lru_capacity_ratio: f64, - pub data_file_cache_rated_random_rate_mb: usize, + pub data_file_cache_insert_rate_limit_mb: usize, pub data_file_cache_flush_rate_limit_mb: usize, pub data_file_cache_reclaim_rate_limit_mb: usize, + pub data_file_cache_allocation_bits: usize, + pub data_file_cache_allocation_timeout_ms: usize, pub cache_refill_data_refill_levels: Vec, pub cache_refill_timeout_ms: u64, pub cache_refill_concurrency: usize, + pub cache_refill_recent_filter_layers: usize, + pub cache_refill_recent_filter_rotate_interval_ms: usize, + pub cache_refill_unit: usize, + pub cache_refill_threshold: f64, pub meta_file_cache_dir: String, pub meta_file_cache_capacity_mb: usize, @@ -94,9 +100,11 @@ pub struct StorageOpts { pub meta_file_cache_recover_concurrency: usize, pub meta_file_cache_lfu_window_to_cache_size_ratio: usize, pub meta_file_cache_lfu_tiny_lru_capacity_ratio: f64, - pub meta_file_cache_rated_random_rate_mb: usize, + pub meta_file_cache_insert_rate_limit_mb: usize, pub meta_file_cache_flush_rate_limit_mb: usize, pub meta_file_cache_reclaim_rate_limit_mb: usize, + pub meta_file_cache_allocation_bits: usize, + pub meta_file_cache_allocation_timeout_ms: 
usize, /// The storage url for storing backups. pub backup_storage_url: String, @@ -173,9 +181,11 @@ impl From<(&RwConfig, &SystemParamsReader, &StorageMemoryConfig)> for StorageOpt .storage .data_file_cache .lfu_tiny_lru_capacity_ratio, - data_file_cache_rated_random_rate_mb: c.storage.data_file_cache.rated_random_rate_mb, + data_file_cache_insert_rate_limit_mb: c.storage.data_file_cache.insert_rate_limit_mb, data_file_cache_flush_rate_limit_mb: c.storage.data_file_cache.flush_rate_limit_mb, data_file_cache_reclaim_rate_limit_mb: c.storage.data_file_cache.reclaim_rate_limit_mb, + data_file_cache_allocation_bits: c.storage.data_file_cache.allocation_bits, + data_file_cache_allocation_timeout_ms: c.storage.data_file_cache.allocation_timeout_ms, meta_file_cache_dir: c.storage.meta_file_cache.dir.clone(), meta_file_cache_capacity_mb: c.storage.meta_file_cache.capacity_mb, meta_file_cache_file_capacity_mb: c.storage.meta_file_cache.file_capacity_mb, @@ -193,12 +203,21 @@ impl From<(&RwConfig, &SystemParamsReader, &StorageMemoryConfig)> for StorageOpt .storage .meta_file_cache .lfu_tiny_lru_capacity_ratio, - meta_file_cache_rated_random_rate_mb: c.storage.meta_file_cache.rated_random_rate_mb, + meta_file_cache_insert_rate_limit_mb: c.storage.meta_file_cache.insert_rate_limit_mb, meta_file_cache_flush_rate_limit_mb: c.storage.meta_file_cache.flush_rate_limit_mb, meta_file_cache_reclaim_rate_limit_mb: c.storage.meta_file_cache.reclaim_rate_limit_mb, + meta_file_cache_allocation_bits: c.storage.meta_file_cache.allocation_bits, + meta_file_cache_allocation_timeout_ms: c.storage.meta_file_cache.allocation_timeout_ms, cache_refill_data_refill_levels: c.storage.cache_refill.data_refill_levels.clone(), cache_refill_timeout_ms: c.storage.cache_refill.timeout_ms, cache_refill_concurrency: c.storage.cache_refill.concurrency, + cache_refill_recent_filter_layers: c.storage.cache_refill.recent_filter_layers, + cache_refill_recent_filter_rotate_interval_ms: c + .storage + .cache_refill + .recent_filter_rotate_interval_ms, + cache_refill_unit: c.storage.cache_refill.unit, + cache_refill_threshold: c.storage.cache_refill.threshold, max_preload_wait_time_mill: c.storage.max_preload_wait_time_mill, object_store_streaming_read_timeout_ms: c .storage diff --git a/src/storage/src/row_serde/value_serde.rs b/src/storage/src/row_serde/value_serde.rs index 43156500f3f7c..5d56cdba2d96d 100644 --- a/src/storage/src/row_serde/value_serde.rs +++ b/src/storage/src/row_serde/value_serde.rs @@ -27,8 +27,8 @@ use risingwave_common::util::value_encoding::column_aware_row_encoding::{ }; use risingwave_common::util::value_encoding::error::ValueEncodingError; use risingwave_common::util::value_encoding::{ - BasicSerde, BasicSerializer, EitherSerde, ValueRowDeserializer, ValueRowSerdeKind, - ValueRowSerializer, + BasicSerde, BasicSerializer, DatumFromProtoExt, EitherSerde, ValueRowDeserializer, + ValueRowSerdeKind, ValueRowSerializer, }; use risingwave_expr::expr::build_from_prost; use risingwave_pb::plan_common::column_desc::GeneratedOrDefaultColumn; @@ -98,21 +98,27 @@ impl ValueRowSerdeNew for ColumnAwareSerde { } let column_with_default = table_columns.iter().enumerate().filter_map(|(i, c)| { - if c.is_default() { - if let GeneratedOrDefaultColumn::DefaultColumn(DefaultColumnDesc { expr }) = - c.generated_or_default_column.clone().unwrap() - { - Some(( - i, - build_from_prost(&expr.expect("expr should not be none")) - .expect("build_from_prost error") - .eval_row_infallible(&OwnedRow::empty(), |_err| {}) - .now_or_never() - 
.expect("constant expression should not be async"), - )) + if let Some(GeneratedOrDefaultColumn::DefaultColumn(DefaultColumnDesc { + snapshot_value, + expr, + })) = c.generated_or_default_column.clone() + { + // TODO: may not panic on error + let value = if let Some(snapshot_value) = snapshot_value { + // If there's a `snapshot_value`, we can use it directly. + Datum::from_protobuf(&snapshot_value, &c.data_type) + .expect("invalid default value") } else { - unreachable!() - } + // For backward compatibility, default columns in old tables may not have `snapshot_value`. + // In this case, we need to evaluate the expression to get the default value. + // It's okay since we previously banned impure expressions in default columns. + build_from_prost(&expr.expect("expr should not be none")) + .expect("build_from_prost error") + .eval_row_infallible(&OwnedRow::empty()) + .now_or_never() + .expect("constant expression should not be async") + }; + Some((i, value)) } else { None } diff --git a/src/storage/src/store_impl.rs b/src/storage/src/store_impl.rs index b2fc43dc40b06..8460b75ebc0dc 100644 --- a/src/storage/src/store_impl.rs +++ b/src/storage/src/store_impl.rs @@ -15,6 +15,7 @@ use std::fmt::Debug; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use enum_as_inner::EnumAsInner; use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY; @@ -23,10 +24,11 @@ use risingwave_object_store::object::parse_remote_object_store; use crate::error::StorageResult; use crate::filter_key_extractor::{RemoteTableAccessor, RpcFilterKeyExtractorManager}; +use crate::hummock::file_cache::preclude::*; use crate::hummock::hummock_meta_client::MonitoredHummockMetaClient; use crate::hummock::{ - set_foyer_metrics_registry, FileCache, FoyerRuntimeConfig, FoyerStoreConfig, HummockError, - HummockStorage, SstableStore, + set_foyer_metrics_registry, FileCache, FileCacheConfig, HummockError, HummockStorage, + RecentFilter, SstableStore, }; use crate::memory::sled::SledStateStore; use crate::memory::MemoryStateStore; @@ -525,12 +527,12 @@ impl StateStoreImpl { ) -> StorageResult { set_foyer_metrics_registry(GLOBAL_METRICS_REGISTRY.clone()); - let data_file_cache = if opts.data_file_cache_dir.is_empty() { - FileCache::none() + let (data_file_cache, recent_filter) = if opts.data_file_cache_dir.is_empty() { + (FileCache::none(), None) } else { const MB: usize = 1024 * 1024; - let foyer_store_config = FoyerStoreConfig { + let config = FileCacheConfig { name: "data".to_string(), dir: PathBuf::from(opts.data_file_cache_dir.clone()), capacity: opts.data_file_cache_capacity_mb * MB, @@ -540,22 +542,27 @@ impl StateStoreImpl { device_io_size: opts.data_file_cache_device_io_size, lfu_window_to_cache_size_ratio: opts.data_file_cache_lfu_window_to_cache_size_ratio, lfu_tiny_lru_capacity_ratio: opts.data_file_cache_lfu_tiny_lru_capacity_ratio, - rated_random_rate: opts.data_file_cache_rated_random_rate_mb * MB, + insert_rate_limit: opts.data_file_cache_insert_rate_limit_mb * MB, flushers: opts.data_file_cache_flushers, reclaimers: opts.data_file_cache_reclaimers, flush_rate_limit: opts.data_file_cache_flush_rate_limit_mb * MB, reclaim_rate_limit: opts.data_file_cache_reclaim_rate_limit_mb * MB, recover_concurrency: opts.data_file_cache_recover_concurrency, - event_listener: vec![], - enable_filter: !opts.cache_refill_data_refill_levels.is_empty(), - }; - let config = FoyerRuntimeConfig { - foyer_store_config, - runtime_worker_threads: None, + allocator_bits: opts.data_file_cache_allocation_bits, + allocation_timeout: 
Duration::from_millis( + opts.data_file_cache_allocation_timeout_ms as u64, + ), + admissions: vec![], + reinsertions: vec![], }; - FileCache::foyer(config) + let cache = FileCache::open(config) .await - .map_err(HummockError::file_cache)? + .map_err(HummockError::file_cache)?; + let filter = Some(Arc::new(RecentFilter::new( + opts.cache_refill_recent_filter_layers, + Duration::from_millis(opts.cache_refill_recent_filter_rotate_interval_ms as u64), + ))); + (cache, filter) }; let meta_file_cache = if opts.meta_file_cache_dir.is_empty() { @@ -563,7 +570,7 @@ impl StateStoreImpl { } else { const MB: usize = 1024 * 1024; - let foyer_store_config = FoyerStoreConfig { + let config = FileCacheConfig { name: "meta".to_string(), dir: PathBuf::from(opts.meta_file_cache_dir.clone()), capacity: opts.meta_file_cache_capacity_mb * MB, @@ -573,20 +580,20 @@ impl StateStoreImpl { device_io_size: opts.meta_file_cache_device_io_size, lfu_window_to_cache_size_ratio: opts.meta_file_cache_lfu_window_to_cache_size_ratio, lfu_tiny_lru_capacity_ratio: opts.meta_file_cache_lfu_tiny_lru_capacity_ratio, - rated_random_rate: opts.meta_file_cache_rated_random_rate_mb * MB, + insert_rate_limit: opts.meta_file_cache_insert_rate_limit_mb * MB, flushers: opts.meta_file_cache_flushers, reclaimers: opts.meta_file_cache_reclaimers, flush_rate_limit: opts.meta_file_cache_flush_rate_limit_mb * MB, reclaim_rate_limit: opts.meta_file_cache_reclaim_rate_limit_mb * MB, recover_concurrency: opts.meta_file_cache_recover_concurrency, - event_listener: vec![], - enable_filter: false, - }; - let config = FoyerRuntimeConfig { - foyer_store_config, - runtime_worker_threads: None, + allocator_bits: opts.meta_file_cache_allocation_bits, + allocation_timeout: Duration::from_millis( + opts.meta_file_cache_allocation_timeout_ms as u64, + ), + admissions: vec![], + reinsertions: vec![], }; - FileCache::foyer(config) + FileCache::open(config) .await .map_err(HummockError::file_cache)? }; @@ -614,6 +621,7 @@ impl StateStoreImpl { opts.high_priority_ratio, data_file_cache, meta_file_cache, + recent_filter, )); let notification_client = RpcNotificationClient::new(hummock_meta_client.get_inner().clone()); diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index e4eb65b8b9f42..f784c4a5e7ca2 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::assert_matches::assert_matches; use std::ops::Bound::{self, Excluded, Included, Unbounded}; use std::ops::{Index, RangeBounds}; use std::sync::Arc; @@ -46,7 +45,7 @@ use crate::row_serde::value_serde::{ValueRowSerde, ValueRowSerdeNew}; use crate::row_serde::{find_columns_by_ids, ColumnMapping}; use crate::store::{PrefetchOptions, ReadOptions}; use crate::table::merge_sort::merge_sort; -use crate::table::{compute_vnode, Distribution, KeyedRow, TableIter, DEFAULT_VNODE}; +use crate::table::{compute_vnode, Distribution, KeyedRow, TableIter}; use crate::StateStore; /// [`StorageTableInner`] is the interface accessing relational data in KV(`StateStore`) with @@ -418,24 +417,7 @@ impl StorageTableInner { _ => CachePolicy::Fill(CachePriority::High), }; - let raw_key_ranges = if !ordered - && matches!(encoded_key_range.start_bound(), Unbounded) - && matches!(encoded_key_range.end_bound(), Unbounded) - { - // If the range is unbounded and order is not required, we can create a single iterator - // for each continuous vnode range. - - // In this case, the `vnode_hint` must be default for singletons and `None` for - // distributed tables. - assert_eq!(vnode_hint.unwrap_or(DEFAULT_VNODE), DEFAULT_VNODE); - - Either::Left(self.vnodes.vnode_ranges().map(|r| { - let start = Included(Bytes::copy_from_slice(&r.start().to_be_bytes()[..])); - let end = end_bound_of_prefix(&r.end().to_be_bytes()); - assert_matches!(end, Excluded(_) | Unbounded); - (start, end) - })) - } else { + let raw_key_ranges = { // Vnodes that are set and should be accessed. let vnodes = match vnode_hint { // If `vnode_hint` is set, we can only access this single vnode. @@ -443,9 +425,7 @@ impl StorageTableInner { // Otherwise, we need to access all vnodes of this table. None => Either::Right(self.vnodes.iter_vnodes()), }; - Either::Right( - vnodes.map(|vnode| prefixed_range(encoded_key_range.clone(), &vnode.to_be_bytes())), - ) + vnodes.map(|vnode| prefixed_range(encoded_key_range.clone(), &vnode.to_be_bytes())) }; // For each key range, construct an iterator. @@ -493,7 +473,10 @@ impl StorageTableInner { 0 => unreachable!(), 1 => iterators.into_iter().next().unwrap(), // Concat all iterators if not to preserve order. - _ if !ordered => futures::stream::iter(iterators).flatten(), + _ if !ordered => { + futures::stream::iter(iterators.into_iter().map(Box::pin).collect_vec()) + .flatten_unordered(1024) + } // Merge all iterators if to preserve order. _ => merge_sort(iterators.into_iter().map(Box::pin).collect()), }; diff --git a/src/storage/src/table/mod.rs b/src/storage/src/table/mod.rs index d50e4ec0277ab..b6407528d5272 100644 --- a/src/storage/src/table/mod.rs +++ b/src/storage/src/table/mod.rs @@ -162,7 +162,7 @@ pub fn compute_vnode(row: impl Row, indices: &[usize], vnodes: &Bitmap) -> Virtu vnode }; - tracing::trace!(target: "events::storage::storage_table", "compute vnode: {:?} key {:?} => {}", row, indices, vnode); + tracing::debug!(target: "events::storage::storage_table", "compute vnode: {:?} key {:?} => {}", row, indices, vnode); vnode } diff --git a/src/storage/src/write_batch.rs b/src/storage/src/write_batch.rs deleted file mode 100644 index dc143f4a93855..0000000000000 --- a/src/storage/src/write_batch.rs +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2023 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::ops::Bound; - -use bytes::Bytes; -use risingwave_hummock_sdk::key::next_key; - -use crate::error::StorageResult; -use crate::hummock::HummockError; -use crate::storage_value::StorageValue; -use crate::store::{StateStoreWrite, WriteOptions}; - -/// [`WriteBatch`] wraps a list of key-value pairs and an associated [`crate::StateStore`]. -pub struct WriteBatch<'a, S: StateStoreWrite> { - store: &'a S, - - batch: Vec<(Bytes, StorageValue)>, - - delete_ranges: Vec<(Bound, Bound)>, - - write_options: WriteOptions, -} - -impl<'a, S: StateStoreWrite> WriteBatch<'a, S> { - /// Constructs a new, empty [`WriteBatch`] with the given `store`. - pub fn new(store: &'a S, write_options: WriteOptions) -> Self { - Self { - store, - batch: vec![], - delete_ranges: vec![], - write_options, - } - } - - /// Constructs a new, empty [`WriteBatch`] with the given `store` and specified capacity. - pub fn with_capacity(store: &'a S, capacity: usize, write_options: WriteOptions) -> Self { - Self { - store, - batch: Vec::with_capacity(capacity), - delete_ranges: vec![], - write_options, - } - } - - /// Puts a value. - pub fn put(&mut self, key: impl AsRef<[u8]>, value: StorageValue) { - self.do_push(key.as_ref(), value); - } - - /// Deletes a value. - pub fn delete(&mut self, key: impl AsRef<[u8]>) { - self.do_push(key.as_ref(), StorageValue::new_delete()); - } - - /// Delete all keys starting with `prefix`. - pub fn delete_prefix(&mut self, prefix: impl AsRef<[u8]>) { - let start_key = Bytes::from(prefix.as_ref().to_owned()); - let end_key = Bytes::from(next_key(&start_key)); - self.delete_ranges - .push((Bound::Included(start_key), Bound::Excluded(end_key))); - } - - /// Delete all keys in this range. - pub fn delete_range(&mut self, start: Bound>, end: Bound>) { - self.delete_ranges.push(( - start.map(|start| Bytes::from(start.as_ref().to_owned())), - end.map(|end| Bytes::from(end.as_ref().to_owned())), - )); - } - - /// Reserves capacity for at least `additional` more key-value pairs to be inserted in the - /// batch. - pub fn reserve(&mut self, additional: usize) { - self.batch.reserve(additional); - } - - /// Returns the number of key-value pairs in the batch. - pub fn len(&self) -> usize { - self.batch.len() - } - - /// Preprocesses the batch to make it sorted. It returns `false` if duplicate keys are found. - fn preprocess(&mut self) -> StorageResult<()> { - let original_length = self.batch.len(); - self.batch.sort_by(|(k1, _), (k2, _)| k1.cmp(k2)); - self.batch.dedup_by(|(k1, _), (k2, _)| k1 == k2); - - if original_length == self.batch.len() { - Ok(()) - } else { - Err(HummockError::invalid_write_batch().into()) - } - } - - /// Returns `true` if the batch contains no key-value pairs. - pub fn is_empty(&self) -> bool { - self.batch.is_empty() && self.delete_ranges.is_empty() - } - - /// Ingests this batch into the associated state store. 
- pub async fn ingest(mut self) -> StorageResult<()> { - if !self.is_empty() { - self.preprocess()?; - self.store - .ingest_batch(self.batch, self.delete_ranges, self.write_options) - .await?; - } - Ok(()) - } - - /// Pushes `key` and `value` into the `WriteBatch`. - fn do_push(&mut self, key: &[u8], value: StorageValue) { - let key = Bytes::from(key.to_vec()); - self.batch.push((key, value)); - } -} - -#[cfg(test)] -mod tests { - use bytes::Bytes; - - use crate::memory::MemoryStateStore; - use crate::storage_value::StorageValue; - use crate::store::{StateStoreWrite, WriteOptions}; - - #[tokio::test] - async fn test_invalid_write_batch() { - let state_store = MemoryStateStore::new(); - let mut batch = state_store.start_write_batch(WriteOptions { - epoch: 1, - table_id: Default::default(), - }); - - batch.put(Bytes::from("aa"), StorageValue::new_put("444")); - batch.put(Bytes::from("cc"), StorageValue::new_put("444")); - batch.put(Bytes::from("bb"), StorageValue::new_put("444")); - batch.delete(Bytes::from("aa")); - - batch - .ingest() - .await - .expect_err("Should panic here because of duplicate key."); - } -} diff --git a/src/stream/Cargo.toml b/src/stream/Cargo.toml index 79db63474cfd4..9e9e77b92ceec 100644 --- a/src/stream/Cargo.toml +++ b/src/stream/Cargo.toml @@ -21,32 +21,31 @@ async-stream = "0.3" async-trait = "0.1" await-tree = { workspace = true } bytes = "1" -dyn-clone = "1" educe = "0.4" either = "1" enum-as-inner = "0.6" futures = { version = "0.3", default-features = false, features = ["alloc"] } futures-async-stream = { workspace = true } -governor = { version = "0.6", default-features = false, features = ["std", "dashmap", "jitter"] } +governor = { version = "0.6", default-features = false, features = [ + "std", + "dashmap", + "jitter", +] } hytra = "0.1.2" -iter-chunks = "0.1" itertools = "0.11" local_stats_alloc = { path = "../utils/local_stats_alloc" } lru = { git = "https://github.com/risingwavelabs/lru-rs.git", rev = "cb2d7c7" } maplit = "1.0.2" memcomparable = "0.2" -multimap = "0.8" -num-traits = "0.2" +multimap = "0.9" parking_lot = "0.12" -parse-display = "0.8" pin-project = "1" prometheus = { version = "0.13", features = ["process"] } -prost = "0.11" +prost = { workspace = true } rand = "0.8" risingwave_common = { workspace = true } risingwave_connector = { workspace = true } risingwave_expr = { workspace = true } -risingwave_frontend = { workspace = true } risingwave_hummock_sdk = { workspace = true } risingwave_pb = { workspace = true } risingwave_rpc_client = { workspace = true } @@ -54,7 +53,6 @@ risingwave_source = { workspace = true } risingwave_storage = { workspace = true } serde_json = "1" smallvec = "1" -spin = "0.9" static_assertions = "1" thiserror = "1" tokio = { version = "0.2", package = "madsim-tokio", features = [ @@ -81,6 +79,7 @@ workspace-hack = { path = "../workspace-hack" } assert_matches = "1" criterion = { workspace = true, features = ["async_tokio", "async"] } expect-test = "1" +risingwave_expr_impl = { workspace = true } risingwave_hummock_test = { path = "../storage/hummock_test", features = [ "test", ] } diff --git a/src/stream/benches/stream_hash_agg.rs b/src/stream/benches/stream_hash_agg.rs index a5392f011afbb..62c45421c7f60 100644 --- a/src/stream/benches/stream_hash_agg.rs +++ b/src/stream/benches/stream_hash_agg.rs @@ -20,7 +20,7 @@ use risingwave_common::catalog::{Field, Schema}; use risingwave_common::field_generator::VarcharProperty; use risingwave_common::test_prelude::StreamChunkTestExt; use 
risingwave_common::types::DataType; -use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_expr::expr::*; use risingwave_storage::memory::MemoryStateStore; use risingwave_storage::StateStore; diff --git a/src/stream/clippy.toml b/src/stream/clippy.toml index 6f4d9099676af..a6969d5bd607b 100644 --- a/src/stream/clippy.toml +++ b/src/stream/clippy.toml @@ -1,7 +1,10 @@ disallowed-methods = [ { path = "std::iter::Iterator::zip", reason = "Please use Itertools::zip_eq instead." }, - { path = "risingwave_expr::expr::Expression::eval", reason = "Please use InfallibleExpression::eval_infallible instead." }, - { path = "risingwave_expr::expr::Expression::eval_row", reason = "Please use InfallibleExpression::eval_row_infallible instead." }, + + { path = "risingwave_expr::expr::build_from_prost", reason = "Expressions in streaming must be in non-strict mode. Please use `build_non_strict_from_prost` instead." }, + { path = "risingwave_expr::expr::build_func", reason = "Expressions in streaming must be in non-strict mode. Please use `build_func_non_strict` instead." }, + { path = "risingwave_expr::expr::Expression::eval", reason = "Please use `Expression::eval_infallible` instead." }, + { path = "risingwave_expr::expr::Expression::eval_row", reason = "Please use `Expression::eval_row_infallible` instead." }, { path = "risingwave_common::error::internal_err", reason = "Please use per-crate error type instead." }, { path = "risingwave_common::error::internal_error", reason = "Please use per-crate error type instead." }, diff --git a/src/stream/src/common/log_store/kv_log_store/mod.rs b/src/stream/src/common/log_store/kv_log_store/mod.rs deleted file mode 100644 index dc27d1f63b3e8..0000000000000 --- a/src/stream/src/common/log_store/kv_log_store/mod.rs +++ /dev/null @@ -1,561 +0,0 @@ -// Copyright 2023 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use risingwave_common::buffer::Bitmap; -use risingwave_common::catalog::{TableId, TableOption}; -use risingwave_pb::catalog::Table; -use risingwave_storage::store::NewLocalOptions; -use risingwave_storage::StateStore; - -use crate::common::log_store::kv_log_store::buffer::new_log_store_buffer; -use crate::common::log_store::kv_log_store::reader::KvLogStoreReader; -use crate::common::log_store::kv_log_store::serde::LogStoreRowSerde; -use crate::common::log_store::kv_log_store::writer::KvLogStoreWriter; -use crate::common::log_store::LogStoreFactory; - -mod buffer; -mod reader; -mod serde; -#[cfg(test)] -mod test_utils; -mod writer; - -type SeqIdType = i32; -type RowOpCodeType = i16; - -const FIRST_SEQ_ID: SeqIdType = 0; - -/// Readers truncate the offset at the granularity of seq id. -/// None `SeqIdType` means that the whole epoch is truncated. 
-type ReaderTruncationOffsetType = (u64, Option); - -pub struct KvLogStoreFactory { - state_store: S, - - table_catalog: Table, - - vnodes: Option>, - - max_stream_chunk_count: usize, -} - -impl KvLogStoreFactory { - pub fn new( - state_store: S, - table_catalog: Table, - vnodes: Option>, - max_stream_chunk_count: usize, - ) -> Self { - Self { - state_store, - table_catalog, - vnodes, - max_stream_chunk_count, - } - } -} - -impl LogStoreFactory for KvLogStoreFactory { - type Reader = KvLogStoreReader; - type Writer = KvLogStoreWriter; - - async fn build(self) -> (Self::Reader, Self::Writer) { - let table_id = TableId::new(self.table_catalog.id); - let serde = LogStoreRowSerde::new(&self.table_catalog, self.vnodes); - let local_state_store = self - .state_store - .new_local(NewLocalOptions { - table_id: TableId { - table_id: self.table_catalog.id, - }, - is_consistent_op: false, - table_option: TableOption { - retention_seconds: None, - }, - is_replicated: false, - }) - .await; - - let (tx, rx) = new_log_store_buffer(self.max_stream_chunk_count); - - let reader = KvLogStoreReader::new(table_id, self.state_store, serde.clone(), rx); - - let writer = KvLogStoreWriter::new(table_id, local_state_store, serde, tx); - - (reader, writer) - } -} - -#[cfg(test)] -mod tests { - use risingwave_common::util::epoch::EpochPair; - use risingwave_hummock_sdk::HummockReadEpoch; - use risingwave_hummock_test::test_utils::prepare_hummock_test_env; - use risingwave_storage::store::SyncResult; - use risingwave_storage::StateStore; - - use crate::common::log_store::kv_log_store::test_utils::{ - gen_stream_chunk, gen_test_log_store_table, - }; - use crate::common::log_store::kv_log_store::KvLogStoreFactory; - use crate::common::log_store::{ - LogReader, LogStoreFactory, LogStoreReadItem, LogWriter, TruncateOffset, - }; - - #[tokio::test] - async fn test_basic() { - for count in 0..20 { - test_basic_inner(count).await - } - } - - async fn test_basic_inner(max_stream_chunk_count: usize) { - let test_env = prepare_hummock_test_env().await; - - let table = gen_test_log_store_table(); - - test_env.register_table(table.clone()).await; - - let factory = KvLogStoreFactory::new( - test_env.storage.clone(), - table.clone(), - None, - max_stream_chunk_count, - ); - let (mut reader, mut writer) = factory.build().await; - - let stream_chunk1 = gen_stream_chunk(0); - let stream_chunk2 = gen_stream_chunk(10); - - let epoch1 = test_env - .storage - .get_pinned_version() - .version() - .max_committed_epoch - + 1; - writer - .init(EpochPair::new_test_epoch(epoch1)) - .await - .unwrap(); - writer.write_chunk(stream_chunk1.clone()).await.unwrap(); - let epoch2 = epoch1 + 1; - writer.flush_current_epoch(epoch2, false).await.unwrap(); - writer.write_chunk(stream_chunk2.clone()).await.unwrap(); - let epoch3 = epoch2 + 1; - writer.flush_current_epoch(epoch3, true).await.unwrap(); - - test_env.storage.seal_epoch(epoch1, false); - test_env.storage.seal_epoch(epoch2, true); - let sync_result: SyncResult = test_env.storage.sync(epoch2).await.unwrap(); - assert!(!sync_result.uncommitted_ssts.is_empty()); - - reader.init().await.unwrap(); - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. 
- }, - ) => { - assert_eq!(epoch, epoch1); - assert_eq!(stream_chunk1, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch1); - assert!(!is_checkpoint) - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch2); - assert_eq!(stream_chunk2, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch2); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - } - - #[tokio::test] - async fn test_recovery() { - for count in 0..20 { - test_recovery_inner(count).await - } - } - - async fn test_recovery_inner(max_stream_chunk_count: usize) { - let test_env = prepare_hummock_test_env().await; - - let table = gen_test_log_store_table(); - - test_env.register_table(table.clone()).await; - - let factory = KvLogStoreFactory::new( - test_env.storage.clone(), - table.clone(), - None, - max_stream_chunk_count, - ); - let (mut reader, mut writer) = factory.build().await; - - let stream_chunk1 = gen_stream_chunk(0); - let stream_chunk2 = gen_stream_chunk(10); - - let epoch1 = test_env - .storage - .get_pinned_version() - .version() - .max_committed_epoch - + 1; - writer - .init(EpochPair::new_test_epoch(epoch1)) - .await - .unwrap(); - writer.write_chunk(stream_chunk1.clone()).await.unwrap(); - let epoch2 = epoch1 + 1; - writer.flush_current_epoch(epoch2, false).await.unwrap(); - writer.write_chunk(stream_chunk2.clone()).await.unwrap(); - let epoch3 = epoch2 + 1; - writer.flush_current_epoch(epoch3, true).await.unwrap(); - - test_env.storage.seal_epoch(epoch1, false); - - reader.init().await.unwrap(); - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch1); - assert_eq!(stream_chunk1, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch1); - assert!(!is_checkpoint) - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. 
- }, - ) => { - assert_eq!(epoch, epoch2); - assert_eq!(stream_chunk2, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch2); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - - test_env.commit_epoch(epoch2).await; - // The truncate does not work because it is after the sync - reader - .truncate(TruncateOffset::Barrier { epoch: epoch2 }) - .await - .unwrap(); - test_env - .storage - .try_wait_epoch(HummockReadEpoch::Committed(epoch2)) - .await - .unwrap(); - - // Recovery - test_env.storage.clear_shared_buffer().await.unwrap(); - - // Rebuild log reader and writer in recovery - let factory = KvLogStoreFactory::new( - test_env.storage.clone(), - table.clone(), - None, - max_stream_chunk_count, - ); - let (mut reader, mut writer) = factory.build().await; - writer - .init(EpochPair::new_test_epoch(epoch3)) - .await - .unwrap(); - reader.init().await.unwrap(); - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch1); - assert_eq!(stream_chunk1, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch1); - assert!(!is_checkpoint) - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch2); - assert_eq!(stream_chunk2, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch2); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - } - - #[tokio::test] - async fn test_truncate() { - for count in 2..10 { - test_truncate_inner(count).await - } - } - - async fn test_truncate_inner(max_stream_chunk_count: usize) { - let test_env = prepare_hummock_test_env().await; - - let table = gen_test_log_store_table(); - - test_env.register_table(table.clone()).await; - - let factory = KvLogStoreFactory::new( - test_env.storage.clone(), - table.clone(), - None, - max_stream_chunk_count, - ); - let (mut reader, mut writer) = factory.build().await; - - let stream_chunk1_1 = gen_stream_chunk(0); - let stream_chunk1_2 = gen_stream_chunk(10); - let stream_chunk2 = gen_stream_chunk(20); - - let epoch1 = test_env - .storage - .get_pinned_version() - .version() - .max_committed_epoch - + 1; - writer - .init(EpochPair::new_test_epoch(epoch1)) - .await - .unwrap(); - writer.write_chunk(stream_chunk1_1.clone()).await.unwrap(); - writer.write_chunk(stream_chunk1_2.clone()).await.unwrap(); - let epoch2 = epoch1 + 1; - writer.flush_current_epoch(epoch2, true).await.unwrap(); - writer.write_chunk(stream_chunk2.clone()).await.unwrap(); - - test_env.commit_epoch(epoch1).await; - - reader.init().await.unwrap(); - let chunk_id1 = match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - chunk_id, - }, - ) => { - assert_eq!(epoch, epoch1); - assert_eq!(stream_chunk1_1, read_stream_chunk); - chunk_id - } - _ => unreachable!(), - }; - let chunk_id2 = match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - chunk_id, - }, - ) => { - assert_eq!(epoch, epoch1); - assert_eq!(stream_chunk1_2, 
read_stream_chunk); - chunk_id - } - _ => unreachable!(), - }; - assert!(chunk_id2 > chunk_id1); - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch1); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch2); - assert_eq!(stream_chunk2, read_stream_chunk); - } - _ => unreachable!(), - } - - // The truncate should work because it is before the flush - reader - .truncate(TruncateOffset::Chunk { - epoch: epoch1, - chunk_id: chunk_id1, - }) - .await - .unwrap(); - let epoch3 = epoch2 + 1; - writer.flush_current_epoch(epoch3, true).await.unwrap(); - - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch2); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - - // Truncation on epoch1 should work because it is before this sync - test_env.commit_epoch(epoch2).await; - test_env - .storage - .try_wait_epoch(HummockReadEpoch::Committed(epoch2)) - .await - .unwrap(); - - // Recovery - test_env.storage.clear_shared_buffer().await.unwrap(); - - // Rebuild log reader and writer in recovery - let factory = KvLogStoreFactory::new( - test_env.storage.clone(), - table.clone(), - None, - max_stream_chunk_count, - ); - let (mut reader, mut writer) = factory.build().await; - - writer - .init(EpochPair::new_test_epoch(epoch3)) - .await - .unwrap(); - let stream_chunk3 = gen_stream_chunk(30); - writer.write_chunk(stream_chunk3.clone()).await.unwrap(); - - reader.init().await.unwrap(); - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch1); - assert_eq!(stream_chunk1_2, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch1); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch2); - assert_eq!(stream_chunk2, read_stream_chunk); - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { - assert_eq!(epoch, epoch2); - assert!(is_checkpoint) - } - _ => unreachable!(), - } - match reader.next_item().await.unwrap() { - ( - epoch, - LogStoreReadItem::StreamChunk { - chunk: read_stream_chunk, - .. - }, - ) => { - assert_eq!(epoch, epoch3); - assert_eq!(stream_chunk3, read_stream_chunk); - } - _ => unreachable!(), - } - } -} diff --git a/src/stream/src/common/log_store/kv_log_store/test_utils.rs b/src/stream/src/common/log_store/kv_log_store/test_utils.rs deleted file mode 100644 index 8eb3a82fb742d..0000000000000 --- a/src/stream/src/common/log_store/kv_log_store/test_utils.rs +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2023 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use itertools::Itertools; -use risingwave_common::array::{Op, StreamChunk}; -use risingwave_common::catalog::{ColumnDesc, ColumnId, TableId}; -use risingwave_common::row::OwnedRow; -use risingwave_common::types::{DataType, ScalarImpl, ScalarRef}; -use risingwave_common::util::chunk_coalesce::DataChunkBuilder; -use risingwave_common::util::sort_util::OrderType; -use risingwave_pb::catalog::PbTable; - -use crate::common::table::test_utils::gen_prost_table; - -pub(crate) const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; - -pub(crate) fn gen_test_data(base: i64) -> (Vec, Vec) { - let ops = vec![Op::Insert, Op::Delete, Op::UpdateDelete, Op::UpdateInsert]; - let rows = vec![ - OwnedRow::new(vec![ - Some(ScalarImpl::Int64(1 + base)), - Some(ScalarImpl::Utf8("name1".to_owned_scalar())), - ]), - OwnedRow::new(vec![ - Some(ScalarImpl::Int64(2 + base)), - Some(ScalarImpl::Utf8("name2".to_owned_scalar())), - ]), - OwnedRow::new(vec![ - Some(ScalarImpl::Int64(3 + base)), - Some(ScalarImpl::Utf8("name3".to_owned_scalar())), - ]), - OwnedRow::new(vec![ - Some(ScalarImpl::Int64(3 + base)), - Some(ScalarImpl::Utf8("name4".to_owned_scalar())), - ]), - ]; - (ops, rows) -} - -pub(crate) fn test_payload_schema() -> Vec { - vec![ - ColumnDesc::unnamed(ColumnId::from(3), DataType::Int64), // id - ColumnDesc::unnamed(ColumnId::from(2), DataType::Varchar), // name - ] -} - -pub(crate) fn test_log_store_table_schema() -> Vec { - let mut column_descs = vec![ - ColumnDesc::unnamed(ColumnId::from(0), DataType::Int64), // epoch - ColumnDesc::unnamed(ColumnId::from(1), DataType::Int32), // Seq id - ColumnDesc::unnamed(ColumnId::from(2), DataType::Int16), // op code - ]; - column_descs.extend(test_payload_schema()); - column_descs -} - -pub(crate) fn gen_stream_chunk(base: i64) -> StreamChunk { - let (ops, rows) = gen_test_data(base); - let mut builder = DataChunkBuilder::new( - test_payload_schema() - .iter() - .map(|col| col.data_type.clone()) - .collect_vec(), - 1000000, - ); - for row in &rows { - assert!(builder.append_one_row(row).is_none()); - } - let data_chunk = builder.consume_all().unwrap(); - StreamChunk::from_parts(ops, data_chunk) -} - -pub(crate) fn gen_test_log_store_table() -> PbTable { - let schema = test_log_store_table_schema(); - let order_types = vec![OrderType::ascending(), OrderType::ascending_nulls_last()]; - let pk_index = vec![0_usize, 1_usize]; - let read_prefix_len_hint = 0; - gen_prost_table( - TEST_TABLE_ID, - schema, - order_types, - pk_index, - read_prefix_len_hint, - ) -} diff --git a/src/stream/src/common/log_store/mod.rs b/src/stream/src/common/log_store/mod.rs deleted file mode 100644 index 35f1a4145ec55..0000000000000 --- a/src/stream/src/common/log_store/mod.rs +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright 2023 RisingWave Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod in_mem; -pub mod kv_log_store; - -use std::cmp::Ordering; -use std::fmt::Debug; -use std::future::Future; -use std::sync::Arc; - -use anyhow::anyhow; -use risingwave_common::array::StreamChunk; -use risingwave_common::buffer::Bitmap; -use risingwave_common::util::epoch::EpochPair; -use risingwave_common::util::value_encoding::error::ValueEncodingError; -use risingwave_storage::error::StorageError; - -#[derive(thiserror::Error, Debug)] -pub enum LogStoreError { - #[error("EndOfLogStream")] - EndOfLogStream, - - #[error("Storage error: {0}")] - StorageError(#[from] StorageError), - - #[error(transparent)] - Internal(#[from] anyhow::Error), - - #[error("Value encoding error: {0}")] - ValueEncoding(#[from] ValueEncodingError), -} - -pub type LogStoreResult = Result; -pub type ChunkId = usize; - -#[derive(Debug, PartialEq, Copy, Clone)] -pub enum TruncateOffset { - Chunk { epoch: u64, chunk_id: ChunkId }, - Barrier { epoch: u64 }, -} - -impl PartialOrd for TruncateOffset { - fn partial_cmp(&self, other: &Self) -> Option { - let extract = |offset: &TruncateOffset| match offset { - TruncateOffset::Chunk { epoch, chunk_id } => (*epoch, *chunk_id), - TruncateOffset::Barrier { epoch } => (*epoch, usize::MAX), - }; - let this = extract(self); - let other = extract(other); - this.partial_cmp(&other) - } -} - -impl TruncateOffset { - pub fn next_chunk_id(&self) -> ChunkId { - match self { - TruncateOffset::Chunk { chunk_id, .. } => chunk_id + 1, - TruncateOffset::Barrier { .. } => 0, - } - } - - pub fn epoch(&self) -> u64 { - match self { - TruncateOffset::Chunk { epoch, .. } | TruncateOffset::Barrier { epoch } => *epoch, - } - } - - pub fn check_next_item_epoch(&self, epoch: u64) -> LogStoreResult<()> { - match self { - TruncateOffset::Chunk { - epoch: offset_epoch, - .. - } => { - if epoch != *offset_epoch { - return Err(anyhow!( - "new item epoch {} not match current chunk offset epoch {}", - epoch, - offset_epoch - ) - .into()); - } - } - TruncateOffset::Barrier { - epoch: offset_epoch, - } => { - if epoch <= *offset_epoch { - return Err(anyhow!( - "new item epoch {} not exceed barrier offset epoch {}", - epoch, - offset_epoch - ) - .into()); - } - } - } - Ok(()) - } -} - -#[derive(Debug)] -pub enum LogStoreReadItem { - StreamChunk { - chunk: StreamChunk, - chunk_id: ChunkId, - }, - Barrier { - is_checkpoint: bool, - }, - UpdateVnodeBitmap(Arc), -} - -pub trait LogWriter { - /// Initialize the log writer with an epoch - fn init(&mut self, epoch: EpochPair) -> impl Future> + Send + '_; - - /// Write a stream chunk to the log writer - fn write_chunk( - &mut self, - chunk: StreamChunk, - ) -> impl Future> + Send + '_; - - /// Mark current epoch as finished and sealed, and flush the unconsumed log data. - fn flush_current_epoch( - &mut self, - next_epoch: u64, - is_checkpoint: bool, - ) -> impl Future> + Send + '_; - - /// Update the vnode bitmap of the log writer - fn update_vnode_bitmap( - &mut self, - new_vnodes: Arc, - ) -> impl Future> + Send + '_; -} - -pub trait LogReader { - /// Initialize the log reader. Usually function as waiting for log writer to be initialized. 
- fn init(&mut self) -> impl Future> + Send + '_; - - /// Emit the next item. - fn next_item( - &mut self, - ) -> impl Future> + Send + '_; - - /// Mark that all items emitted so far have been consumed and it is safe to truncate the log - /// from the current offset. - fn truncate( - &mut self, - offset: TruncateOffset, - ) -> impl Future> + Send + '_; -} - -pub trait LogStoreFactory: 'static { - type Reader: LogReader + Send + 'static; - type Writer: LogWriter + Send + 'static; - - fn build(self) -> impl Future + Send; -} - -#[cfg(test)] -mod tests { - use crate::common::log_store::TruncateOffset; - - #[test] - fn test_truncate_offset_cmp() { - assert!( - TruncateOffset::Barrier { epoch: 232 } - < TruncateOffset::Chunk { - epoch: 233, - chunk_id: 1 - } - ); - assert_eq!( - TruncateOffset::Chunk { - epoch: 1, - chunk_id: 1 - }, - TruncateOffset::Chunk { - epoch: 1, - chunk_id: 1 - } - ); - assert!( - TruncateOffset::Chunk { - epoch: 1, - chunk_id: 1 - } < TruncateOffset::Chunk { - epoch: 1, - chunk_id: 2 - } - ); - assert!( - TruncateOffset::Barrier { epoch: 1 } - > TruncateOffset::Chunk { - epoch: 1, - chunk_id: 2 - } - ); - assert!( - TruncateOffset::Chunk { - epoch: 1, - chunk_id: 2 - } < TruncateOffset::Barrier { epoch: 1 } - ); - assert!( - TruncateOffset::Chunk { - epoch: 2, - chunk_id: 2 - } > TruncateOffset::Barrier { epoch: 1 } - ); - assert!(TruncateOffset::Barrier { epoch: 2 } > TruncateOffset::Barrier { epoch: 1 }); - } -} diff --git a/src/stream/src/common/log_store/in_mem.rs b/src/stream/src/common/log_store_impl/in_mem.rs similarity index 96% rename from src/stream/src/common/log_store/in_mem.rs rename to src/stream/src/common/log_store_impl/in_mem.rs index 01192f951b843..35040be82c93b 100644 --- a/src/stream/src/common/log_store/in_mem.rs +++ b/src/stream/src/common/log_store_impl/in_mem.rs @@ -18,16 +18,15 @@ use anyhow::anyhow; use risingwave_common::array::StreamChunk; use risingwave_common::buffer::Bitmap; use risingwave_common::util::epoch::{EpochPair, INVALID_EPOCH}; +use risingwave_connector::sink::log_store::{ + LogReader, LogStoreFactory, LogStoreReadItem, LogStoreResult, LogWriter, TruncateOffset, +}; use tokio::sync::mpsc::{ channel, unbounded_channel, Receiver, Sender, UnboundedReceiver, UnboundedSender, }; use tokio::sync::oneshot; -use crate::common::log_store::in_mem::LogReaderEpochProgress::{AwaitingTruncate, Consuming}; -use crate::common::log_store::{ - LogReader, LogStoreError, LogStoreFactory, LogStoreReadItem, LogStoreResult, LogWriter, - TruncateOffset, -}; +use crate::common::log_store_impl::in_mem::LogReaderEpochProgress::{AwaitingTruncate, Consuming}; enum InMemLogStoreItem { StreamChunk(StreamChunk), @@ -193,10 +192,9 @@ impl LogReader for BoundedInMemLogStoreReader { }, AwaitingTruncate { .. 
} => Err(anyhow!( "should not call next_item on checkpoint barrier for in-mem log store" - ) - .into()), + )), }, - None => Err(LogStoreError::EndOfLogStream), + None => Err(anyhow!("end of log stream")), } } @@ -207,8 +205,7 @@ impl LogReader for BoundedInMemLogStoreReader { "truncate offset {:?} but prev truncate offset is {:?}", offset, self.truncate_offset - ) - .into()); + )); } // check the truncate offset does not exceed the latest possible offset @@ -217,8 +214,7 @@ impl LogReader for BoundedInMemLogStoreReader { "truncate at {:?} but latest offset is {:?}", offset, self.latest_offset - ) - .into()); + )); } if let AwaitingTruncate { @@ -288,11 +284,10 @@ impl LogWriter for BoundedInMemLogStoreWriter { } async fn update_vnode_bitmap(&mut self, new_vnodes: Arc) -> LogStoreResult<()> { - Ok(self - .item_tx + self.item_tx .send(InMemLogStoreItem::UpdateVnodeBitmap(new_vnodes)) .await - .map_err(|_| anyhow!("unable to send vnode bitmap"))?) + .map_err(|_| anyhow!("unable to send vnode bitmap")) } } @@ -305,11 +300,11 @@ mod tests { use risingwave_common::array::Op; use risingwave_common::types::{DataType, ScalarImpl}; use risingwave_common::util::epoch::EpochPair; - - use crate::common::log_store::in_mem::BoundedInMemLogStoreFactory; - use crate::common::log_store::{ + use risingwave_connector::sink::log_store::{ LogReader, LogStoreFactory, LogStoreReadItem, LogWriter, TruncateOffset, }; + + use crate::common::log_store_impl::in_mem::BoundedInMemLogStoreFactory; use crate::common::StreamChunkBuilder; #[tokio::test] diff --git a/src/stream/src/common/log_store/kv_log_store/buffer.rs b/src/stream/src/common/log_store_impl/kv_log_store/buffer.rs similarity index 89% rename from src/stream/src/common/log_store/kv_log_store/buffer.rs rename to src/stream/src/common/log_store_impl/kv_log_store/buffer.rs index b478123e6d9cb..ed1c495c81d75 100644 --- a/src/stream/src/common/log_store/kv_log_store/buffer.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/buffer.rs @@ -19,10 +19,10 @@ use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; use risingwave_common::array::StreamChunk; use risingwave_common::buffer::Bitmap; +use risingwave_connector::sink::log_store::{ChunkId, LogStoreResult, TruncateOffset}; use tokio::sync::{oneshot, Notify}; -use crate::common::log_store::kv_log_store::{ReaderTruncationOffsetType, SeqIdType}; -use crate::common::log_store::{ChunkId, LogStoreResult, TruncateOffset}; +use crate::common::log_store_impl::kv_log_store::{ReaderTruncationOffsetType, SeqIdType}; #[derive(Clone)] pub(crate) enum LogStoreBufferItem { @@ -54,23 +54,26 @@ struct LogStoreBufferInner { unconsumed_queue: VecDeque<(u64, LogStoreBufferItem)>, /// Items already read by log reader by not truncated. Newer item at the front consumed_queue: VecDeque<(u64, LogStoreBufferItem)>, - stream_chunk_count: usize, - max_stream_chunk_count: usize, + row_count: usize, + max_row_count: usize, - updated_truncation: Option, + truncation_list: VecDeque, next_chunk_id: ChunkId, } impl LogStoreBufferInner { fn can_add_stream_chunk(&self) -> bool { - self.stream_chunk_count < self.max_stream_chunk_count + self.row_count < self.max_row_count } fn add_item(&mut self, epoch: u64, item: LogStoreBufferItem) { if let LogStoreBufferItem::StreamChunk { .. } = item { unreachable!("StreamChunk should call try_add_item") } + if let LogStoreBufferItem::Barrier { .. 
} = &item { + self.next_chunk_id = 0; + } self.unconsumed_queue.push_front((epoch, item)); } @@ -86,7 +89,7 @@ impl LogStoreBufferInner { } else { let chunk_id = self.next_chunk_id; self.next_chunk_id += 1; - self.stream_chunk_count += 1; + self.row_count += chunk.cardinality(); self.unconsumed_queue.push_front(( epoch, LogStoreBufferItem::StreamChunk { @@ -244,8 +247,13 @@ impl LogStoreBufferSender { self.update_notify.notify_waiters(); } - pub(crate) fn pop_truncation(&self) -> Option { - self.buffer.inner().updated_truncation.take() + pub(crate) fn pop_truncation(&self, curr_epoch: u64) -> Option { + let mut inner = self.buffer.inner(); + let mut ret = None; + while let Some((epoch, _)) = inner.truncation_list.front() && *epoch < curr_epoch { + ret = inner.truncation_list.pop_front(); + } + ret } pub(crate) fn flush_all_unflushed( @@ -321,6 +329,7 @@ impl LogStoreBufferReceiver { chunk_id, flushed, end_seq_id, + chunk, .. } => { let chunk_offset = TruncateOffset::Chunk { @@ -330,7 +339,7 @@ impl LogStoreBufferReceiver { let flushed = *flushed; let end_seq_id = *end_seq_id; if chunk_offset <= offset { - inner.stream_chunk_count -= 1; + inner.row_count -= chunk.cardinality(); inner.consumed_queue.pop_back(); if flushed { latest_offset = Some((epoch, Some(end_seq_id))); @@ -370,21 +379,25 @@ impl LogStoreBufferReceiver { } } } - if let Some(offset) = latest_offset { - inner.updated_truncation = Some(offset); + if let Some((epoch, seq_id)) = latest_offset { + if let Some((prev_epoch, ref mut prev_seq_id)) = inner.truncation_list.back_mut() && *prev_epoch == epoch { + *prev_seq_id = seq_id; + } else { + inner.truncation_list.push_back((epoch, seq_id)); + } } } } pub(crate) fn new_log_store_buffer( - max_stream_chunk_count: usize, + max_row_count: usize, ) -> (LogStoreBufferSender, LogStoreBufferReceiver) { let buffer = SharedMutex::new(LogStoreBufferInner { unconsumed_queue: VecDeque::new(), consumed_queue: VecDeque::new(), - stream_chunk_count: 0, - max_stream_chunk_count, - updated_truncation: None, + row_count: 0, + max_row_count, + truncation_list: VecDeque::new(), next_chunk_id: 0, }); let update_notify = Arc::new(Notify::new()); diff --git a/src/stream/src/common/log_store_impl/kv_log_store/mod.rs b/src/stream/src/common/log_store_impl/kv_log_store/mod.rs new file mode 100644 index 0000000000000..4256da4ca9325 --- /dev/null +++ b/src/stream/src/common/log_store_impl/kv_log_store/mod.rs @@ -0,0 +1,975 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
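Editor's note: the buffer changes above replace the single `updated_truncation` slot with a `truncation_list`, and `pop_truncation(curr_epoch)` now yields only the newest truncation offset whose epoch is strictly below the epoch currently being flushed. The standalone sketch below illustrates that intended behavior under those assumptions; `Offset` and the free function `pop_truncation` are illustrative stand-ins, not the actual module API.

use std::collections::VecDeque;

// (epoch, optional seq id), mirroring the shape of `ReaderTruncationOffsetType`.
type Offset = (u64, Option<i32>);

// Pop every entry older than `curr_epoch`; the last one popped is the offset to seal.
fn pop_truncation(list: &mut VecDeque<Offset>, curr_epoch: u64) -> Option<Offset> {
    let mut ret = None;
    while list.front().map_or(false, |(epoch, _)| *epoch < curr_epoch) {
        ret = list.pop_front();
    }
    ret
}

fn main() {
    let mut list: VecDeque<Offset> = VecDeque::from([(1, Some(5)), (2, None)]);
    // Flushing epoch 3: both queued entries are below 3, so the newer one (epoch 2) wins.
    assert_eq!(pop_truncation(&mut list, 3), Some((2, None)));
    assert!(list.is_empty());
    // Flushing epoch 2 instead would have returned only the epoch-1 entry and kept epoch 2 queued.
}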
+ +use std::sync::Arc; + +use risingwave_common::buffer::Bitmap; +use risingwave_common::catalog::{TableId, TableOption}; +use risingwave_common::metrics::LabelGuardedIntCounter; +use risingwave_connector::sink::log_store::LogStoreFactory; +use risingwave_connector::sink::{SinkParam, SinkWriterParam}; +use risingwave_pb::catalog::Table; +use risingwave_storage::store::NewLocalOptions; +use risingwave_storage::StateStore; + +use crate::common::log_store_impl::kv_log_store::buffer::new_log_store_buffer; +use crate::common::log_store_impl::kv_log_store::reader::KvLogStoreReader; +use crate::common::log_store_impl::kv_log_store::serde::LogStoreRowSerde; +use crate::common::log_store_impl::kv_log_store::writer::KvLogStoreWriter; +use crate::executor::monitor::StreamingMetrics; + +mod buffer; +mod reader; +mod serde; +#[cfg(test)] +mod test_utils; +mod writer; + +type SeqIdType = i32; +type RowOpCodeType = i16; + +const FIRST_SEQ_ID: SeqIdType = 0; + +/// Readers truncate the offset at the granularity of seq id. +/// None `SeqIdType` means that the whole epoch is truncated. +type ReaderTruncationOffsetType = (u64, Option); + +#[derive(Clone)] +pub(crate) struct KvLogStoreReadMetrics { + pub storage_read_count: LabelGuardedIntCounter<4>, + pub storage_read_size: LabelGuardedIntCounter<4>, +} + +impl KvLogStoreReadMetrics { + #[cfg(test)] + pub(crate) fn for_test() -> Self { + Self { + storage_read_count: LabelGuardedIntCounter::test_int_counter(), + storage_read_size: LabelGuardedIntCounter::test_int_counter(), + } + } +} + +#[derive(Clone)] +pub(crate) struct KvLogStoreMetrics { + pub storage_write_count: LabelGuardedIntCounter<3>, + pub storage_write_size: LabelGuardedIntCounter<3>, + pub persistent_log_read_metrics: KvLogStoreReadMetrics, + pub flushed_buffer_read_metrics: KvLogStoreReadMetrics, +} + +impl KvLogStoreMetrics { + pub(crate) fn new( + metrics: &StreamingMetrics, + writer_param: &SinkWriterParam, + sink_param: &SinkParam, + connector: &'static str, + ) -> Self { + let executor_id = format!("{}", writer_param.executor_id); + let sink_id = format!("{}", sink_param.sink_id.sink_id); + let storage_write_size = metrics.kv_log_store_storage_write_size.with_label_values(&[ + executor_id.as_str(), + connector, + sink_id.as_str(), + ]); + let storage_write_count = metrics + .kv_log_store_storage_write_count + .with_label_values(&[executor_id.as_str(), connector, sink_id.as_str()]); + + const READ_PERSISTENT_LOG: &str = "persistent_log"; + const READ_FLUSHED_BUFFER: &str = "flushed_buffer"; + + let persistent_log_read_size = metrics.kv_log_store_storage_read_size.with_label_values(&[ + executor_id.as_str(), + connector, + sink_id.as_str(), + READ_PERSISTENT_LOG, + ]); + let persistent_log_read_count = + metrics.kv_log_store_storage_read_count.with_label_values(&[ + executor_id.as_str(), + connector, + sink_id.as_str(), + READ_PERSISTENT_LOG, + ]); + + let flushed_buffer_read_size = metrics.kv_log_store_storage_read_size.with_label_values(&[ + executor_id.as_str(), + connector, + sink_id.as_str(), + READ_FLUSHED_BUFFER, + ]); + let flushed_buffer_read_count = + metrics.kv_log_store_storage_read_count.with_label_values(&[ + executor_id.as_str(), + connector, + sink_id.as_str(), + READ_FLUSHED_BUFFER, + ]); + + Self { + storage_write_size, + storage_write_count, + persistent_log_read_metrics: KvLogStoreReadMetrics { + storage_read_size: persistent_log_read_size, + storage_read_count: persistent_log_read_count, + }, + flushed_buffer_read_metrics: KvLogStoreReadMetrics { + storage_read_count: 
flushed_buffer_read_count, + storage_read_size: flushed_buffer_read_size, + }, + } + } + + #[cfg(test)] + fn for_test() -> Self { + KvLogStoreMetrics { + storage_write_count: LabelGuardedIntCounter::test_int_counter(), + storage_write_size: LabelGuardedIntCounter::test_int_counter(), + persistent_log_read_metrics: KvLogStoreReadMetrics::for_test(), + flushed_buffer_read_metrics: KvLogStoreReadMetrics::for_test(), + } + } +} + +pub(crate) struct FlushInfo { + pub(crate) flush_size: usize, + pub(crate) flush_count: usize, +} + +impl FlushInfo { + pub(crate) fn new() -> Self { + FlushInfo { + flush_count: 0, + flush_size: 0, + } + } + + pub(crate) fn flush_one(&mut self, size: usize) { + self.flush_size += size; + self.flush_count += 1; + } + + pub(crate) fn report(self, metrics: &KvLogStoreMetrics) { + metrics.storage_write_count.inc_by(self.flush_count as _); + metrics.storage_write_size.inc_by(self.flush_size as _); + } +} + +pub struct KvLogStoreFactory { + state_store: S, + + table_catalog: Table, + + vnodes: Option>, + + max_row_count: usize, + + metrics: KvLogStoreMetrics, +} + +impl KvLogStoreFactory { + pub(crate) fn new( + state_store: S, + table_catalog: Table, + vnodes: Option>, + max_row_count: usize, + metrics: KvLogStoreMetrics, + ) -> Self { + Self { + state_store, + table_catalog, + vnodes, + max_row_count, + metrics, + } + } +} + +impl LogStoreFactory for KvLogStoreFactory { + type Reader = KvLogStoreReader; + type Writer = KvLogStoreWriter; + + async fn build(self) -> (Self::Reader, Self::Writer) { + let table_id = TableId::new(self.table_catalog.id); + let serde = LogStoreRowSerde::new(&self.table_catalog, self.vnodes); + let local_state_store = self + .state_store + .new_local(NewLocalOptions { + table_id: TableId { + table_id: self.table_catalog.id, + }, + is_consistent_op: false, + table_option: TableOption { + retention_seconds: None, + }, + is_replicated: false, + }) + .await; + + let (tx, rx) = new_log_store_buffer(self.max_row_count); + + let reader = KvLogStoreReader::new( + table_id, + self.state_store, + serde.clone(), + rx, + self.metrics.clone(), + ); + + let writer = KvLogStoreWriter::new(table_id, local_state_store, serde, tx, self.metrics); + + (reader, writer) + } +} + +#[cfg(test)] +mod tests { + use std::future::{poll_fn, Future}; + use std::pin::pin; + use std::sync::Arc; + use std::task::Poll; + + use risingwave_common::buffer::{Bitmap, BitmapBuilder}; + use risingwave_common::hash::VirtualNode; + use risingwave_common::util::epoch::EpochPair; + use risingwave_connector::sink::log_store::{ + LogReader, LogStoreFactory, LogStoreReadItem, LogWriter, TruncateOffset, + }; + use risingwave_hummock_sdk::HummockReadEpoch; + use risingwave_hummock_test::test_utils::prepare_hummock_test_env; + use risingwave_storage::store::SyncResult; + use risingwave_storage::StateStore; + + use crate::common::log_store_impl::kv_log_store::test_utils::{ + calculate_vnode_bitmap, check_rows_eq, check_stream_chunk_eq, + gen_multi_vnode_stream_chunks, gen_stream_chunk, gen_test_log_store_table, TEST_DATA_SIZE, + }; + use crate::common::log_store_impl::kv_log_store::{KvLogStoreFactory, KvLogStoreMetrics}; + + #[tokio::test] + async fn test_basic() { + for count in 0..20 { + test_basic_inner(count * TEST_DATA_SIZE).await + } + } + + async fn test_basic_inner(max_row_count: usize) { + let test_env = prepare_hummock_test_env().await; + + let table = gen_test_log_store_table(); + + test_env.register_table(table.clone()).await; + + let stream_chunk1 = gen_stream_chunk(0); + let 
stream_chunk2 = gen_stream_chunk(10); + let bitmap = calculate_vnode_bitmap(stream_chunk1.rows().chain(stream_chunk2.rows())); + + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(Arc::new(bitmap)), + max_row_count, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + + let epoch1 = test_env + .storage + .get_pinned_version() + .version() + .max_committed_epoch + + 1; + writer + .init(EpochPair::new_test_epoch(epoch1)) + .await + .unwrap(); + writer.write_chunk(stream_chunk1.clone()).await.unwrap(); + let epoch2 = epoch1 + 1; + writer.flush_current_epoch(epoch2, false).await.unwrap(); + writer.write_chunk(stream_chunk2.clone()).await.unwrap(); + let epoch3 = epoch2 + 1; + writer.flush_current_epoch(epoch3, true).await.unwrap(); + + test_env.storage.seal_epoch(epoch1, false); + test_env.storage.seal_epoch(epoch2, true); + let sync_result: SyncResult = test_env.storage.sync(epoch2).await.unwrap(); + assert!(!sync_result.uncommitted_ssts.is_empty()); + + reader.init().await.unwrap(); + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. + }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(!is_checkpoint) + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. + }, + ) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&stream_chunk2, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + } + + #[tokio::test] + async fn test_recovery() { + for count in 0..20 { + test_recovery_inner(count * TEST_DATA_SIZE).await + } + } + + async fn test_recovery_inner(max_row_count: usize) { + let test_env = prepare_hummock_test_env().await; + + let table = gen_test_log_store_table(); + + test_env.register_table(table.clone()).await; + + let stream_chunk1 = gen_stream_chunk(0); + let stream_chunk2 = gen_stream_chunk(10); + let bitmap = calculate_vnode_bitmap(stream_chunk1.rows().chain(stream_chunk2.rows())); + let bitmap = Arc::new(bitmap); + + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(bitmap.clone()), + max_row_count, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + + let epoch1 = test_env + .storage + .get_pinned_version() + .version() + .max_committed_epoch + + 1; + writer + .init(EpochPair::new_test_epoch(epoch1)) + .await + .unwrap(); + writer.write_chunk(stream_chunk1.clone()).await.unwrap(); + let epoch2 = epoch1 + 1; + writer.flush_current_epoch(epoch2, false).await.unwrap(); + writer.write_chunk(stream_chunk2.clone()).await.unwrap(); + let epoch3 = epoch2 + 1; + writer.flush_current_epoch(epoch3, true).await.unwrap(); + + test_env.storage.seal_epoch(epoch1, false); + + reader.init().await.unwrap(); + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. 
+ }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(!is_checkpoint) + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. + }, + ) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&stream_chunk2, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + + test_env.commit_epoch(epoch2).await; + // The truncate does not work because it is after the sync + reader + .truncate(TruncateOffset::Barrier { epoch: epoch2 }) + .await + .unwrap(); + test_env + .storage + .try_wait_epoch(HummockReadEpoch::Committed(epoch2)) + .await + .unwrap(); + + // Recovery + test_env.storage.clear_shared_buffer().await.unwrap(); + + // Rebuild log reader and writer in recovery + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(bitmap), + max_row_count, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + writer + .init(EpochPair::new_test_epoch(epoch3)) + .await + .unwrap(); + reader.init().await.unwrap(); + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. + }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(!is_checkpoint) + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. 
+ }, + ) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&stream_chunk2, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + } + + #[tokio::test] + async fn test_truncate() { + for count in 2..10 { + test_truncate_inner(count).await + } + } + + async fn test_truncate_inner(max_row_count: usize) { + let test_env = prepare_hummock_test_env().await; + + let table = gen_test_log_store_table(); + + test_env.register_table(table.clone()).await; + + let stream_chunk1_1 = gen_stream_chunk(0); + let stream_chunk1_2 = gen_stream_chunk(10); + let stream_chunk2 = gen_stream_chunk(20); + let stream_chunk3 = gen_stream_chunk(20); + let bitmap = calculate_vnode_bitmap( + stream_chunk1_1 + .rows() + .chain(stream_chunk1_2.rows()) + .chain(stream_chunk2.rows()) + .chain(stream_chunk3.rows()), + ); + let bitmap = Arc::new(bitmap); + + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(bitmap.clone()), + max_row_count, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + + let epoch1 = test_env + .storage + .get_pinned_version() + .version() + .max_committed_epoch + + 1; + writer + .init(EpochPair::new_test_epoch(epoch1)) + .await + .unwrap(); + writer.write_chunk(stream_chunk1_1.clone()).await.unwrap(); + writer.write_chunk(stream_chunk1_2.clone()).await.unwrap(); + let epoch2 = epoch1 + 1; + writer.flush_current_epoch(epoch2, true).await.unwrap(); + writer.write_chunk(stream_chunk2.clone()).await.unwrap(); + + test_env.commit_epoch(epoch1).await; + + reader.init().await.unwrap(); + let chunk_id1 = match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + chunk_id, + }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1_1, &read_stream_chunk)); + chunk_id + } + _ => unreachable!(), + }; + let chunk_id2 = match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + chunk_id, + }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1_2, &read_stream_chunk)); + chunk_id + } + _ => unreachable!(), + }; + assert!(chunk_id2 > chunk_id1); + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. 
+ }, + ) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&stream_chunk2, &read_stream_chunk)); + } + _ => unreachable!(), + } + + // The truncate should work because it is before the flush + reader + .truncate(TruncateOffset::Chunk { + epoch: epoch1, + chunk_id: chunk_id1, + }) + .await + .unwrap(); + let epoch3 = epoch2 + 1; + writer.flush_current_epoch(epoch3, true).await.unwrap(); + + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + + // Truncation on epoch1 should work because it is before this sync + test_env.commit_epoch(epoch2).await; + test_env + .storage + .try_wait_epoch(HummockReadEpoch::Committed(epoch2)) + .await + .unwrap(); + + // Recovery + test_env.storage.clear_shared_buffer().await.unwrap(); + + // Rebuild log reader and writer in recovery + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(bitmap), + max_row_count, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + + writer + .init(EpochPair::new_test_epoch(epoch3)) + .await + .unwrap(); + + writer.write_chunk(stream_chunk3.clone()).await.unwrap(); + + reader.init().await.unwrap(); + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. + }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1_2, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. + }, + ) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&stream_chunk2, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. 
+ }, + ) => { + assert_eq!(epoch, epoch3); + assert!(check_stream_chunk_eq(&stream_chunk3, &read_stream_chunk)); + } + _ => unreachable!(), + } + } + + #[tokio::test] + async fn test_update_vnode_recover() { + let test_env = prepare_hummock_test_env().await; + + let table = gen_test_log_store_table(); + + test_env.register_table(table.clone()).await; + + fn build_bitmap(indexes: impl Iterator) -> Arc { + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); + for i in indexes { + builder.set(i, true); + } + Arc::new(builder.finish()) + } + + let vnodes1 = build_bitmap((0..VirtualNode::COUNT).filter(|i| i % 2 == 0)); + let vnodes2 = build_bitmap((0..VirtualNode::COUNT).filter(|i| i % 2 == 1)); + + let factory1 = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(vnodes1), + 10 * TEST_DATA_SIZE, + KvLogStoreMetrics::for_test(), + ); + let factory2 = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(vnodes2), + 10 * TEST_DATA_SIZE, + KvLogStoreMetrics::for_test(), + ); + let (mut reader1, mut writer1) = factory1.build().await; + let (mut reader2, mut writer2) = factory2.build().await; + + let epoch1 = test_env + .storage + .get_pinned_version() + .version() + .max_committed_epoch + + 1; + writer1 + .init(EpochPair::new_test_epoch(epoch1)) + .await + .unwrap(); + writer2 + .init(EpochPair::new_test_epoch(epoch1)) + .await + .unwrap(); + reader1.init().await.unwrap(); + reader2.init().await.unwrap(); + let [chunk1_1, chunk1_2] = gen_multi_vnode_stream_chunks::<2>(0, 100); + writer1.write_chunk(chunk1_1.clone()).await.unwrap(); + writer2.write_chunk(chunk1_2.clone()).await.unwrap(); + let epoch2 = epoch1 + 1; + writer1.flush_current_epoch(epoch2, false).await.unwrap(); + writer2.flush_current_epoch(epoch2, false).await.unwrap(); + let [chunk2_1, chunk2_2] = gen_multi_vnode_stream_chunks::<2>(200, 100); + writer1.write_chunk(chunk2_1.clone()).await.unwrap(); + writer2.write_chunk(chunk2_2.clone()).await.unwrap(); + + match reader1.next_item().await.unwrap() { + (epoch, LogStoreReadItem::StreamChunk { chunk, .. }) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&chunk1_1, &chunk)); + } + _ => unreachable!(), + }; + match reader1.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(!is_checkpoint); + } + _ => unreachable!(), + } + + match reader2.next_item().await.unwrap() { + (epoch, LogStoreReadItem::StreamChunk { chunk, .. }) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&chunk1_2, &chunk)); + } + _ => unreachable!(), + } + match reader2.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(!is_checkpoint); + } + _ => unreachable!(), + } + + // Only reader1 will truncate + reader1 + .truncate(TruncateOffset::Barrier { epoch: epoch1 }) + .await + .unwrap(); + + match reader1.next_item().await.unwrap() { + (epoch, LogStoreReadItem::StreamChunk { chunk, .. }) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&chunk2_1, &chunk)); + } + _ => unreachable!(), + } + match reader2.next_item().await.unwrap() { + (epoch, LogStoreReadItem::StreamChunk { chunk, .. 
}) => { + assert_eq!(epoch, epoch2); + assert!(check_stream_chunk_eq(&chunk2_2, &chunk)); + } + _ => unreachable!(), + } + + let epoch3 = epoch2 + 1; + writer1.flush_current_epoch(epoch3, true).await.unwrap(); + writer2.flush_current_epoch(epoch3, true).await.unwrap(); + + match reader1.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint); + } + _ => unreachable!(), + } + match reader2.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint); + } + _ => unreachable!(), + } + + // Truncation of reader1 on epoch1 should work because it is before this sync + test_env.storage.seal_epoch(epoch1, false); + test_env.commit_epoch(epoch2).await; + test_env + .storage + .try_wait_epoch(HummockReadEpoch::Committed(epoch2)) + .await + .unwrap(); + + // Recovery + test_env.storage.clear_shared_buffer().await.unwrap(); + + let vnodes = build_bitmap(0..VirtualNode::COUNT); + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(vnodes), + 10 * TEST_DATA_SIZE, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + writer.init(EpochPair::new(epoch3, epoch2)).await.unwrap(); + reader.init().await.unwrap(); + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::StreamChunk { chunk, .. }) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&chunk1_2, &chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(!is_checkpoint); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::StreamChunk { chunk, .. }) => { + assert_eq!(epoch, epoch2); + assert!(check_rows_eq( + chunk2_1.rows().chain(chunk2_2.rows()), + chunk.rows() + )); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch2); + assert!(is_checkpoint); + } + _ => unreachable!(), + } + } + + #[tokio::test] + async fn test_cancellation_safe() { + let test_env = prepare_hummock_test_env().await; + + let table = gen_test_log_store_table(); + + test_env.register_table(table.clone()).await; + + let stream_chunk1 = gen_stream_chunk(0); + let stream_chunk2 = gen_stream_chunk(10); + let bitmap = calculate_vnode_bitmap(stream_chunk1.rows().chain(stream_chunk2.rows())); + + let factory = KvLogStoreFactory::new( + test_env.storage.clone(), + table.clone(), + Some(Arc::new(bitmap)), + 0, + KvLogStoreMetrics::for_test(), + ); + let (mut reader, mut writer) = factory.build().await; + + let epoch1 = test_env + .storage + .get_pinned_version() + .version() + .max_committed_epoch + + 1; + writer + .init(EpochPair::new_test_epoch(epoch1)) + .await + .unwrap(); + writer.write_chunk(stream_chunk1.clone()).await.unwrap(); + let epoch2 = epoch1 + 1; + writer.flush_current_epoch(epoch2, true).await.unwrap(); + + reader.init().await.unwrap(); + + { + let mut future = pin!(reader.next_item()); + assert!(poll_fn(|cx| Poll::Ready(future.as_mut().poll(cx))) + .await + .is_pending()); + } + + match reader.next_item().await.unwrap() { + ( + epoch, + LogStoreReadItem::StreamChunk { + chunk: read_stream_chunk, + .. 
+ }, + ) => { + assert_eq!(epoch, epoch1); + assert!(check_stream_chunk_eq(&stream_chunk1, &read_stream_chunk)); + } + _ => unreachable!(), + } + match reader.next_item().await.unwrap() { + (epoch, LogStoreReadItem::Barrier { is_checkpoint }) => { + assert_eq!(epoch, epoch1); + assert!(is_checkpoint) + } + _ => unreachable!(), + } + } +} diff --git a/src/stream/src/common/log_store/kv_log_store/reader.rs b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs similarity index 57% rename from src/stream/src/common/log_store/kv_log_store/reader.rs rename to src/stream/src/common/log_store_impl/kv_log_store/reader.rs index 2d01ab3bc0b9d..cb7fc402168d4 100644 --- a/src/stream/src/common/log_store/kv_log_store/reader.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs @@ -17,24 +17,29 @@ use std::pin::Pin; use anyhow::anyhow; use bytes::Bytes; -use futures::future::try_join_all; +use futures::future::{try_join_all, BoxFuture}; use futures::stream::select_all; +use futures::FutureExt; +use risingwave_common::array::StreamChunk; use risingwave_common::cache::CachePriority; use risingwave_common::catalog::TableId; use risingwave_common::hash::VnodeBitmapExt; +use risingwave_connector::sink::log_store::{ + ChunkId, LogReader, LogStoreReadItem, LogStoreResult, TruncateOffset, +}; use risingwave_hummock_sdk::key::TableKey; use risingwave_storage::hummock::CachePolicy; use risingwave_storage::store::{PrefetchOptions, ReadOptions}; use risingwave_storage::StateStore; use tokio_stream::StreamExt; -use crate::common::log_store::kv_log_store::buffer::{LogStoreBufferItem, LogStoreBufferReceiver}; -use crate::common::log_store::kv_log_store::serde::{ - new_log_store_item_stream, KvLogStoreItem, LogStoreItemStream, LogStoreRowSerde, +use crate::common::log_store_impl::kv_log_store::buffer::{ + LogStoreBufferItem, LogStoreBufferReceiver, }; -use crate::common::log_store::{ - LogReader, LogStoreError, LogStoreReadItem, LogStoreResult, TruncateOffset, +use crate::common::log_store_impl::kv_log_store::serde::{ + merge_log_store_item_stream, KvLogStoreItem, LogStoreItemMergeStream, LogStoreRowSerde, +}; +use crate::common::log_store_impl::kv_log_store::KvLogStoreMetrics; pub struct KvLogStoreReader { table_id: TableId, @@ -49,11 +54,21 @@ pub struct KvLogStoreReader { first_write_epoch: Option, /// `Some` means consuming historical log data - state_store_stream: Option>>>, + state_store_stream: Option>>>, + + /// Store the future that attempts to read a flushed stream chunk. + /// This is for cancellation safety: the future returned by `next_item` may be dropped after it + /// takes a flushed item out of the buffer but before it finishes reading the stream chunk from + /// storage. Therefore we store the future so that it can resume reading the stream chunk the + /// next time `next_item` is called.
+ read_flushed_chunk_future: + Option>>, latest_offset: TruncateOffset, truncate_offset: TruncateOffset, + + metrics: KvLogStoreMetrics, } impl KvLogStoreReader { @@ -62,16 +77,33 @@ impl KvLogStoreReader { state_store: S, serde: LogStoreRowSerde, rx: LogStoreBufferReceiver, + metrics: KvLogStoreMetrics, ) -> Self { Self { table_id, state_store, serde, rx, + read_flushed_chunk_future: None, first_write_epoch: None, state_store_stream: None, latest_offset: TruncateOffset::Barrier { epoch: 0 }, truncate_offset: TruncateOffset::Barrier { epoch: 0 }, + metrics, + } + } + + async fn may_continue_read_flushed_chunk( + &mut self, + ) -> LogStoreResult> { + if let Some(future) = self.read_flushed_chunk_future.as_mut() { + let result = future.await; + self.read_flushed_chunk_future + .take() + .expect("future not None"); + Ok(Some(result?)) + } else { + Ok(None) } } } @@ -106,10 +138,11 @@ impl LogReader for KvLogStoreReader { "should not init twice" ); // TODO: set chunk size by config - self.state_store_stream = Some(Box::pin(new_log_store_item_stream( + self.state_store_stream = Some(Box::pin(merge_log_store_item_stream( streams, self.serde.clone(), 1024, + self.metrics.persistent_log_read_metrics.clone(), ))); Ok(()) } @@ -138,6 +171,22 @@ impl LogReader for KvLogStoreReader { } } + // It is possible that the future gets dropped after it pops a flushed + // item but before it reads a stream chunk. Therefore, we may continue + // driving the future to continue reading the stream chunk. + if let Some((chunk_id, chunk, item_epoch)) = self.may_continue_read_flushed_chunk().await? { + let offset = TruncateOffset::Chunk { + epoch: item_epoch, + chunk_id, + }; + assert!(offset > self.latest_offset); + self.latest_offset = offset; + return Ok(( + item_epoch, + LogStoreReadItem::StreamChunk { chunk, chunk_id }, + )); + } + // Now the historical state store has been consumed. 
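Editor's note: the `read_flushed_chunk_future` field and `may_continue_read_flushed_chunk` helper above follow a common cancellation-safety pattern: keep the in-flight read as a boxed future on the struct so that, if the caller drops `next_item` mid-await, the item already popped from the buffer is not lost and the read resumes on the next call. Below is a minimal self-contained sketch of that pattern under stated assumptions; `Reader`, `pending`, `load`, and `next` are hypothetical names, not the actual API.

use futures::future::BoxFuture;
use futures::FutureExt;

struct Reader {
    // `Some` while a read started by an earlier (possibly cancelled) call is still in flight.
    pending: Option<BoxFuture<'static, u64>>,
}

impl Reader {
    fn load() -> BoxFuture<'static, u64> {
        // Stand-in for the expensive storage read.
        async { 42u64 }.boxed()
    }

    async fn next(&mut self) -> u64 {
        // Start a new read only if no previous read is pending; otherwise resume the stored one.
        if self.pending.is_none() {
            self.pending = Some(Self::load());
        }
        let value = self.pending.as_mut().expect("just set").await;
        // Clear the slot only after the future has completed.
        self.pending = None;
        value
    }
}

If the outer `next()` future is dropped at the await point, `pending` stays `Some`, so the next call polls the same stored read to completion instead of starting over and losing the popped item.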
let (item_epoch, item) = self.rx.next_item().await; self.latest_offset.check_next_item_epoch(item_epoch)?; @@ -162,40 +211,78 @@ impl LogReader for KvLogStoreReader { end_seq_id, chunk_id, } => { - let streams = try_join_all(vnode_bitmap.iter_vnodes().map(|vnode| { - let range_start = - self.serde - .serialize_log_store_pk(vnode, item_epoch, Some(start_seq_id)); - let range_end = - self.serde - .serialize_log_store_pk(vnode, item_epoch, Some(end_seq_id)); + let read_flushed_chunk_future = { + let serde = self.serde.clone(); let state_store = self.state_store.clone(); let table_id = self.table_id; - // Use u64::MAX here because the epoch to consume may be below the safe - // epoch + let read_metrics = self.metrics.flushed_buffer_read_metrics.clone(); async move { - Ok::<_, LogStoreError>(Box::pin( - state_store - .iter( - (Included(range_start), Included(range_end)), - u64::MAX, - ReadOptions { - prefetch_options: PrefetchOptions::new_for_exhaust_iter(), - cache_policy: CachePolicy::Fill(CachePriority::Low), - table_id, - ..Default::default() - }, - ) - .await?, - )) + let streams = try_join_all(vnode_bitmap.iter_vnodes().map(|vnode| { + let range_start = + serde.serialize_log_store_pk(vnode, item_epoch, Some(start_seq_id)); + let range_end = + serde.serialize_log_store_pk(vnode, item_epoch, Some(end_seq_id)); + let state_store = &state_store; + + // Use u64::MAX here because the epoch to consume may be below the safe + // epoch + async move { + Ok::<_, anyhow::Error>(Box::pin( + state_store + .iter( + (Included(range_start), Included(range_end)), + u64::MAX, + ReadOptions { + prefetch_options: + PrefetchOptions::new_for_exhaust_iter(), + cache_policy: CachePolicy::Fill(CachePriority::Low), + table_id, + ..Default::default() + }, + ) + .await?, + )) + } + })) + .await?; + let combined_stream = select_all(streams); + + let chunk = serde + .deserialize_stream_chunk( + combined_stream, + start_seq_id, + end_seq_id, + item_epoch, + &read_metrics, + ) + .await?; + + Ok((chunk_id, chunk, item_epoch)) } - })) - .await?; - let combined_stream = select_all(streams); - let chunk = self - .serde - .deserialize_stream_chunk(combined_stream, start_seq_id, end_seq_id, item_epoch) - .await?; + .boxed() + }; + + // Store the future in case that in the subsequent pending await point, + // the future is cancelled, and we lose an flushed item. + assert!(self + .read_flushed_chunk_future + .replace(read_flushed_chunk_future) + .is_none()); + + // for cancellation test + #[cfg(test)] + { + use std::time::Duration; + + use tokio::time::sleep; + sleep(Duration::from_secs(1)).await; + } + + let (_, chunk, _) = self + .may_continue_read_flushed_chunk() + .await? + .expect("future just insert. 
unlikely to be none"); + let offset = TruncateOffset::Chunk { epoch: item_epoch, chunk_id, @@ -233,16 +320,14 @@ impl LogReader for KvLogStoreReader { "truncate at a later offset {:?} than the current latest offset {:?}", offset, self.latest_offset - ) - .into()); + )); } if offset <= self.truncate_offset { return Err(anyhow!( "truncate offset {:?} earlier than prev truncate offset {:?}", offset, self.truncate_offset - ) - .into()); + )); } if offset.epoch() >= self.first_write_epoch.expect("should have init") { self.rx.truncate(offset); diff --git a/src/stream/src/common/log_store/kv_log_store/serde.rs b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs similarity index 67% rename from src/stream/src/common/log_store/kv_log_store/serde.rs rename to src/stream/src/common/log_store_impl/kv_log_store/serde.rs index 627aee6c22f2f..d3102aa936fad 100644 --- a/src/stream/src/common/log_store/kv_log_store/serde.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/serde.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use anyhow::anyhow; use bytes::Bytes; -use futures::stream::{FuturesUnordered, StreamFuture}; +use futures::stream::{FuturesUnordered, Peekable, StreamFuture}; use futures::{pin_mut, Stream, StreamExt, TryStreamExt}; use futures_async_stream::try_stream; use itertools::Itertools; @@ -29,6 +29,7 @@ use risingwave_common::constants::log_store::{ EPOCH_COLUMN_INDEX, EPOCH_COLUMN_TYPE, KV_LOG_STORE_PREDEFINED_COLUMNS, PK_TYPES, ROW_OP_COLUMN_INDEX, SEQ_ID_COLUMN_INDEX, }; +use risingwave_common::estimate_size::EstimateSize; use risingwave_common::hash::VirtualNode; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::types::{DataType, ScalarImpl}; @@ -38,17 +39,18 @@ use risingwave_common::util::sort_util::OrderType; use risingwave_common::util::value_encoding::{ BasicSerde, ValueRowDeserializer, ValueRowSerializer, }; +use risingwave_connector::sink::log_store::LogStoreResult; use risingwave_hummock_sdk::key::{next_key, TableKey}; use risingwave_pb::catalog::Table; +use risingwave_storage::error::StorageError; use risingwave_storage::row_serde::row_serde_util::serialize_pk_with_vnode; use risingwave_storage::row_serde::value_serde::ValueRowSerdeNew; use risingwave_storage::store::StateStoreReadIterStream; use risingwave_storage::table::{compute_vnode, Distribution}; -use crate::common::log_store::kv_log_store::{ - ReaderTruncationOffsetType, RowOpCodeType, SeqIdType, +use crate::common::log_store_impl::kv_log_store::{ + KvLogStoreReadMetrics, ReaderTruncationOffsetType, RowOpCodeType, SeqIdType, }; -use crate::common::log_store::{LogStoreError, LogStoreResult}; const INSERT_OP_CODE: RowOpCodeType = 1; const DELETE_OP_CODE: RowOpCodeType = 2; @@ -57,6 +59,32 @@ const UPDATE_DELETE_OP_CODE: RowOpCodeType = 4; const BARRIER_OP_CODE: RowOpCodeType = 5; const CHECKPOINT_BARRIER_OP_CODE: RowOpCodeType = 6; +struct ReadInfo { + read_size: usize, + read_count: usize, +} + +impl ReadInfo { + fn new() -> Self { + Self { + read_count: 0, + read_size: 0, + } + } + + fn read_one_row(&mut self, size: usize) { + self.read_count += 1; + self.read_size += size; + } + + fn report(&mut self, metrics: &KvLogStoreReadMetrics) { + metrics.storage_read_size.inc_by(self.read_size as _); + metrics.storage_read_count.inc_by(self.read_count as _); + self.read_size = 0; + self.read_count = 0; + } +} + #[derive(Eq, PartialEq, Debug)] enum LogStoreRowOp { Row { op: Op, row: OwnedRow }, @@ -303,46 +331,49 @@ impl LogStoreRowSerde { start_seq_id: SeqIdType, end_seq_id: SeqIdType, 
expected_epoch: u64, + metrics: &KvLogStoreReadMetrics, ) -> LogStoreResult { pin_mut!(stream); let size_bound = (end_seq_id - start_seq_id + 1) as usize; let mut data_chunk_builder = DataChunkBuilder::new(self.payload_schema.clone(), size_bound + 1); let mut ops = Vec::with_capacity(size_bound); - while let Some((_, value)) = stream.try_next().await? { + let mut read_info = ReadInfo::new(); + while let Some((key, value)) = stream.try_next().await? { + read_info + .read_one_row(key.user_key.table_key.estimated_size() + value.estimated_size()); match self.deserialize(value)? { (epoch, LogStoreRowOp::Row { op, row }) => { if epoch != expected_epoch { - return Err(LogStoreError::Internal(anyhow!( + return Err(anyhow!( "decoded epoch {} not match expected epoch {}", epoch, expected_epoch - ))); + )); } ops.push(op); if ops.len() > size_bound { - return Err(LogStoreError::Internal(anyhow!( + return Err(anyhow!( "row count {} exceed size bound {}", ops.len(), size_bound - ))); + )); } assert!(data_chunk_builder.append_one_row(row).is_none()); } (_, LogStoreRowOp::Barrier { .. }) => { - return Err(LogStoreError::Internal(anyhow!( - "should not get barrier when decoding stream chunk" - ))); + return Err(anyhow!("should not get barrier when decoding stream chunk")); } } } if ops.is_empty() { - return Err(LogStoreError::Internal(anyhow!( + return Err(anyhow!( "should not get empty row when decoding stream chunk. start seq id: {}, end seq id {}", start_seq_id, - end_seq_id)) + end_seq_id) ); } + read_info.report(metrics); Ok(StreamChunk::from_parts( ops, data_chunk_builder @@ -373,59 +404,41 @@ pub(crate) enum KvLogStoreItem { Barrier { is_checkpoint: bool }, } +type BoxPeekableLogStoreItemStream = Pin>>>; + struct LogStoreRowOpStream { serde: LogStoreRowSerde, /// Streams that have not reached a barrier - row_streams: FuturesUnordered>>>, + row_streams: FuturesUnordered>>, /// Streams that have reached a barrier - barrier_streams: Vec>>, + barrier_streams: Vec>, + + not_started_streams: Vec<(u64, BoxPeekableLogStoreItemStream)>, stream_state: StreamState, + + metrics: KvLogStoreReadMetrics, } impl LogStoreRowOpStream { - pub(crate) fn new(streams: Vec, serde: LogStoreRowSerde) -> Self { + pub(crate) fn new( + streams: Vec, + serde: LogStoreRowSerde, + metrics: KvLogStoreReadMetrics, + ) -> Self { assert!(!streams.is_empty()); Self { - serde, - barrier_streams: Vec::with_capacity(streams.len()), - row_streams: streams + serde: serde.clone(), + barrier_streams: streams .into_iter() - .map(|s| Box::pin(s).into_future()) + .map(|s| Box::pin(deserialize_stream(s, serde.clone()).peekable())) .collect(), + row_streams: FuturesUnordered::new(), + not_started_streams: Vec::new(), stream_state: StreamState::Uninitialized, - } - } - - fn check_epoch(&self, epoch: u64) -> LogStoreResult<()> { - match &self.stream_state { - StreamState::Uninitialized => Ok(()), - StreamState::AllConsumingRow { curr_epoch } - | StreamState::BarrierAligning { curr_epoch, .. 
} => { - if *curr_epoch != epoch { - Err(LogStoreError::Internal(anyhow!( - "epoch {} does not match with current epoch {}", - epoch, - curr_epoch - ))) - } else { - Ok(()) - } - } - - StreamState::BarrierEmitted { prev_epoch } => { - if *prev_epoch >= epoch { - Err(LogStoreError::Internal(anyhow!( - "epoch {} should be greater than prev epoch {}", - epoch, - prev_epoch - ))) - } else { - Ok(()) - } - } + metrics, } } @@ -438,32 +451,40 @@ impl LogStoreRowOpStream { if is_checkpoint == *curr_is_checkpoint { Ok(()) } else { - Err(LogStoreError::Internal(anyhow!( + Err(anyhow!( "current aligning barrier is_checkpoint: {}, current barrier is_checkpoint {}", curr_is_checkpoint, is_checkpoint - ))) + )) } } else { Ok(()) } } - #[try_stream(ok = (u64, KvLogStoreItem), error = LogStoreError)] - async fn into_log_store_item_stream(self, chunk_size: usize) { + #[try_stream(ok = (u64, KvLogStoreItem), error = anyhow::Error)] + async fn into_log_store_item_stream(mut self, chunk_size: usize) { let mut ops = Vec::with_capacity(chunk_size); let mut data_chunk_builder = DataChunkBuilder::new(self.serde.payload_schema.clone(), chunk_size); + if !self.init().await? { + // no data in all stream + return Ok(()); + } + let this = self; pin_mut!(this); - while let Some((epoch, row_op)) = this.next_op().await? { + while let Some((epoch, row_op, row_read_size)) = this.next_op().await? { + let mut read_info = ReadInfo::new(); + read_info.read_one_row(row_read_size); match row_op { LogStoreRowOp::Row { op, row } => { ops.push(op); if let Some(chunk) = data_chunk_builder.append_one_row(row) { let ops = replace(&mut ops, Vec::with_capacity(chunk_size)); + read_info.report(&this.metrics); yield ( epoch, KvLogStoreItem::StreamChunk(StreamChunk::from_parts(ops, chunk)), @@ -471,6 +492,7 @@ impl LogStoreRowOpStream { } } LogStoreRowOp::Barrier { is_checkpoint } => { + read_info.report(&this.metrics); if let Some(chunk) = data_chunk_builder.consume_all() { let ops = replace(&mut ops, Vec::with_capacity(chunk_size)); yield ( @@ -485,39 +507,157 @@ impl LogStoreRowOpStream { } } -pub(crate) type LogStoreItemStream = impl Stream>; -pub(crate) fn new_log_store_item_stream( +pub(crate) type LogStoreItemMergeStream = + impl Stream>; +pub(crate) fn merge_log_store_item_stream( streams: Vec, serde: LogStoreRowSerde, chunk_size: usize, + metrics: KvLogStoreReadMetrics, +) -> LogStoreItemMergeStream { + LogStoreRowOpStream::new(streams, serde, metrics).into_log_store_item_stream(chunk_size) +} + +type LogStoreItemStream = + impl Stream> + Send; +fn deserialize_stream( + stream: S, + serde: LogStoreRowSerde, ) -> LogStoreItemStream { - LogStoreRowOpStream::new(streams, serde).into_log_store_item_stream(chunk_size) + stream.map( + move |result: Result<_, StorageError>| -> LogStoreResult<(u64, LogStoreRowOp, usize)> { + match result { + Ok((key, value)) => { + let read_size = + key.user_key.table_key.estimated_size() + value.estimated_size(); + let (epoch, op) = serde.deserialize(value)?; + Ok((epoch, op, read_size)) + } + Err(e) => Err(e.into()), + } + }, + ) } impl LogStoreRowOpStream { - async fn next_op(&mut self) -> LogStoreResult> { - assert!(!self.row_streams.is_empty()); + // Return Ok(false) means all streams have reach the end. 
+ async fn init(&mut self) -> LogStoreResult { + match &self.stream_state { + StreamState::Uninitialized => {} + _ => unreachable!("cannot call init for twice"), + }; + + // before init, all streams are in `barrier_streams` + assert!( + self.row_streams.is_empty(), + "when uninitialized, row_streams should be empty" + ); + assert!(self.not_started_streams.is_empty()); + assert!(!self.barrier_streams.is_empty()); + + for mut stream in self.barrier_streams.drain(..) { + match stream.as_mut().peek().await { + Some(Ok((epoch, _, _))) => { + self.not_started_streams.push((*epoch, stream)); + } + Some(Err(_)) => match stream.next().await { + Some(Err(e)) => { + return Err(e); + } + _ => unreachable!("on peek we have checked it's Some(Err(_))"), + }, + None => { + continue; + } + } + } + + if self.not_started_streams.is_empty() { + // No stream has data + return Ok(false); + } + + // sorted by epoch descending. Earlier epoch at the end + self.not_started_streams + .sort_by_key(|(epoch, _)| u64::MAX - *epoch); + + let (epoch, stream) = self + .not_started_streams + .pop() + .expect("have check non-empty"); + self.row_streams.push(stream.into_future()); + while let Some((stream_epoch, _)) = self.not_started_streams.last() && *stream_epoch == epoch { + let (_, stream) = self.not_started_streams.pop().expect("should not be empty"); + self.row_streams.push(stream.into_future()); + } + self.stream_state = StreamState::AllConsumingRow { curr_epoch: epoch }; + Ok(true) + } + + fn may_init_epoch(&mut self, epoch: u64) -> LogStoreResult<()> { + let prev_epoch = match &self.stream_state { + StreamState::Uninitialized => unreachable!("should have init"), + StreamState::BarrierEmitted { prev_epoch } => *prev_epoch, + StreamState::AllConsumingRow { curr_epoch } + | StreamState::BarrierAligning { curr_epoch, .. } => { + return if *curr_epoch != epoch { + Err(anyhow!( + "epoch {} does not match with current epoch {}", + epoch, + curr_epoch + )) + } else { + Ok(()) + }; + } + }; + + if prev_epoch >= epoch { + return Err(anyhow!( + "epoch {} should be greater than prev epoch {}", + epoch, + prev_epoch + )); + } + + while let Some((stream_epoch, _)) = self.not_started_streams.last() { + if *stream_epoch > epoch { + // Current epoch has not reached the first epoch of + // the stream. Later streams must also have greater epoch, so break here. + break; + } + if *stream_epoch < epoch { + return Err(anyhow!( + "current epoch {} has exceed epoch {} of stream not started", + epoch, + stream_epoch + )); + } + let (_, stream) = self.not_started_streams.pop().expect("should not be empty"); + self.row_streams.push(stream.into_future()); + } + + self.stream_state = StreamState::AllConsumingRow { curr_epoch: epoch }; + Ok(()) + } + + async fn next_op(&mut self) -> LogStoreResult> { while let (Some(result), stream) = self .row_streams .next() .await .expect("row stream should not be empty when polled") { - let (_key, value): (_, Bytes) = result?; - let (decoded_epoch, op) = self.serde.deserialize(value)?; - self.check_epoch(decoded_epoch)?; + let (decoded_epoch, op, read_size) = result?; + self.may_init_epoch(decoded_epoch)?; match op { LogStoreRowOp::Row { op, row } => { - match &self.stream_state { - StreamState::Uninitialized | StreamState::BarrierEmitted { .. 
} => { - self.stream_state = StreamState::AllConsumingRow { - curr_epoch: decoded_epoch, - } - } - _ => {} - }; self.row_streams.push(stream.into_future()); - return Ok(Some((decoded_epoch, LogStoreRowOp::Row { op, row }))); + return Ok(Some(( + decoded_epoch, + LogStoreRowOp::Row { op, row }, + read_size, + ))); } LogStoreRowOp::Barrier { is_checkpoint } => { self.check_is_checkpoint(is_checkpoint)?; @@ -534,6 +674,7 @@ impl LogStoreRowOpStream { return Ok(Some(( decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }, + read_size, ))); } else { self.stream_state = StreamState::BarrierAligning { @@ -547,11 +688,11 @@ impl LogStoreRowOpStream { } // End of stream match &self.stream_state { - StreamState::BarrierEmitted { .. } | StreamState::Uninitialized => {}, - s => return Err(LogStoreError::Internal( + StreamState::BarrierEmitted { .. } => {}, + s => return Err( anyhow!( "when any of the stream reaches the end, it should be right after emitting an barrier. Current state: {:?}", - s) + s ) ), } @@ -559,11 +700,16 @@ impl LogStoreRowOpStream { self.barrier_streams.is_empty(), "should not have any pending barrier received stream after barrier emit" ); + if !self.not_started_streams.is_empty() { + return Err(anyhow!( + "a stream has reached the end but some other stream has not started yet" + )); + } if cfg!(debug_assertion) { while let Some((opt, _stream)) = self.row_streams.next().await { if let Some(result) = opt { - return Err(LogStoreError::Internal( - anyhow!("when any of the stream reaches the end, other stream should also reaches the end, but poll result: {:?}", result)) + return Err( + anyhow!("when any of the stream reaches the end, other stream should also reaches the end, but poll result: {:?}", result) ); } } @@ -574,15 +720,20 @@ impl LogStoreRowOpStream { #[cfg(test)] mod tests { + use std::cmp::min; use std::future::poll_fn; + use std::sync::Arc; use std::task::Poll; + use bytes::Bytes; use futures::stream::empty; use futures::{pin_mut, stream, StreamExt, TryStreamExt}; use itertools::Itertools; use rand::prelude::SliceRandom; use rand::thread_rng; use risingwave_common::array::{Op, StreamChunk}; + use risingwave_common::buffer::Bitmap; + use risingwave_common::hash::VirtualNode; use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::DataType; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; @@ -592,23 +743,24 @@ mod tests { use tokio::sync::oneshot; use tokio::sync::oneshot::Sender; - use crate::common::log_store::kv_log_store::serde::{ - new_log_store_item_stream, KvLogStoreItem, LogStoreRowOp, LogStoreRowOpStream, + use crate::common::log_store_impl::kv_log_store::serde::{ + merge_log_store_item_stream, KvLogStoreItem, LogStoreRowOp, LogStoreRowOpStream, LogStoreRowSerde, }; - use crate::common::log_store::kv_log_store::test_utils::{ - gen_test_data, gen_test_log_store_table, TEST_TABLE_ID, + use crate::common::log_store_impl::kv_log_store::test_utils::{ + check_rows_eq, gen_test_data, gen_test_log_store_table, TEST_TABLE_ID, }; - use crate::common::log_store::kv_log_store::SeqIdType; + use crate::common::log_store_impl::kv_log_store::{KvLogStoreReadMetrics, SeqIdType}; - const EPOCH1: u64 = 233; + const EPOCH0: u64 = 233; + const EPOCH1: u64 = EPOCH0 + 1; const EPOCH2: u64 = EPOCH1 + 1; #[test] fn test_serde() { let table = gen_test_log_store_table(); - let serde = LogStoreRowSerde::new(&table, None); + let serde = LogStoreRowSerde::new(&table, Some(Arc::new(Bitmap::ones(VirtualNode::COUNT)))); let (ops, rows) = gen_test_data(0); @@ 
-624,12 +776,17 @@ mod tests { let mut serialized_keys = vec![]; let mut seq_id = 1; - let delete_range_right1 = - serde.serialize_truncation_offset_watermark(DEFAULT_VNODE, (epoch, None)); + fn remove_vnode_prefix(key: &Bytes) -> Bytes { + key.slice(VirtualNode::SIZE..) + } + let delete_range_right1 = remove_vnode_prefix( + &serde.serialize_truncation_offset_watermark(DEFAULT_VNODE, (epoch, None)), + ); for (op, row) in stream_chunk.rows() { let (_, key, value) = serde.serialize_data_row(epoch, seq_id, op, row); - assert!(key.as_ref() < delete_range_right1); + let key = remove_vnode_prefix(&key.0); + assert!(key < delete_range_right1); serialized_keys.push(key); let (decoded_epoch, row_op) = serde.deserialize(value).unwrap(); assert_eq!(decoded_epoch, epoch); @@ -647,6 +804,7 @@ mod tests { } let (key, encoded_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, false); + let key = remove_vnode_prefix(&key.0); match serde.deserialize(encoded_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { assert!(!is_checkpoint); @@ -660,13 +818,15 @@ mod tests { seq_id = 1; epoch += 1; - let delete_range_right2 = - serde.serialize_truncation_offset_watermark(DEFAULT_VNODE, (epoch, None)); + let delete_range_right2 = remove_vnode_prefix( + &serde.serialize_truncation_offset_watermark(DEFAULT_VNODE, (epoch, None)), + ); for (op, row) in stream_chunk.rows() { let (_, key, value) = serde.serialize_data_row(epoch, seq_id, op, row); - assert!(key.as_ref() >= delete_range_right1); - assert!(key.as_ref() < delete_range_right2); + let key = remove_vnode_prefix(&key.0); + assert!(key >= delete_range_right1); + assert!(key < delete_range_right2); serialized_keys.push(key); let (decoded_epoch, row_op) = serde.deserialize(value).unwrap(); assert_eq!(decoded_epoch, epoch); @@ -684,6 +844,7 @@ mod tests { } let (key, encoded_checkpoint_barrier) = serde.serialize_barrier(epoch, DEFAULT_VNODE, true); + let key = remove_vnode_prefix(&key.0); match serde.deserialize(encoded_checkpoint_barrier).unwrap() { (decoded_epoch, LogStoreRowOp::Barrier { is_checkpoint }) => { assert_eq!(decoded_epoch, epoch); @@ -729,8 +890,7 @@ mod tests { #[tokio::test] async fn test_deserialize_stream_chunk() { let table = gen_test_log_store_table(); - let serde = LogStoreRowSerde::new(&table, None); - + let serde = LogStoreRowSerde::new(&table, Some(Arc::new(Bitmap::ones(VirtualNode::COUNT)))); let (ops, rows) = gen_test_data(0); let mut seq_id = 1; @@ -746,7 +906,13 @@ mod tests { let end_seq_id = seq_id - 1; tx.send(()).unwrap(); let chunk = serde - .deserialize_stream_chunk(stream, start_seq_id, end_seq_id, EPOCH1) + .deserialize_stream_chunk( + stream, + start_seq_id, + end_seq_id, + EPOCH1, + &KvLogStoreReadMetrics::for_test(), + ) .await .unwrap(); for (i, (op, row)) in chunk.rows().enumerate() { @@ -790,25 +956,34 @@ mod tests { impl StateStoreReadIterStream, oneshot::Sender<()>, oneshot::Sender<()>, + Vec, + Vec, ) { let (ops, rows) = gen_test_data(base); + let first_barrier = { + let (key, value) = serde.serialize_barrier(EPOCH0, DEFAULT_VNODE, true); + Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH0), value)) + }; + let stream = stream::once(async move { first_barrier }); let (row_stream, tx1) = gen_row_stream(serde.clone(), ops.clone(), rows.clone(), EPOCH1, seq_id); - let stream = row_stream.chain(stream::once({ + let stream = stream.chain(row_stream); + let stream = stream.chain(stream::once({ let serde = serde.clone(); async move { let (key, value) = serde.serialize_barrier(EPOCH1, DEFAULT_VNODE, 
false); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH1), value)) } })); - let (row_stream, tx2) = gen_row_stream(serde.clone(), ops, rows, EPOCH2, seq_id); + let (row_stream, tx2) = + gen_row_stream(serde.clone(), ops.clone(), rows.clone(), EPOCH2, seq_id); let stream = stream.chain(row_stream).chain(stream::once({ async move { let (key, value) = serde.serialize_barrier(EPOCH2, DEFAULT_VNODE, true); Ok((FullKey::new(TEST_TABLE_ID, key, EPOCH2), value)) } })); - (stream, tx1, tx2) + (stream, tx1, tx2, ops, rows) } #[allow(clippy::type_complexity)] @@ -826,17 +1001,19 @@ mod tests { let mut streams = Vec::new(); let mut tx1 = Vec::new(); let mut tx2 = Vec::new(); + let mut ops = Vec::new(); + let mut rows = Vec::new(); for i in 0..size { - let (s, t1, t2) = gen_single_test_stream(serde.clone(), &mut seq_id, (100 * i) as _); + let (s, t1, t2, op_list, row_list) = + gen_single_test_stream(serde.clone(), &mut seq_id, (100 * i) as _); streams.push(s); tx1.push(Some(t1)); tx2.push(Some(t2)); + ops.push(op_list); + rows.push(row_list); } - let stream = LogStoreRowOpStream::new(streams, serde); - - let mut ops = Vec::new(); - let mut rows = Vec::new(); + let stream = LogStoreRowOpStream::new(streams, serde, KvLogStoreReadMetrics::for_test()); for i in 0..size { let (o, r) = gen_test_data((100 * i) as _); @@ -851,20 +1028,35 @@ mod tests { async fn test_row_stream_basic() { let table = gen_test_log_store_table(); - let serde = LogStoreRowSerde::new(&table, None); + let serde = LogStoreRowSerde::new(&table, Some(Arc::new(Bitmap::ones(VirtualNode::COUNT)))); const MERGE_SIZE: usize = 10; - let (stream, mut tx1, mut tx2, ops, rows) = gen_multi_test_stream(serde, MERGE_SIZE); + let (mut stream, mut tx1, mut tx2, ops, rows) = gen_multi_test_stream(serde, MERGE_SIZE); + + stream.init().await.unwrap(); pin_mut!(stream); + let (epoch, op, _) = stream.next_op().await.unwrap().unwrap(); + + assert_eq!( + ( + EPOCH0, + LogStoreRowOp::Barrier { + is_checkpoint: true + } + ), + (epoch, op) + ); + let mut index = (0..MERGE_SIZE).collect_vec(); index.shuffle(&mut thread_rng()); for i in index { tx1[i].take().unwrap().send(()).unwrap(); for j in 0..ops[i].len() { + let (epoch, op, _) = stream.next_op().await.unwrap().unwrap(); assert_eq!( ( EPOCH1, @@ -873,11 +1065,13 @@ mod tests { row: rows[i][j].clone(), } ), - stream.next_op().await.unwrap().unwrap() + (epoch, op) ); } } + let (epoch, op, _) = stream.next_op().await.unwrap().unwrap(); + assert_eq!( ( EPOCH1, @@ -885,7 +1079,7 @@ mod tests { is_checkpoint: false } ), - stream.next_op().await.unwrap().unwrap() + (epoch, op) ); let mut index = (0..MERGE_SIZE).collect_vec(); @@ -894,6 +1088,7 @@ mod tests { for i in index { tx2[i].take().unwrap().send(()).unwrap(); for j in 0..ops[i].len() { + let (epoch, op, _) = stream.next_op().await.unwrap().unwrap(); assert_eq!( ( EPOCH2, @@ -902,11 +1097,12 @@ mod tests { row: rows[i][j].clone(), } ), - stream.next_op().await.unwrap().unwrap() + (epoch, op) ); } } + let (epoch, op, _) = stream.next_op().await.unwrap().unwrap(); assert_eq!( ( EPOCH2, @@ -914,7 +1110,7 @@ mod tests { is_checkpoint: true, } ), - stream.next_op().await.unwrap().unwrap() + (epoch, op) ); assert!(stream.next_op().await.unwrap().is_none()); @@ -924,48 +1120,56 @@ mod tests { async fn test_log_store_stream_basic() { let table = gen_test_log_store_table(); - let serde = LogStoreRowSerde::new(&table, None); + let serde = LogStoreRowSerde::new(&table, Some(Arc::new(Bitmap::ones(VirtualNode::COUNT)))); let mut seq_id = 1; - let (stream, tx1, tx2) = 
gen_single_test_stream(serde.clone(), &mut seq_id, 0); - let (ops, rows) = gen_test_data(0); + let (stream, tx1, tx2, ops, rows) = gen_single_test_stream(serde.clone(), &mut seq_id, 0); const CHUNK_SIZE: usize = 3; - let stream = new_log_store_item_stream(vec![stream], serde, CHUNK_SIZE); + let stream = merge_log_store_item_stream( + vec![stream], + serde, + CHUNK_SIZE, + KvLogStoreReadMetrics::for_test(), + ); pin_mut!(stream); + let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); + assert_eq!(EPOCH0, epoch); + match item { + KvLogStoreItem::StreamChunk(_) => unreachable!(), + KvLogStoreItem::Barrier { is_checkpoint } => { + assert!(is_checkpoint); + } + } + assert!(poll_fn(|cx| Poll::Ready(stream.poll_next_unpin(cx))) .await .is_pending()); tx1.send(()).unwrap(); - let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); - assert_eq!(EPOCH1, epoch); - match item { - KvLogStoreItem::StreamChunk(chunk) => { - assert_eq!(chunk.cardinality(), CHUNK_SIZE); - for (i, (op, row)) in chunk.rows().enumerate() { - assert_eq!(op, ops[i]); - assert_eq!(row.to_owned_row(), rows[i]); - } - } - _ => unreachable!(), - } - - let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); - assert_eq!(EPOCH1, epoch); - match item { - KvLogStoreItem::StreamChunk(chunk) => { - assert_eq!(chunk.cardinality(), ops.len() - CHUNK_SIZE); - for (i, (op, row)) in chunk.rows().skip(CHUNK_SIZE).enumerate() { - assert_eq!(op, ops[i + CHUNK_SIZE]); - assert_eq!(row.to_owned_row(), rows[i + CHUNK_SIZE]); + { + let mut remain = ops.len(); + while remain > 0 { + let size = min(remain, CHUNK_SIZE); + let start_index = ops.len() - remain; + remain -= size; + let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); + assert_eq!(EPOCH1, epoch); + match item { + KvLogStoreItem::StreamChunk(chunk) => { + assert_eq!(chunk.cardinality(), size); + assert!(check_rows_eq( + chunk.rows(), + (start_index..(start_index + size)).map(|i| (ops[i], &rows[i])) + )); + } + _ => unreachable!(), } } - _ => unreachable!(), } let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); @@ -983,30 +1187,25 @@ mod tests { tx2.send(()).unwrap(); - let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); - assert_eq!(EPOCH2, epoch); - match item { - KvLogStoreItem::StreamChunk(chunk) => { - assert_eq!(chunk.cardinality(), CHUNK_SIZE); - for (i, (op, row)) in chunk.rows().enumerate() { - assert_eq!(op, ops[i]); - assert_eq!(row.to_owned_row(), rows[i]); - } - } - _ => unreachable!(), - } - - let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); - assert_eq!(EPOCH2, epoch); - match item { - KvLogStoreItem::StreamChunk(chunk) => { - assert_eq!(chunk.cardinality(), ops.len() - CHUNK_SIZE); - for (i, (op, row)) in chunk.rows().skip(CHUNK_SIZE).enumerate() { - assert_eq!(op, ops[i + CHUNK_SIZE]); - assert_eq!(row.to_owned_row(), rows[i + CHUNK_SIZE]); + { + let mut remain = ops.len(); + while remain > 0 { + let size = min(remain, CHUNK_SIZE); + let start_index = ops.len() - remain; + remain -= size; + let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); + assert_eq!(EPOCH2, epoch); + match item { + KvLogStoreItem::StreamChunk(chunk) => { + assert_eq!(chunk.cardinality(), size); + assert!(check_rows_eq( + chunk.rows(), + (start_index..(start_index + size)).map(|i| (ops[i], &rows[i])) + )); + } + _ => unreachable!(), } } - _ => unreachable!(), 
} let (epoch, item): (_, KvLogStoreItem) = stream.try_next().await.unwrap().unwrap(); @@ -1025,11 +1224,16 @@ mod tests { async fn test_empty_stream() { let table = gen_test_log_store_table(); - let serde = LogStoreRowSerde::new(&table, None); + let serde = LogStoreRowSerde::new(&table, Some(Arc::new(Bitmap::ones(VirtualNode::COUNT)))); const CHUNK_SIZE: usize = 3; - let stream = new_log_store_item_stream(vec![empty(), empty()], serde, CHUNK_SIZE); + let stream = merge_log_store_item_stream( + vec![empty(), empty()], + serde, + CHUNK_SIZE, + KvLogStoreReadMetrics::for_test(), + ); pin_mut!(stream); diff --git a/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs new file mode 100644 index 0000000000000..809b5b42129d2 --- /dev/null +++ b/src/stream/src/common/log_store_impl/kv_log_store/test_utils.rs @@ -0,0 +1,207 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use itertools::{zip_eq, Itertools}; +use rand::RngCore; +use risingwave_common::array::{Op, RowRef, StreamChunk}; +use risingwave_common::buffer::{Bitmap, BitmapBuilder}; +use risingwave_common::catalog::{ColumnDesc, ColumnId, TableId}; +use risingwave_common::constants::log_store::KV_LOG_STORE_PREDEFINED_COLUMNS; +use risingwave_common::hash::VirtualNode; +use risingwave_common::row::{OwnedRow, Row}; +use risingwave_common::types::{DataType, ScalarImpl, ScalarRef}; +use risingwave_common::util::chunk_coalesce::DataChunkBuilder; +use risingwave_common::util::sort_util::OrderType; +use risingwave_pb::catalog::PbTable; + +use crate::common::table::test_utils::gen_prost_table_with_dist_key; + +pub(crate) const TEST_TABLE_ID: TableId = TableId { table_id: 233 }; +pub(crate) const TEST_DATA_SIZE: usize = 10; + +pub(crate) fn gen_test_data(base: i64) -> (Vec, Vec) { + gen_sized_test_data(base, TEST_DATA_SIZE) +} + +pub(crate) fn gen_sized_test_data(base: i64, max_count: usize) -> (Vec, Vec) { + let mut ops = Vec::new(); + let mut rows = Vec::new(); + while ops.len() < max_count - 1 { + let index = ops.len() as i64; + match rand::thread_rng().next_u32() % 3 { + 0 => { + ops.push(Op::Insert); + rows.push(OwnedRow::new(vec![ + Some(ScalarImpl::Int64(index + base)), + Some(ScalarImpl::Utf8( + format!("name{}", index).as_str().to_owned_scalar(), + )), + ])); + } + 1 => { + ops.push(Op::Delete); + rows.push(OwnedRow::new(vec![ + Some(ScalarImpl::Int64(index + base)), + Some(ScalarImpl::Utf8( + format!("name{}", index).as_str().to_owned_scalar(), + )), + ])); + } + 2 => { + ops.push(Op::UpdateDelete); + rows.push(OwnedRow::new(vec![ + Some(ScalarImpl::Int64(index + base)), + Some(ScalarImpl::Utf8( + format!("name{}", index).as_str().to_owned_scalar(), + )), + ])); + ops.push(Op::UpdateInsert); + rows.push(OwnedRow::new(vec![ + Some(ScalarImpl::Int64(index + base)), + Some(ScalarImpl::Utf8( + format!("name{}", index + 1).as_str().to_owned_scalar(), + )), + ])); + } + _ => unreachable!(), + } + } + (ops, rows) +} + 
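// Editor's sketch (not part of the original patch): a minimal self-check showing how the
// generators above and the equality helpers below compose. `gen_stream_chunk` and
// `check_stream_chunk_eq` are defined later in this module; the module name here is hypothetical.
#[cfg(test)]
mod generator_sanity {
    use super::*;

    #[test]
    fn chunk_compares_equal_to_itself() {
        // `gen_sized_test_data(base, max_count)` yields between `max_count - 1` and
        // `max_count` rows, since the update branch pushes an `UpdateDelete`/`UpdateInsert`
        // pair at once.
        let chunk = gen_stream_chunk(233);
        // `check_rows_eq` sorts both sides by the `id` column before comparing, so
        // self-comparison must hold regardless of row order.
        assert!(check_stream_chunk_eq(&chunk, &chunk));
    }
}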
+pub(crate) fn test_payload_schema() -> Vec { + vec![ + ColumnDesc::unnamed(ColumnId::from(3), DataType::Int64), // id + ColumnDesc::unnamed(ColumnId::from(4), DataType::Varchar), // name + ] +} + +pub(crate) fn test_log_store_table_schema() -> Vec { + let mut column_descs = vec![ + ColumnDesc::unnamed(ColumnId::from(0), DataType::Int64), // epoch + ColumnDesc::unnamed(ColumnId::from(1), DataType::Int32), // Seq id + ColumnDesc::unnamed(ColumnId::from(2), DataType::Int16), // op code + ]; + column_descs.extend(test_payload_schema()); + column_descs +} + +pub(crate) fn gen_stream_chunk(base: i64) -> StreamChunk { + let (ops, rows) = gen_test_data(base); + let mut builder = DataChunkBuilder::new( + test_payload_schema() + .iter() + .map(|col| col.data_type.clone()) + .collect_vec(), + 1000000, + ); + for row in &rows { + assert!(builder.append_one_row(row).is_none()); + } + let data_chunk = builder.consume_all().unwrap(); + StreamChunk::from_parts(ops, data_chunk) +} + +pub(crate) fn gen_multi_vnode_stream_chunks( + base: i64, + max_count: usize, +) -> [StreamChunk; MOD_COUNT] { + let mut data_builder = (0..MOD_COUNT) + .map(|_| { + ( + Vec::new() as Vec, + DataChunkBuilder::new( + test_payload_schema() + .iter() + .map(|col| col.data_type.clone()) + .collect_vec(), + max_count, + ), + ) + }) + .collect_vec(); + let (ops, rows) = gen_sized_test_data(base, max_count); + for (op, row) in zip_eq(ops, rows) { + let vnode = VirtualNode::compute_row(&row, &[TEST_SCHEMA_DIST_KEY_INDEX]); + let (ops, builder) = &mut data_builder[vnode.to_index() % MOD_COUNT]; + ops.push(op); + assert!(builder.append_one_row(row).is_none()); + } + + data_builder + .into_iter() + .map(|(ops, mut builder)| StreamChunk::from_parts(ops, builder.consume_all().unwrap())) + .collect_vec() + .try_into() + .unwrap() +} + +pub(crate) const TEST_SCHEMA_DIST_KEY_INDEX: usize = 0; + +pub(crate) fn gen_test_log_store_table() -> PbTable { + let schema = test_log_store_table_schema(); + let order_types = vec![OrderType::ascending(), OrderType::ascending_nulls_last()]; + let pk_index = vec![0_usize, 1_usize]; + let read_prefix_len_hint = 0; + gen_prost_table_with_dist_key( + TEST_TABLE_ID, + schema, + order_types, + pk_index, + read_prefix_len_hint, + vec![TEST_SCHEMA_DIST_KEY_INDEX + KV_LOG_STORE_PREDEFINED_COLUMNS.len()], // id field + ) +} + +pub(crate) fn calculate_vnode_bitmap<'a>( + test_data: impl Iterator)>, +) -> Bitmap { + let mut builder = BitmapBuilder::zeroed(VirtualNode::COUNT); + for vnode in + test_data.map(|(_, row)| VirtualNode::compute_row(row, &[TEST_SCHEMA_DIST_KEY_INDEX])) + { + builder.set(vnode.to_index(), true); + } + builder.finish() +} + +pub(crate) fn check_rows_eq( + first: impl Iterator, + second: impl Iterator, +) -> bool { + for ((op1, row1), (op2, row2)) in zip_eq( + first.sorted_by_key(|(_, row)| { + row.datum_at(TEST_SCHEMA_DIST_KEY_INDEX) + .unwrap() + .into_int64() + }), + second.sorted_by_key(|(_, row)| { + row.datum_at(TEST_SCHEMA_DIST_KEY_INDEX) + .unwrap() + .into_int64() + }), + ) { + if op1 != op2 { + return false; + } + if row1.to_owned_row() != row2.to_owned_row() { + return false; + } + } + true +} + +pub(crate) fn check_stream_chunk_eq(first: &StreamChunk, second: &StreamChunk) -> bool { + check_rows_eq(first.rows(), second.rows()) +} diff --git a/src/stream/src/common/log_store/kv_log_store/writer.rs b/src/stream/src/common/log_store_impl/kv_log_store/writer.rs similarity index 83% rename from src/stream/src/common/log_store/kv_log_store/writer.rs rename to 
src/stream/src/common/log_store_impl/kv_log_store/writer.rs index 54d7db38b8570..1e6e8681fcd77 100644 --- a/src/stream/src/common/log_store/kv_log_store/writer.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/writer.rs @@ -19,14 +19,17 @@ use bytes::Bytes; use risingwave_common::array::StreamChunk; use risingwave_common::buffer::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::TableId; +use risingwave_common::estimate_size::EstimateSize; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::util::epoch::EpochPair; +use risingwave_connector::sink::log_store::{LogStoreResult, LogWriter}; use risingwave_storage::store::{InitOptions, LocalStateStore}; -use crate::common::log_store::kv_log_store::buffer::LogStoreBufferSender; -use crate::common::log_store::kv_log_store::serde::LogStoreRowSerde; -use crate::common::log_store::kv_log_store::{SeqIdType, FIRST_SEQ_ID}; -use crate::common::log_store::{LogStoreResult, LogWriter}; +use crate::common::log_store_impl::kv_log_store::buffer::LogStoreBufferSender; +use crate::common::log_store_impl::kv_log_store::serde::LogStoreRowSerde; +use crate::common::log_store_impl::kv_log_store::{ + FlushInfo, KvLogStoreMetrics, SeqIdType, FIRST_SEQ_ID, +}; pub struct KvLogStoreWriter { _table_id: TableId, @@ -38,6 +41,8 @@ pub struct KvLogStoreWriter { serde: LogStoreRowSerde, tx: LogStoreBufferSender, + + metrics: KvLogStoreMetrics, } impl KvLogStoreWriter { @@ -46,6 +51,7 @@ impl KvLogStoreWriter { state_store: LS, serde: LogStoreRowSerde, tx: LogStoreBufferSender, + metrics: KvLogStoreMetrics, ) -> Self { Self { _table_id: table_id, @@ -53,6 +59,7 @@ impl KvLogStoreWriter { state_store, serde, tx, + metrics, } } } @@ -68,7 +75,9 @@ impl LogWriter for KvLogStoreWriter { } async fn write_chunk(&mut self, chunk: StreamChunk) -> LogStoreResult<()> { - assert!(chunk.cardinality() > 0); + if chunk.cardinality() == 0 { + return Ok(()); + } let epoch = self.state_store.epoch(); let start_seq_id = self.seq_id; self.seq_id += chunk.cardinality() as SeqIdType; @@ -80,13 +89,16 @@ impl LogWriter for KvLogStoreWriter { // When enter this branch, the chunk cannot be added directly, and should be add to // state store and flush let mut vnode_bitmap_builder = BitmapBuilder::zeroed(VirtualNode::COUNT); + let mut flush_info = FlushInfo::new(); for (i, (op, row)) in chunk.rows().enumerate() { let seq_id = start_seq_id + (i as SeqIdType); assert!(seq_id <= end_seq_id); let (vnode, key, value) = self.serde.serialize_data_row(epoch, seq_id, op, row); vnode_bitmap_builder.set(vnode.to_index(), true); + flush_info.flush_one(key.estimated_size() + value.estimated_size()); self.state_store.insert(key, value, None)?; } + flush_info.report(&self.metrics); self.state_store.flush(Vec::new()).await?; let vnode_bitmap = vnode_bitmap_builder.finish(); @@ -102,8 +114,10 @@ impl LogWriter for KvLogStoreWriter { is_checkpoint: bool, ) -> LogStoreResult<()> { let epoch = self.state_store.epoch(); + let mut flush_info = FlushInfo::new(); for vnode in self.serde.vnodes().iter_vnodes() { let (key, value) = self.serde.serialize_barrier(epoch, vnode, is_checkpoint); + flush_info.flush_one(key.estimated_size() + value.estimated_size()); self.state_store.insert(key, value, None)?; } self.tx @@ -112,12 +126,14 @@ impl LogWriter for KvLogStoreWriter { let seq_id = start_seq_id + (i as SeqIdType); assert!(seq_id <= end_seq_id); let (_, key, value) = self.serde.serialize_data_row(epoch, seq_id, op, row); + flush_info.flush_one(key.estimated_size() + 
value.estimated_size()); self.state_store.insert(key, value, None)?; } Ok(()) })?; + flush_info.report(&self.metrics); let mut delete_range = Vec::with_capacity(self.serde.vnodes().count_ones()); - if let Some(truncation_offset) = self.tx.pop_truncation() { + if let Some(truncation_offset) = self.tx.pop_truncation(epoch) { for vnode in self.serde.vnodes().iter_vnodes() { let range_begin = Bytes::from(vnode.to_be_bytes().to_vec()); let range_end = self diff --git a/src/stream/src/common/log_store_impl/mod.rs b/src/stream/src/common/log_store_impl/mod.rs new file mode 100644 index 0000000000000..633fa07f2617d --- /dev/null +++ b/src/stream/src/common/log_store_impl/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod in_mem; +pub mod kv_log_store; diff --git a/src/stream/src/common/mod.rs b/src/stream/src/common/mod.rs index e865214cb0990..7f5111c29e03e 100644 --- a/src/stream/src/common/mod.rs +++ b/src/stream/src/common/mod.rs @@ -18,6 +18,6 @@ pub use column_mapping::*; mod builder; pub mod cache; mod column_mapping; -pub mod log_store; +pub mod log_store_impl; pub mod metrics; pub mod table; diff --git a/src/stream/src/common/table/state_table.rs b/src/stream/src/common/table/state_table.rs index aadbccc1b97f6..37e788a3e7abd 100644 --- a/src/stream/src/common/table/state_table.rs +++ b/src/stream/src/common/table/state_table.rs @@ -562,8 +562,8 @@ where &self.value_indices } - pub fn is_dirty(&self) -> bool { - self.local_store.is_dirty() + fn is_dirty(&self) -> bool { + self.local_store.is_dirty() || self.state_clean_watermark.is_some() } pub fn vnode_bitmap(&self) -> &Bitmap { @@ -908,9 +908,15 @@ where // Tick the watermark buffer here because state table is expected to be committed once // per epoch. self.watermark_buffer_strategy.tick(); - self.seal_current_epoch(new_epoch.curr) - .instrument(tracing::info_span!("state_table_commit")) - .await?; + if !self.is_dirty() { + // If the state table is not modified, go fast path. + self.local_store.seal_current_epoch(new_epoch.curr); + return Ok(()); + } else { + self.seal_current_epoch(new_epoch.curr) + .instrument(tracing::info_span!("state_table_commit")) + .await?; + } // Refresh watermark cache if it is out of sync. if USE_WATERMARK_CACHE && !self.watermark_cache.is_synced() { @@ -930,7 +936,7 @@ where let mut streams = vec![]; for vnode in self.vnodes().iter_vnodes() { let stream = self - .iter_row_with_pk_range(&range, vnode, PrefetchOptions::default()) + .iter_with_vnode(vnode, &range, PrefetchOptions::default()) .await?; streams.push(Box::pin(stream)); } @@ -1088,38 +1094,16 @@ where S: StateStore, SD: ValueRowSerde, { - /// This function scans rows from the relational table. - pub async fn iter_row( - &self, - prefetch_options: PrefetchOptions, - ) -> StreamExecutorResult> { - self.iter_row_with_pk_prefix(row::empty(), prefetch_options) - .await - } - - /// This function scans rows from the relational table with specific `pk_prefix`. 
- /// `pk_prefix` is used to identify the exact vnode the scan should perform on. - pub async fn iter_row_with_pk_prefix( - &self, - pk_prefix: impl Row, - prefetch_options: PrefetchOptions, - ) -> StreamExecutorResult> { - Ok(deserialize_keyed_row_stream( - self.iter_kv_with_pk_prefix(pk_prefix, prefetch_options) - .await?, - &self.row_serde, - )) - } - /// This function scans rows from the relational table with specific `pk_range` under the same /// `vnode`. - pub async fn iter_row_with_pk_range( + pub async fn iter_with_vnode( &self, - pk_range: &(Bound, Bound), + // Optional vnode that returns an iterator only over the given range under that vnode. // For now, we require this parameter, and will panic. In the future, when `None`, we can // iterate over each vnode that the `StateTableInner` owns. vnode: VirtualNode, + pk_range: &(Bound, Bound), prefetch_options: PrefetchOptions, ) -> StreamExecutorResult> { Ok(deserialize_keyed_row_stream( @@ -1149,13 +1133,18 @@ where Ok(self.local_store.iter(table_key_range, read_options).await?) } - /// This function scans raw key-values from the relational table with specific `pk_prefix`. + /// This function scans rows from the relational table with specific `prefix` and `sub_range` under the same + /// `vnode`. If `sub_range` is (Unbounded, Unbounded), it scans rows from the relational table with specific `pk_prefix`. /// `pk_prefix` is used to identify the exact vnode the scan should perform on. - async fn iter_kv_with_pk_prefix( + + /// This function scans rows from the relational table with specific `prefix` and `pk_sub_range` under the same + /// `vnode`. + pub async fn iter_with_prefix( &self, pk_prefix: impl Row, + sub_range: &(Bound, Bound), prefetch_options: PrefetchOptions, - ) -> StreamExecutorResult<::IterStream<'_>> { + ) -> StreamExecutorResult> { let prefix_serializer = self.pk_serde.prefix(pk_prefix.len()); let encoded_prefix = serialize_pk(&pk_prefix, &prefix_serializer); let encoded_key_range = range_of_prefix(&encoded_prefix); @@ -1190,27 +1179,18 @@ where "storage_iter_with_prefix" ); - self.iter_kv(encoded_key_range_with_vnode, prefix_hint, prefetch_options) - .await - } - - /// This function scans rows from the relational table with specific `prefix` and `pk_sub_range` under the same - /// `vnode`. - pub async fn iter_row_with_pk_prefix_sub_range( - &self, - pk_prefix: impl Row, - sub_range: &(Bound, Bound), - prefetch_options: PrefetchOptions, - ) -> StreamExecutorResult> { - let vnode = self.compute_prefix_vnode(&pk_prefix).to_be_bytes(); - let memcomparable_range = prefix_and_sub_range_to_memcomparable(&self.pk_serde, sub_range, pk_prefix); let memcomparable_range_with_vnode = prefixed_range(memcomparable_range, &vnode); + Ok(deserialize_keyed_row_stream( - self.iter_kv(memcomparable_range_with_vnode, None, prefetch_options) - .await?, + self.iter_kv( + memcomparable_range_with_vnode, + prefix_hint, + prefetch_options, + ) + .await?, &self.row_serde, )) } diff --git a/src/stream/src/common/table/test_state_table.rs b/src/stream/src/common/table/test_state_table.rs index c0a07ebcb2f02..7b6d1dce99f21 100644 --- a/src/stream/src/common/table/test_state_table.rs +++ b/src/stream/src/common/table/test_state_table.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
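// Editor's note (not part of the patch): the tests in this file migrate from the removed
// `iter_row` / `iter_row_with_pk_prefix` / `iter_row_with_pk_range` methods to
// `iter_with_prefix` / `iter_with_vnode`, both of which take an explicit sub-range.
// A full scan under an empty prefix now reads roughly as follows (sketch; variable names
// are illustrative and `OwnedRow` as the bound type is assumed):
//
//     let sub_range: &(Bound<OwnedRow>, Bound<OwnedRow>) = &(Unbounded, Unbounded);
//     let iter = state_table
//         .iter_with_prefix(row::empty(), sub_range, Default::default())
//         .await
//         .unwrap();
//     pin_mut!(iter);
//     while let Some(res) = iter.next().await {
//         let _keyed_row = res.unwrap();
//     }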
-use std::ops::Bound; +use std::ops::Bound::{self, *}; use futures::{pin_mut, StreamExt}; use risingwave_common::array::{Op, StreamChunk}; @@ -282,8 +282,9 @@ async fn test_state_table_iter_with_prefix() { ])); let pk_prefix = OwnedRow::new(vec![Some(1_i32.into())]); + let sub_range: &(Bound, Bound) = &(Bound::Unbounded, Bound::Unbounded); let iter = state_table - .iter_row_with_pk_prefix(&pk_prefix, Default::default()) + .iter_with_prefix(&pk_prefix, sub_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -414,7 +415,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::Included(OwnedRow::new(vec![Some(4_i32.into())])), ); let iter = state_table - .iter_row_with_pk_range(&pk_range, DEFAULT_VNODE, Default::default()) + .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -439,7 +440,7 @@ async fn test_state_table_iter_with_pk_range() { std::ops::Bound::::Unbounded, ); let iter = state_table - .iter_row_with_pk_range(&pk_range, DEFAULT_VNODE, Default::default()) + .iter_with_vnode(DEFAULT_VNODE, &pk_range, Default::default()) .await .unwrap(); pin_mut!(iter); @@ -576,9 +577,12 @@ async fn test_state_table_iter_with_value_indices() { Some(99_i32.into()), Some(999_i32.into()), ])); - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); { - let iter = state_table.iter_row(Default::default()).await.unwrap(); + let iter = state_table + .iter_with_prefix(row::empty(), sub_range, Default::default()) + .await + .unwrap(); pin_mut!(iter); let res = iter.next().await.unwrap().unwrap(); @@ -633,7 +637,10 @@ async fn test_state_table_iter_with_value_indices() { Some(888_i32.into()), ])); - let iter = state_table.iter_row(Default::default()).await.unwrap(); + let iter = state_table + .iter_with_prefix(row::empty(), sub_range, Default::default()) + .await + .unwrap(); pin_mut!(iter); let res = iter.next().await.unwrap().unwrap(); @@ -737,9 +744,12 @@ async fn test_state_table_iter_with_shuffle_value_indices() { Some(99_i32.into()), Some(999_i32.into()), ])); - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); { - let iter = state_table.iter_row(Default::default()).await.unwrap(); + let iter = state_table + .iter_with_prefix(row::empty(), sub_range, Default::default()) + .await + .unwrap(); pin_mut!(iter); let res = iter.next().await.unwrap().unwrap(); @@ -815,7 +825,10 @@ async fn test_state_table_iter_with_shuffle_value_indices() { Some(888_i32.into()), ])); - let iter = state_table.iter_row(Default::default()).await.unwrap(); + let iter = state_table + .iter_with_prefix(row::empty(), sub_range, Default::default()) + .await + .unwrap(); pin_mut!(iter); let res = iter.next().await.unwrap().unwrap(); @@ -1000,9 +1013,13 @@ async fn test_state_table_write_chunk() { ); state_table.write_chunk(chunk); - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); let rows: Vec<_> = state_table - .iter_row(PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix( + row::empty(), + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) .await .unwrap() .collect::>() @@ -1114,9 +1131,13 @@ async fn test_state_table_write_chunk_visibility() { StreamChunk::with_visibility(ops, columns, Bitmap::from_iter([true, true, true, false])); state_table.write_chunk(chunk); - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); let rows: Vec<_> = state_table - .iter_row(PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix( + row::empty(), + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) .await 
.unwrap() .collect::>() @@ -1226,9 +1247,13 @@ async fn test_state_table_write_chunk_value_indices() { ); state_table.write_chunk(chunk); - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); let rows: Vec<_> = state_table - .iter_row(PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix( + row::empty(), + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) .await .unwrap() .collect::>() @@ -1508,9 +1533,13 @@ async fn test_state_table_watermark_cache_ignore_null() { let chunk = StreamChunk::from_rows(&rows, &data_types); state_table.write_chunk(chunk); - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); let inserted_rows: Vec<_> = state_table - .iter_row(PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix( + row::empty(), + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) .await .unwrap() .collect::>() @@ -1795,9 +1824,13 @@ async fn test_state_table_watermark_cache_refill() { for row in &rows { state_table.insert(row); } - + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); let inserted_rows: Vec<_> = state_table - .iter_row(PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix( + row::empty(), + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) .await .unwrap() .collect::>() @@ -1895,7 +1928,7 @@ async fn test_state_table_iter_prefix_and_sub_range() { ); let iter = state_table - .iter_row_with_pk_prefix_sub_range(pk_prefix, &sub_range1, Default::default()) + .iter_with_prefix(pk_prefix, &sub_range1, Default::default()) .await .unwrap(); @@ -1933,7 +1966,7 @@ async fn test_state_table_iter_prefix_and_sub_range() { let pk_prefix = OwnedRow::new(vec![Some(1_i32.into())]); let iter = state_table - .iter_row_with_pk_prefix_sub_range(pk_prefix, &sub_range2, Default::default()) + .iter_with_prefix(pk_prefix, &sub_range2, Default::default()) .await .unwrap(); @@ -1971,7 +2004,7 @@ async fn test_state_table_iter_prefix_and_sub_range() { let pk_prefix = OwnedRow::new(vec![Some(1_i32.into())]); let iter = state_table - .iter_row_with_pk_prefix_sub_range(pk_prefix, &sub_range3, Default::default()) + .iter_with_prefix(pk_prefix, &sub_range3, Default::default()) .await .unwrap(); diff --git a/src/stream/src/common/table/test_utils.rs b/src/stream/src/common/table/test_utils.rs index 526f6864b3a99..90e7886df26bf 100644 --- a/src/stream/src/common/table/test_utils.rs +++ b/src/stream/src/common/table/test_utils.rs @@ -38,6 +38,26 @@ pub(crate) fn gen_prost_table( ) } +pub(crate) fn gen_prost_table_with_dist_key( + table_id: TableId, + column_descs: Vec, + order_types: Vec, + pk_index: Vec, + read_prefix_len_hint: u32, + distribution_key: Vec, +) -> PbTable { + let col_len = column_descs.len() as i32; + gen_prost_table_inner( + table_id, + column_descs, + order_types, + pk_index, + read_prefix_len_hint, + (0..col_len).collect_vec(), + distribution_key, + ) +} + pub(crate) fn gen_prost_table_with_value_indices( table_id: TableId, column_descs: Vec, @@ -45,6 +65,26 @@ pub(crate) fn gen_prost_table_with_value_indices( pk_index: Vec, read_prefix_len_hint: u32, value_indices: Vec, +) -> PbTable { + gen_prost_table_inner( + table_id, + column_descs, + order_types, + pk_index, + read_prefix_len_hint, + value_indices, + Vec::default(), + ) +} + +pub(crate) fn gen_prost_table_inner( + table_id: TableId, + column_descs: Vec, + order_types: Vec, + pk_index: Vec, + read_prefix_len_hint: u32, + value_indices: Vec, + distribution_key: Vec, ) -> PbTable { let prost_pk = pk_index .iter() @@ -62,12 +102,15 @@ pub(crate) fn 
gen_prost_table_with_value_indices( }) .collect(); + let distribution_key = distribution_key.into_iter().map(|i| i as i32).collect_vec(); + PbTable { id: table_id.table_id(), columns: prost_columns, pk: prost_pk, read_prefix_len_hint, value_indices, + distribution_key, ..Default::default() } } diff --git a/src/stream/src/error.rs b/src/stream/src/error.rs index b737de4d2560b..2930cda31747e 100644 --- a/src/stream/src/error.rs +++ b/src/stream/src/error.rs @@ -16,6 +16,7 @@ use std::backtrace::Backtrace; use risingwave_common::array::ArrayError; use risingwave_connector::error::ConnectorError; +use risingwave_connector::sink::SinkError; use risingwave_expr::ExprError; use risingwave_pb::PbFieldNotFound; use risingwave_storage::error::StorageError; @@ -58,6 +59,9 @@ enum ErrorKind { #[error("Executor error: {0:?}")] Executor(#[source] StreamExecutorError), + #[error("Sink error: {0:?}")] + Sink(#[source] SinkError), + #[error(transparent)] Internal(anyhow::Error), } @@ -115,6 +119,12 @@ impl From for StreamError { } } +impl From for StreamError { + fn from(value: SinkError) -> Self { + ErrorKind::Sink(value).into() + } +} + impl From for StreamError { fn from(err: PbFieldNotFound) -> Self { Self::from(anyhow::anyhow!( diff --git a/src/stream/src/executor/actor.rs b/src/stream/src/executor/actor.rs index 17f874acb0fc6..85846557a3c4a 100644 --- a/src/stream/src/executor/actor.rs +++ b/src/stream/src/executor/actor.rs @@ -37,9 +37,11 @@ pub struct ActorContext { pub id: ActorId, pub fragment_id: u32, + // TODO(eric): these seem to be useless now? last_mem_val: Arc, cur_mem_val: Arc, total_mem_val: Arc>, + pub streaming_metrics: Arc, pub error_suppressor: Arc>, } @@ -78,7 +80,8 @@ impl ActorContext { } pub fn on_compute_error(&self, err: ExprError, identity: &str) { - tracing::error!("Compute error: {}, executor: {identity}", err); + tracing::error!(identity, %err, "failed to evaluate expression"); + let executor_name = identity.split(' ').next().unwrap_or("name_not_found"); let mut err_str = err.to_string(); diff --git a/src/stream/src/executor/agg_common.rs b/src/stream/src/executor/agg_common.rs index d6d63b4d65b8a..fbaa80c3fbeb7 100644 --- a/src/stream/src/executor/agg_common.rs +++ b/src/stream/src/executor/agg_common.rs @@ -15,7 +15,7 @@ use std::collections::HashMap; use std::sync::Arc; -use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_storage::StateStore; use super::aggregation::AggStateStorage; @@ -57,6 +57,7 @@ impl AggExecutorExtraArgs for SimpleAggExecutorExtraArgs {} pub struct HashAggExecutorExtraArgs { pub group_key_indices: Vec, pub chunk_size: usize, + pub max_dirty_groups_heap_size: usize, pub emit_on_window_close: bool, } impl AggExecutorExtraArgs for HashAggExecutorExtraArgs {} diff --git a/src/stream/src/executor/aggregation/agg_group.rs b/src/stream/src/executor/aggregation/agg_group.rs index afc395a7ab128..d854969120919 100644 --- a/src/stream/src/executor/aggregation/agg_group.rs +++ b/src/stream/src/executor/aggregation/agg_group.rs @@ -24,7 +24,7 @@ use risingwave_common::estimate_size::EstimateSize; use risingwave_common::must_match; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_expr::agg::{AggCall, BoxedAggregateFunction}; +use risingwave_expr::aggregate::{AggCall, BoxedAggregateFunction}; use risingwave_storage::StateStore; use super::agg_state::{AggState, AggStateStorage}; diff --git a/src/stream/src/executor/aggregation/agg_state.rs 
b/src/stream/src/executor/aggregation/agg_state.rs index 36690b872da58..0c1932c58831c 100644 --- a/src/stream/src/executor/aggregation/agg_state.rs +++ b/src/stream/src/executor/aggregation/agg_state.rs @@ -18,7 +18,7 @@ use risingwave_common::catalog::Schema; use risingwave_common::estimate_size::EstimateSize; use risingwave_common::must_match; use risingwave_common::types::Datum; -use risingwave_expr::agg::{AggCall, AggregateState, BoxedAggregateFunction}; +use risingwave_expr::aggregate::{AggCall, AggregateState, BoxedAggregateFunction}; use risingwave_storage::StateStore; use super::minput::MaterializedInputState; diff --git a/src/stream/src/executor/aggregation/distinct.rs b/src/stream/src/executor/aggregation/distinct.rs index bcc12d065169e..9e1d8d66da848 100644 --- a/src/stream/src/executor/aggregation/distinct.rs +++ b/src/stream/src/executor/aggregation/distinct.rs @@ -69,6 +69,7 @@ impl ColumnDeduplicater { .map(|_| BitmapBuilder::zeroed(column.len())) .collect_vec(); let actor_id_str = ctx.id.to_string(); + let fragment_id_str = ctx.fragment_id.to_string(); let table_id_str = dedup_table.table_id().to_string(); for (datum_idx, (op, datum)) in ops.iter().zip_eq_fast(column.iter()).enumerate() { // skip if this item is hidden to all agg calls (this is likely to happen) @@ -85,7 +86,7 @@ impl ColumnDeduplicater { self.metrics_info .metrics .agg_distinct_total_cache_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); // TODO(yuhao): avoid this `contains`. // https://github.com/risingwavelabs/risingwave/issues/9233 @@ -95,7 +96,7 @@ impl ColumnDeduplicater { self.metrics_info .metrics .agg_distinct_cache_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); // load from table into the cache let counts = if let Some(counts_row) = @@ -190,11 +191,12 @@ impl ColumnDeduplicater { // WARN: if you want to change to batching the write to table. please remember to change // `self.cache.evict()` too. let actor_id_str = ctx.id.to_string(); + let fragment_id_str = ctx.fragment_id.to_string(); let table_id_str = dedup_table.table_id().to_string(); self.metrics_info .metrics .agg_distinct_cached_entry_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.cache.len() as i64); self.cache.evict(); } diff --git a/src/stream/src/executor/aggregation/minput.rs b/src/stream/src/executor/aggregation/minput.rs index 0b50875adf847..1329f08eb6d99 100644 --- a/src/stream/src/executor/aggregation/minput.rs +++ b/src/stream/src/executor/aggregation/minput.rs @@ -12,17 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
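// Editor's note (not part of the patch): the distinct-dedup hunks above add a `fragment_id`
// label next to `table_id` and `actor_id`, so every cache-metric update now passes three
// label values (sketch mirroring the change; `metrics_info` is the executor field):
//
//     let actor_id_str = ctx.id.to_string();
//     let fragment_id_str = ctx.fragment_id.to_string();
//     let table_id_str = dedup_table.table_id().to_string();
//     metrics_info
//         .metrics
//         .agg_distinct_cache_miss_count
//         .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str])
//         .inc();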
+use std::ops::Bound::{self}; + use futures::{pin_mut, StreamExt}; use futures_async_stream::for_await; use itertools::Itertools; use risingwave_common::array::StreamChunk; use risingwave_common::catalog::Schema; use risingwave_common::estimate_size::EstimateSize; -use risingwave_common::row::RowExt; +use risingwave_common::row::{OwnedRow, RowExt}; use risingwave_common::types::Datum; use risingwave_common::util::row_serde::OrderedRowSerde; use risingwave_common::util::sort_util::OrderType; -use risingwave_expr::agg::{AggCall, AggKind, BoxedAggregateFunction}; +use risingwave_expr::aggregate::{AggCall, AggKind, BoxedAggregateFunction}; use risingwave_storage::store::PrefetchOptions; use risingwave_storage::StateStore; @@ -137,7 +139,10 @@ impl MaterializedInputState { agg_call.args.arg_types(), )) } - AggKind::StringAgg | AggKind::ArrayAgg => Box::new(GenericAggStateCache::new( + AggKind::StringAgg + | AggKind::ArrayAgg + | AggKind::JsonbAgg + | AggKind::JsonbObjectAgg => Box::new(GenericAggStateCache::new( OrderedStateCache::new(), agg_call.args.arg_types(), )), @@ -182,10 +187,12 @@ impl MaterializedInputState { ) -> StreamExecutorResult { if !self.cache.is_synced() { let mut cache_filler = self.cache.begin_syncing(); - + let sub_range: &(Bound, Bound) = + &(Bound::Unbounded, Bound::Unbounded); let all_data_iter = state_table - .iter_row_with_pk_prefix( + .iter_with_prefix( group_key.map(GroupKey::table_pk), + sub_range, PrefetchOptions { exhaust_iter: cache_filler.capacity().is_none(), }, @@ -247,7 +254,7 @@ mod tests { use risingwave_common::types::{DataType, ScalarImpl}; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::sort_util::OrderType; - use risingwave_expr::agg::{build_append_only, AggCall}; + use risingwave_expr::aggregate::{build_append_only, AggCall}; use risingwave_storage::memory::MemoryStateStore; use risingwave_storage::StateStore; diff --git a/src/stream/src/executor/aggregation/mod.rs b/src/stream/src/executor/aggregation/mod.rs index fb1884ed8a972..dd0ce9d01c544 100644 --- a/src/stream/src/executor/aggregation/mod.rs +++ b/src/stream/src/executor/aggregation/mod.rs @@ -20,10 +20,9 @@ use risingwave_common::array::DataChunk; use risingwave_common::bail; use risingwave_common::buffer::Bitmap; use risingwave_common::catalog::{Field, Schema}; -use risingwave_expr::agg::{AggCall, AggKind}; +use risingwave_expr::aggregate::{AggCall, AggKind}; use risingwave_storage::StateStore; -use super::ActorContextRef; use crate::common::table::state_table::StateTable; use crate::executor::error::StreamExecutorResult; use crate::executor::Executor; @@ -60,8 +59,6 @@ pub fn generate_agg_schema( } pub async fn agg_call_filter_res( - ctx: &ActorContextRef, - identity: &str, agg_call: &AggCall, chunk: &DataChunk, ) -> StreamExecutorResult { @@ -77,11 +74,7 @@ pub async fn agg_call_filter_res( } if let Some(ref filter) = agg_call.filter { - if let Bool(filter_res) = filter - .eval_infallible(chunk, |err| ctx.on_compute_error(err, identity)) - .await - .as_ref() - { + if let Bool(filter_res) = filter.eval_infallible(chunk).await.as_ref() { vis &= filter_res.to_bitmap(); } else { bail!("Filter can only receive bool array"); diff --git a/src/stream/src/executor/backfill/arrangement_backfill.rs b/src/stream/src/executor/backfill/arrangement_backfill.rs index d33aed6d6c441..ae5e8696de6c3 100644 --- a/src/stream/src/executor/backfill/arrangement_backfill.rs +++ b/src/stream/src/executor/backfill/arrangement_backfill.rs @@ -473,7 +473,7 @@ where &mut temporary_state, 
).await?; - self.progress.finish(barrier.epoch.curr); + self.progress.finish(barrier.epoch.curr, total_snapshot_processed_rows); yield msg; break; } @@ -548,7 +548,7 @@ where let range_bounds = range_bounds.unwrap(); let vnode_row_iter = upstream_table - .iter_row_with_pk_range(&range_bounds, vnode, Default::default()) + .iter_with_vnode(vnode, &range_bounds, Default::default()) .await?; // TODO: Is there some way to avoid double-pin here? diff --git a/src/stream/src/executor/backfill/cdc_backfill.rs b/src/stream/src/executor/backfill/cdc_backfill.rs index 2f522ae8eeb0c..c17aad1d2d62d 100644 --- a/src/stream/src/executor/backfill/cdc_backfill.rs +++ b/src/stream/src/executor/backfill/cdc_backfill.rs @@ -48,7 +48,7 @@ use crate::executor::{ }; use crate::task::{ActorId, CreateMviewProgress}; -const BACKFILL_STATE_KEY_SUFFIX: &str = "_backfill"; +pub const BACKFILL_STATE_KEY_SUFFIX: &str = "_backfill"; pub struct CdcBackfillExecutor { actor_ctx: ActorContextRef, @@ -227,7 +227,9 @@ impl CdcBackfillExecutor { #[allow(unused_variables)] let mut total_snapshot_processed_rows: u64 = 0; - let mut last_binlog_offset: Option; + // Read the current binlog offset as a low watermark + let mut last_binlog_offset: Option = + upstream_table_reader.current_binlog_offset().await?; let mut consumed_binlog_offset: Option = None; @@ -251,7 +253,6 @@ impl CdcBackfillExecutor { // // Once the backfill loop ends, we forward the upstream directly to the downstream. if to_backfill { - last_binlog_offset = upstream_table_reader.current_binlog_offset().await?; // drive the upstream changelog first to ensure we can receive timely changelog event, // otherwise the upstream changelog may be blocked by the snapshot read stream let _ = Pin::new(&mut upstream).peek().await; @@ -348,6 +349,11 @@ impl CdcBackfillExecutor { break; } Message::Chunk(chunk) => { + // skip empty upstream chunk + if chunk.cardinality() == 0 { + continue; + } + let chunk_binlog_offset = get_cdc_chunk_last_offset( upstream_table_reader.inner().table_reader(), &chunk, @@ -441,18 +447,23 @@ impl CdcBackfillExecutor { } } } - } else { + } else if is_snapshot_empty { + tracing::info!( + upstream_table_id, + initial_binlog_offset = ?last_binlog_offset, + "upstream snapshot is empty, mark backfill is done and persist current binlog offset"); + Self::write_backfill_state( &mut self.source_state_handler, upstream_table_id, &split_id, &mut cdc_split, - None, + last_binlog_offset, ) .await?; } - tracing::debug!( + tracing::info!( actor = self.actor_id, "CdcBackfill has already finished and forward messages directly to the downstream" ); @@ -485,6 +496,11 @@ impl CdcBackfillExecutor { cdc_split: &mut Option, last_binlog_offset: Option, ) -> StreamExecutorResult<()> { + assert!( + last_binlog_offset.is_some(), + "last binlog offset cannot be None" + ); + if let Some(split_id) = split_id.as_ref() { let mut key = split_id.to_string(); key.push_str(BACKFILL_STATE_KEY_SUFFIX); @@ -517,6 +533,9 @@ impl CdcBackfillExecutor { "server".to_string() => server }, source_offset, + // upstream heartbeat event would not emit to the cdc backfill executor, + // since we don't parse heartbeat event in the source parser. 
+ is_heartbeat: false, } }); diff --git a/src/stream/src/executor/backfill/no_shuffle_backfill.rs b/src/stream/src/executor/backfill/no_shuffle_backfill.rs index f796ff0dcb690..97a9da0ff6a99 100644 --- a/src/stream/src/executor/backfill/no_shuffle_backfill.rs +++ b/src/stream/src/executor/backfill/no_shuffle_backfill.rs @@ -17,11 +17,12 @@ use std::sync::Arc; use either::Either; use futures::stream::select_with_strategy; -use futures::{pin_mut, stream, StreamExt, TryStreamExt}; +use futures::{pin_mut, stream, StreamExt}; use futures_async_stream::try_stream; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::catalog::Schema; -use risingwave_common::row::OwnedRow; +use risingwave_common::hash::VnodeBitmapExt; +use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::Datum; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::epoch::EpochPair; @@ -34,8 +35,8 @@ use risingwave_storage::StateStore; use crate::common::table::state_table::StateTable; use crate::executor::backfill::utils; use crate::executor::backfill::utils::{ - check_all_vnode_finished, compute_bounds, construct_initial_finished_state, get_new_pos, - get_row_count_state, iter_chunks, mapping_chunk, mapping_message, mark_chunk, owned_row_iter, + compute_bounds, construct_initial_finished_state, get_new_pos, iter_chunks, mapping_chunk, + mapping_message, mark_chunk, owned_row_iter, }; use crate::executor::monitor::StreamingMetrics; use crate::executor::{ @@ -44,6 +45,19 @@ use crate::executor::{ }; use crate::task::{ActorId, CreateMviewProgress}; +/// vnode, `is_finished`, `row_count`, all occupy 1 column each. +const METADATA_STATE_LEN: usize = 3; + +/// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | +/// We can decode that into `BackfillState` on recovery. +#[derive(Debug, Eq, PartialEq)] +pub struct BackfillState { + current_pos: Option, + old_state: Option>, + is_finished: bool, + row_count: u64, +} + /// An implementation of the [RFC: Use Backfill To Let Mv On Mv Stream Again](https://github.com/risingwavelabs/rfcs/pull/13). /// `BackfillExecutor` is used to create a materialized view on another materialized view. /// @@ -104,12 +118,13 @@ where pk_indices: PkIndices, metrics: Arc, chunk_size: usize, + executor_id: u64, ) -> Self { Self { info: ExecutorInfo { schema, pk_indices, - identity: "BackfillExecutor".to_owned(), + identity: format!("BackfillExecutor {:X}", executor_id), }, upstream_table, upstream, @@ -127,9 +142,7 @@ where // The primary key columns, in the output columns of the upstream_table scan. let pk_in_output_indices = self.upstream_table.pk_in_output_indices().unwrap(); - // schema: | vnode | pk ... | backfill_finished | row_count | - // +1 for vnode, +1 for backfill_finished, +1 for row_count. 
- let state_len = pk_in_output_indices.len() + 3; + let state_len = pk_in_output_indices.len() + METADATA_STATE_LEN; let pk_order = self.upstream_table.pk_serializer().get_order_types(); @@ -144,63 +157,20 @@ where state_table.init_epoch(first_barrier.epoch); } - let is_finished = if let Some(state_table) = self.state_table.as_ref() { - let is_finished = check_all_vnode_finished(state_table, state_len).await?; - if is_finished { - assert!(!first_barrier.is_newly_added(self.actor_id)); - } - is_finished - } else { - // Maintain backwards compatibility with no state table - !first_barrier.is_newly_added(self.actor_id) - }; + let BackfillState { + mut current_pos, + is_finished, + row_count, + mut old_state, + } = Self::recover_backfill_state(self.state_table.as_ref(), pk_in_output_indices.len()) + .await?; let mut builder = DataChunkBuilder::new(self.upstream_table.schema().data_types(), self.chunk_size); - // If the snapshot is empty, we don't need to backfill. - // We cannot complete progress now, as we want to persist - // finished state to state store first. - // As such we will wait for next barrier. - let is_snapshot_empty: bool = { - if is_finished { - // It is finished, so just assign a value to avoid accessing storage table again. - false - } else { - let snapshot_is_empty = { - let snapshot = Self::snapshot_read( - &self.upstream_table, - init_epoch, - None, - false, - &mut builder, - ); - pin_mut!(snapshot); - snapshot.try_next().await?.unwrap().is_none() - }; - let snapshot_buffer_is_empty = builder.is_empty(); - builder.clear(); - snapshot_is_empty && snapshot_buffer_is_empty - } - }; - - // | backfill_is_finished | snapshot_empty | need_to_backfill | - // | t | t/f | f | - // | f | t | f | - // | f | f | t | - let to_backfill = !is_finished && !is_snapshot_empty; - - // Current position of the upstream_table storage primary key. - // `None` means it starts from the beginning. - let mut current_pos: Option = None; - - // Use these to persist state. - // They contain the backfill position, - // as well as the progress. - // However, they do not contain the vnode key at index 0. - // That is filled in when we flush the state table. + // Use this buffer to construct state, + // which will then be persisted. let mut current_state: Vec = vec![None; state_len]; - let mut old_state: Option> = None; // The first barrier message should be propagated. yield Message::Barrier(first_barrier); @@ -219,13 +189,7 @@ where let mut snapshot_read_epoch = init_epoch; // Keep track of rows from the snapshot. - let mut total_snapshot_processed_rows: u64 = - if let Some(state_table) = self.state_table.as_ref() { - get_row_count_state(state_table, state_len).await? - } else { - // Maintain backwards compatibility with no state_table. - 0 - }; + let mut total_snapshot_processed_rows: u64 = row_count; // Backfill Algorithm: // @@ -250,12 +214,14 @@ where // finished. // // Once the backfill loop ends, we forward the upstream directly to the downstream. - if to_backfill { + if !is_finished { let mut upstream_chunk_buffer: Vec = vec![]; let mut pending_barrier: Option = None; 'backfill_loop: loop { let mut cur_barrier_snapshot_processed_rows: u64 = 0; let mut cur_barrier_upstream_processed_rows: u64 = 0; + let mut snapshot_read_complete = false; + let mut has_snapshot_read = false; // We should not buffer rows from previous epoch, else we can have duplicates. 
assert!(upstream_chunk_buffer.is_empty()); @@ -274,13 +240,13 @@ where // Prefer to select upstream, so we can stop snapshot stream as soon as the // barrier comes. - let backfill_stream = + let mut backfill_stream = select_with_strategy(left_upstream, right_snapshot, |_: &mut ()| { stream::PollNext::Left }); #[for_await] - for either in backfill_stream { + for either in &mut backfill_stream { match either { // Upstream Either::Left(msg) => { @@ -307,6 +273,7 @@ where } // Snapshot read Either::Right(msg) => { + has_snapshot_read = true; match msg? { None => { // End of the snapshot read stream. @@ -346,6 +313,43 @@ where } } } + + // Before processing barrier, if did not snapshot read, + // do a snapshot read first. + // This is so we don't lose the tombstone iteration progress. + if !has_snapshot_read { + let (_, snapshot) = backfill_stream.into_inner(); + #[for_await] + for msg in snapshot { + let Either::Right(msg) = msg else { + bail!("BUG: snapshot_read contains upstream messages"); + }; + match msg? { + None => { + // End of the snapshot read stream. + // We let the barrier handling logic take care of upstream updates. + // But we still want to exit backfill loop, so we mark snapshot read complete. + snapshot_read_complete = true; + break; + } + Some(chunk) => { + // Raise the current position. + // As snapshot read streams are ordered by pk, so we can + // just use the last row to update `current_pos`. + current_pos = Some(get_new_pos(&chunk, &pk_in_output_indices)); + + let chunk_cardinality = chunk.cardinality() as u64; + cur_barrier_snapshot_processed_rows += chunk_cardinality; + total_snapshot_processed_rows += chunk_cardinality; + yield Message::Chunk(mapping_chunk( + chunk, + &self.output_indices, + )); + break; + } + } + } + } } // When we break out of inner backfill_stream loop, it means we have a barrier. // If there are no updates and there are no snapshots left, @@ -414,13 +418,6 @@ where total_snapshot_processed_rows, ); - tracing::trace!( - actor = self.actor_id, - epoch = ?barrier.epoch, - ?current_pos, - total_snapshot_processed_rows, - "Backfill position persisted" - ); // Persist state on barrier Self::persist_state( barrier.epoch, @@ -433,57 +430,77 @@ where ) .await?; + tracing::trace!( + epoch = ?barrier.epoch, + ?current_pos, + total_snapshot_processed_rows, + "Backfill state persisted" + ); + yield Message::Barrier(barrier); + if snapshot_read_complete { + break 'backfill_loop; + } + // We will switch snapshot at the start of the next iteration of the backfill loop. } } - tracing::trace!( - actor = self.actor_id, - "Backfill has already finished and forward messages directly to the downstream" - ); + tracing::trace!("Backfill has finished, waiting for barrier"); // Wait for first barrier to come after backfill is finished. // So we can update our progress + persist the status. while let Some(Ok(msg)) = upstream.next().await { if let Some(msg) = mapping_message(msg, &self.output_indices) { // If not finished then we need to update state, otherwise no need. - if let Message::Barrier(barrier) = &msg && !is_finished { - // If snapshot was empty, we do not need to backfill, - // but we still need to persist the finished state. - // We currently persist it on the second barrier here rather than first. - // This is because we can't update state table in first epoch, - // since it expects to have been initialized in previous epoch - // (there's no epoch before the first epoch). 
- if is_snapshot_empty { - current_pos = - Some(construct_initial_finished_state(pk_in_output_indices.len())) - } + if let Message::Barrier(barrier) = &msg { + if is_finished { + // If already finished, no need persist any state. + } else { + // If snapshot was empty, we do not need to backfill, + // but we still need to persist the finished state. + // We currently persist it on the second barrier here rather than first. + // This is because we can't update state table in first epoch, + // since it expects to have been initialized in previous epoch + // (there's no epoch before the first epoch). + if current_pos.is_none() { + current_pos = + Some(construct_initial_finished_state(pk_in_output_indices.len())) + } - // We will update current_pos at least once, - // since snapshot read has to be non-empty, - // Or snapshot was empty and we construct a placeholder state. - debug_assert_ne!(current_pos, None); + // We will update current_pos at least once, + // since snapshot read has to be non-empty, + // Or snapshot was empty and we construct a placeholder state. + debug_assert_ne!(current_pos, None); + + Self::persist_state( + barrier.epoch, + &mut self.state_table, + true, + ¤t_pos, + total_snapshot_processed_rows, + &mut old_state, + &mut current_state, + ) + .await?; + tracing::trace!( + epoch = ?barrier.epoch, + ?current_pos, + total_snapshot_processed_rows, + "Backfill position persisted after completion" + ); + } + // For both backfill finished before recovery, + // and backfill which just finished, we need to update mview tracker, + // it does not persist this information. + self.progress + .finish(barrier.epoch.curr, total_snapshot_processed_rows); tracing::trace!( - actor = self.actor_id, epoch = ?barrier.epoch, - ?current_pos, - total_snapshot_processed_rows, - "Backfill position persisted after completion" + "Updated CreateMaterializedTracker" ); - Self::persist_state( - barrier.epoch, - &mut self.state_table, - true, - ¤t_pos, - total_snapshot_processed_rows, - &mut old_state, - &mut current_state, - ) - .await?; - self.progress.finish(barrier.epoch.curr); yield msg; break; } @@ -491,6 +508,10 @@ where } } + tracing::trace!( + "Backfill has already finished and forward messages directly to the downstream" + ); + // After progress finished + state persisted, // we can forward messages directly to the downstream, // as backfill is finished. @@ -499,14 +520,68 @@ where #[for_await] for msg in upstream { if let Some(msg) = mapping_message(msg?, &self.output_indices) { - if let Some(state_table) = self.state_table.as_mut() && let Message::Barrier(barrier) = &msg { - state_table.commit_no_data_expected(barrier.epoch); - } yield msg; } } } + async fn recover_backfill_state( + state_table: Option<&StateTable>, + pk_len: usize, + ) -> StreamExecutorResult { + let Some(state_table) = state_table else { + // If no state table, but backfill is present, it must be from an old cluster. + // In that case backfill must be finished, otherwise it won't have been persisted. + return Ok(BackfillState { + current_pos: None, + is_finished: true, + row_count: 0, + old_state: None, + }); + }; + let mut vnodes = state_table.vnodes().iter_vnodes_scalar(); + let first_vnode = vnodes.next().unwrap(); + let key: &[Datum] = &[Some(first_vnode.into())]; + let row = state_table.get_row(key).await?; + let expected_state = Self::deserialize_backfill_state(row, pk_len); + + // All vnode partitions should have same state (no scale-in supported). 
+ for vnode in vnodes { + let key: &[Datum] = &[Some(vnode.into())]; + let row = state_table.get_row(key).await?; + let state = Self::deserialize_backfill_state(row, pk_len); + assert_eq!(state.is_finished, expected_state.is_finished); + } + Ok(expected_state) + } + + fn deserialize_backfill_state(row: Option, pk_len: usize) -> BackfillState { + let Some(row) = row else { + return BackfillState { + current_pos: None, + is_finished: false, + row_count: 0, + old_state: None, + }; + }; + let row = row.into_inner(); + let mut old_state = vec![None; pk_len + METADATA_STATE_LEN]; + old_state[1..row.len() + 1].clone_from_slice(&row); + let current_pos = Some((&row[0..pk_len]).into_owned_row()); + let is_finished = row[pk_len].clone().map_or(false, |d| d.into_bool()); + let row_count = row + .get(pk_len + 1) + .cloned() + .unwrap_or(None) + .map_or(0, |d| d.into_int64() as u64); + BackfillState { + current_pos, + is_finished, + row_count, + old_state: Some(old_state), + } + } + /// Snapshot read the upstream mv. /// The rows from upstream snapshot read will be buffered inside the `builder`. /// If snapshot is dropped before its rows are consumed, diff --git a/src/stream/src/executor/backfill/utils.rs b/src/stream/src/executor/backfill/utils.rs index 8a2cded5ca8d3..259b67d5f202b 100644 --- a/src/stream/src/executor/backfill/utils.rs +++ b/src/stream/src/executor/backfill/utils.rs @@ -308,79 +308,6 @@ pub(crate) async fn get_progress_per_vnode Option { - let datum = if row.len() == state_len - 2 { - // Handle backwards compatibility case where - // we did not have row count (-1 for this). - // -1 to exclude `vnode` as well. - row.last() - } else { - row.datum_at(row.len() - 2) - }; - datum.map(|d| d.into_bool()) -} - -/// The row here does not include `vnode`, -/// it should have been excluded by setting `value_indices`. -/// Row schema: | `pk_indices` ... | `backfill_finished` | `row_count` -pub(crate) fn get_row_count(row: OwnedRow, state_len: usize) -> u64 { - if row.len() == state_len - 2 { - // Handle backwards compatibility case where - // we did not have row count (-1 for this). - // -1 to exclude `vnode` as well. - return 0; - } - match row.last() { - None => 0, - Some(d) => d.into_int64() as u64, - } -} - -pub(crate) async fn get_row_count_state( - state_table: &StateTableInner, - state_len: usize, -) -> StreamExecutorResult { - let mut vnodes = state_table.vnodes().iter_vnodes_scalar(); - let vnode = vnodes.next().unwrap(); - let key: &[Datum] = &[Some(vnode.into())]; - let row = state_table.get_row(key).await?; - let row_count = match row { - None => 0, - Some(row) => get_row_count(row, state_len), - }; - Ok(row_count) -} - -/// All vnodes should be persisted with status finished. -pub(crate) async fn check_all_vnode_finished( - state_table: &StateTableInner, - state_len: usize, -) -> StreamExecutorResult { - debug_assert!(!state_table.vnode_bitmap().is_empty()); - let vnodes = state_table.vnodes().iter_vnodes_scalar(); - let mut is_finished = true; - for vnode in vnodes { - let key: &[Datum] = &[Some(vnode.into())]; - let row = state_table.get_row(key).await?; - - let vnode_is_finished = if let Some(row) = row - && let Some(vnode_is_finished) = get_backfill_finished(row, state_len) - { - vnode_is_finished - } else { - false - }; - if !vnode_is_finished { - is_finished = false; - break; - } - } - Ok(is_finished) -} - /// Flush the data // This is a clippy bug, see https://github.com/rust-lang/rust-clippy/issues/11380. // TODO: remove `allow` here after the issued is closed. 
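// --- Illustrative sketch (editorial, not part of the patch) ---------------------------------
// The persisted per-vnode backfill state row is laid out as
// | vnode | pk ... | backfill_finished | row_count |, i.e. pk_len plus the three metadata
// columns (`METADATA_STATE_LEN`). The decoder below mirrors `deserialize_backfill_state`
// above, with the scalar types stubbed out so it is self-contained; the real code operates on
// `Datum`/`OwnedRow` from `risingwave_common`.
#[derive(Clone, Debug)]
enum Scalar {
    Bool(bool),
    Int64(i64),
}
type Datum = Option<Scalar>;

const METADATA_STATE_LEN: usize = 3; // vnode + backfill_finished + row_count

struct DecodedBackfillState {
    current_pos: Option<Vec<Datum>>,
    is_finished: bool,
    row_count: u64,
}

fn decode_backfill_state(row: Option<Vec<Datum>>, pk_len: usize) -> DecodedBackfillState {
    let Some(row) = row else {
        // No state row persisted yet: backfill has not finished and no rows were counted.
        return DecodedBackfillState { current_pos: None, is_finished: false, row_count: 0 };
    };
    // `row` excludes the vnode column (it is the state-table key), so the pk starts at index 0.
    let current_pos = Some(row[..pk_len].to_vec());
    let is_finished = matches!(row[pk_len], Some(Scalar::Bool(true)));
    let row_count = match row.get(pk_len + 1) {
        Some(Some(Scalar::Int64(n))) => *n as u64,
        // Older state rows may predate the row-count column.
        _ => 0,
    };
    DecodedBackfillState { current_pos, is_finished, row_count }
}
// --------------------------------------------------------------------------------------------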
diff --git a/src/stream/src/executor/chain.rs b/src/stream/src/executor/chain.rs index ab3ef9ae44973..a51c9e95abbb1 100644 --- a/src/stream/src/executor/chain.rs +++ b/src/stream/src/executor/chain.rs @@ -79,7 +79,7 @@ impl ChainExecutor { // If the barrier is a conf change of creating this mview, and the snapshot is not to be // consumed, we can finish the progress immediately. if barrier.is_newly_added(self.actor_id) && self.upstream_only { - self.progress.finish(barrier.epoch.curr); + self.progress.finish(barrier.epoch.curr, 0); } // The first barrier message should be propagated. @@ -103,7 +103,7 @@ impl ChainExecutor { for msg in upstream { let msg = msg?; if to_consume_snapshot && let Message::Barrier(barrier) = &msg { - self.progress.finish(barrier.epoch.curr); + self.progress.finish(barrier.epoch.curr, 0); } yield msg; } diff --git a/src/stream/src/executor/dispatch.rs b/src/stream/src/executor/dispatch.rs index 9c0c931aaa022..414721c34efbf 100644 --- a/src/stream/src/executor/dispatch.rs +++ b/src/stream/src/executor/dispatch.rs @@ -651,7 +651,7 @@ impl Dispatcher for HashDataDispatcher { // get hash value of every line by its key let vnodes = VirtualNode::compute_chunk(chunk.data_chunk(), &self.keys); - tracing::trace!(target: "events::stream::dispatch::hash", "\n{}\n keys {:?} => {:?}", chunk.to_pretty(), self.keys, vnodes); + tracing::debug!(target: "events::stream::dispatch::hash", "\n{}\n keys {:?} => {:?}", chunk.to_pretty(), self.keys, vnodes); let mut vis_maps = repeat_with(|| BitmapBuilder::with_capacity(chunk.capacity())) .take(num_outputs) diff --git a/src/stream/src/executor/dynamic_filter.rs b/src/stream/src/executor/dynamic_filter.rs index 6d95b52d9e548..e8eb4da545f2e 100644 --- a/src/stream/src/executor/dynamic_filter.rs +++ b/src/stream/src/executor/dynamic_filter.rs @@ -22,10 +22,12 @@ use risingwave_common::bail; use risingwave_common::buffer::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::Schema; use risingwave_common::hash::VnodeBitmapExt; -use risingwave_common::row::{once, OwnedRow as RowData, Row}; +use risingwave_common::row::{self, once, OwnedRow, OwnedRow as RowData, Row}; use risingwave_common::types::{DataType, Datum, DefaultOrd, ScalarImpl, ToDatumRef, ToOwnedDatum}; use risingwave_common::util::iter_util::ZipEqDebug; -use risingwave_expr::expr::{build_func, BoxedExpression, InputRefExpression, LiteralExpression}; +use risingwave_expr::expr::{ + build_func_non_strict, BoxedExpression, InputRefExpression, LiteralExpression, +}; use risingwave_pb::expr::expr_node::Type as ExprNodeType; use risingwave_pb::expr::expr_node::Type::{ GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, @@ -42,6 +44,7 @@ use super::{ use crate::common::table::state_table::{StateTable, WatermarkCacheParameterizedStateTable}; use crate::common::StreamChunkBuilder; use crate::executor::expect_first_barrier_from_aligned_stream; +use crate::task::ActorEvalErrorReport; pub struct DynamicFilterExecutor { ctx: ActorContextRef, @@ -101,10 +104,7 @@ impl DynamicFilterExecutor DynamicFilterExecutor Result, StreamExecutorError> { // Recover value for RHS if available - let rhs_stream = self.right_table.iter_row(Default::default()).await?; + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); + let rhs_stream = self + .right_table + .iter_with_prefix(row::empty(), sub_range, Default::default()) + .await?; pin_mut!(rhs_stream); if let Some(res) = rhs_stream.next().await { @@ -258,17 +262,24 @@ impl DynamicFilterExecutor DynamicFilterExecutor = std::result::Result; @@ 
-54,9 +53,6 @@ enum ErrorKind { StorageError, ), - #[error("Log store error: {0}")] - LogStoreError(#[source] LogStoreError), - #[error("Chunk operation error: {0}")] ArrayError(#[source] ArrayError), @@ -154,13 +150,6 @@ impl From for StreamExecutorError { } } -/// Log store error -impl From for StreamExecutorError { - fn from(e: LogStoreError) -> Self { - ErrorKind::LogStoreError(e).into() - } -} - /// Chunk operation error. impl From for StreamExecutorError { fn from(e: ArrayError) -> Self { diff --git a/src/stream/src/executor/exchange/input.rs b/src/stream/src/executor/exchange/input.rs index 576542ecfcecd..3804904b7c4f2 100644 --- a/src/stream/src/executor/exchange/input.rs +++ b/src/stream/src/executor/exchange/input.rs @@ -14,7 +14,6 @@ use std::pin::Pin; use std::task::{Context, Poll}; -use std::time::Instant; use anyhow::Context as _; use futures::{pin_mut, Stream}; @@ -149,12 +148,9 @@ impl RemoteInput { .await?; let up_actor_id = up_down_ids.0.to_string(); - let down_actor_id = up_down_ids.1.to_string(); let up_fragment_id = up_down_frag.0.to_string(); let down_fragment_id = up_down_frag.1.to_string(); - let mut rr = 0; - const SAMPLING_FREQUENCY: u64 = 100; let span: await_tree::Span = format!("RemoteInput (actor {up_actor_id})").into(); let mut batched_permits_accumulated = 0; @@ -171,20 +167,7 @@ impl RemoteInput { .with_label_values(&[&up_fragment_id, &down_fragment_id]) .inc_by(bytes as u64); - // add deserialization duration metric with given sampling frequency - let msg_res = if rr % SAMPLING_FREQUENCY == 0 { - let start_time = Instant::now(); - let msg_res = Message::from_protobuf(&msg); - metrics - .actor_sampled_deserialize_duration_ns - .with_label_values(&[&down_actor_id]) - .inc_by(start_time.elapsed().as_nanos() as u64); - msg_res - } else { - Message::from_protobuf(&msg) - }; - rr += 1; - + let msg_res = Message::from_protobuf(&msg); if let Some(add_back_permits) = match permits.unwrap().value { // For records, batch the permits we received to reduce the backward // `AddPermits` messages. diff --git a/src/stream/src/executor/filter.rs b/src/stream/src/executor/filter.rs index ccdf9729c48d3..ef593f8734284 100644 --- a/src/stream/src/executor/filter.rs +++ b/src/stream/src/executor/filter.rs @@ -28,7 +28,7 @@ use super::*; /// `FilterExecutor` will insert, delete or update element into next executor according /// to the result of the expression. pub struct FilterExecutor { - ctx: ActorContextRef, + _ctx: ActorContextRef, info: ExecutorInfo, input: BoxedExecutor, @@ -46,7 +46,7 @@ impl FilterExecutor { ) -> Self { let input_info = input.info(); Self { - ctx, + _ctx: ctx, input, info: ExecutorInfo { schema: input_info.schema, @@ -170,12 +170,7 @@ impl FilterExecutor { Message::Chunk(chunk) => { let chunk = chunk.compact(); - let pred_output = self - .expr - .eval_infallible(chunk.data_chunk(), |err| { - self.ctx.on_compute_error(err, &self.info.identity) - }) - .await; + let pred_output = self.expr.eval_infallible(chunk.data_chunk()).await; match Self::filter(chunk, pred_output)? { Some(new_chunk) => yield Message::Chunk(new_chunk), diff --git a/src/stream/src/executor/hash_agg.rs b/src/stream/src/executor/hash_agg.rs index a2814195ac9c6..cb62e8d8f94aa 100644 --- a/src/stream/src/executor/hash_agg.rs +++ b/src/stream/src/executor/hash_agg.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
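// --- Illustrative toy (editorial, not the real risingwave_expr API) -------------------------
// The filter/dynamic-filter hunks above drop the per-call `on_compute_error` closure from
// `eval_infallible`. The error reporter is instead attached once when the expression is built
// (see `build_func_non_strict` and `ActorEvalErrorReport` above), roughly like this stand-in:
use std::sync::Arc;

type EvalErrorReport = Arc<dyn Fn(&str) + Send + Sync>;

struct NonStrictExpr {
    report: EvalErrorReport, // injected at build time, not at every call site
}

impl NonStrictExpr {
    fn eval_infallible(&self, input: i64) -> i64 {
        // On evaluation failure, report through the attached handler and yield a default,
        // so call sites no longer need to pass an error callback.
        if input < 0 {
            (self.report)("negative input");
            return 0;
        }
        input * 2
    }
}

fn demo() {
    let expr = NonStrictExpr {
        report: Arc::new(|msg| eprintln!("compute error: {msg}")),
    };
    assert_eq!(expr.eval_infallible(-1), 0);
}
// --------------------------------------------------------------------------------------------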
-use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::marker::PhantomData; use std::sync::Arc; @@ -22,12 +22,13 @@ use itertools::Itertools; use risingwave_common::array::StreamChunk; use risingwave_common::buffer::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::Schema; -use risingwave_common::estimate_size::{EstimateSize, KvSize}; +use risingwave_common::estimate_size::collections::hashmap::EstimatedHashMap; +use risingwave_common::estimate_size::EstimateSize; use risingwave_common::hash::{HashKey, PrecomputedBuildHasher}; use risingwave_common::types::ScalarImpl; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_expr::agg::{build_retractable, AggCall, BoxedAggregateFunction}; +use risingwave_expr::aggregate::{build_retractable, AggCall, BoxedAggregateFunction}; use risingwave_storage::StateStore; use super::agg_common::{AggExecutorArgs, HashAggExecutorExtraArgs}; @@ -52,7 +53,15 @@ use crate::executor::{BoxedMessageStream, Executor, Message}; use crate::task::AtomicU64Ref; type AggGroup = GenericAggGroup; -type AggGroupCache = ManagedLruCache, PrecomputedBuildHasher>; +type BoxedAggGroup = Box>; + +impl EstimateSize for BoxedAggGroup { + fn estimated_heap_size(&self) -> usize { + self.as_ref().estimated_size() + } +} + +type AggGroupCache = ManagedLruCache>, PrecomputedBuildHasher>; /// [`HashAggExecutor`] could process large amounts of data using a state backend. It works as /// follows: @@ -120,6 +129,9 @@ struct ExecutorInner { /// The maximum size of the chunk produced by executor at a time. chunk_size: usize, + /// The maximum heap size of dirty groups. If exceeds, the executor should flush dirty groups. + max_dirty_groups_heap_size: usize, + /// Should emit on window close according to watermark? emit_on_window_close: bool, @@ -140,11 +152,8 @@ struct ExecutionVars { /// Cache for [`AggGroup`]s. `HashKey` -> `AggGroup`. agg_group_cache: AggGroupCache, - /// Changed group keys in the current epoch (before next flush). - group_change_set: HashSet, - - /// Heap size of `group_change_set` and dirty [`AggGroup`]s in `agg_group_cache`. - dirty_groups_heap_size: KvSize, + /// Changed [`AggGroup`]s in the current epoch (before next flush). + dirty_groups: EstimatedHashMap>, /// Distinct deduplicater to deduplicate input rows for each distinct agg call. distinct_dedup: DistinctDeduplicater, @@ -243,6 +252,7 @@ impl HashAggExecutor { watermark_epoch: args.watermark_epoch, extreme_cache_size: args.extreme_cache_size, chunk_size: args.extra.chunk_size, + max_dirty_groups_heap_size: args.extra.max_dirty_groups_heap_size, emit_on_window_close: args.extra.emit_on_window_close, metrics: args.metrics, }, @@ -274,53 +284,70 @@ impl HashAggExecutor { .collect() } - async fn ensure_keys_in_cache( + /// Touch the [`AggGroup`]s for the given keys, which means move them from cache to the `dirty_groups` map. + /// If the [`AggGroup`] doesn't exist in the cache before, it will be created or recovered from state table. 
+ async fn touch_agg_groups( this: &ExecutorInner, - cache: &mut AggGroupCache, + vars: &mut ExecutionVars, keys: impl IntoIterator, - stats: &mut ExecutionStats, ) -> StreamExecutorResult<()> { let group_key_types = &this.info.schema.data_types()[..this.group_key_indices.len()]; let futs = keys .into_iter() .filter_map(|key| { - stats.total_lookup_count += 1; - if cache.contains(key) { - None - } else { - stats.lookup_miss_count += 1; - Some(async { - // Create `AggGroup` for the current group if not exists. This will - // restore agg states from the intermediate state table. - let agg_group = AggGroup::create( - Some(GroupKey::new( - key.deserialize(group_key_types)?, - Some(this.group_key_table_pk_projection.clone()), - )), - &this.agg_calls, - &this.agg_funcs, - &this.storages, - &this.intermediate_state_table, - &this.input_pk_indices, - this.row_count_index, - this.extreme_cache_size, - &this.input_schema, - ) - .await?; - Ok::<_, StreamExecutorError>((key.clone(), agg_group)) - }) + vars.stats.total_lookup_count += 1; + if vars.dirty_groups.contains_key(key) { + // already dirty + return None; + } + match vars.agg_group_cache.get_mut(key) { + Some(mut agg_group) => { + let agg_group: &mut Option<_> = &mut agg_group; + assert!( + agg_group.is_some(), + "invalid state: AggGroup is None in cache but not dirty" + ); + // move from cache to `dirty_groups` + vars.dirty_groups + .insert(key.clone(), agg_group.take().unwrap()); + None // no need to create + } + None => { + vars.stats.lookup_miss_count += 1; + Some(async { + // Create `AggGroup` for the current group if not exists. This will + // restore agg states from the intermediate state table. + let agg_group = AggGroup::create( + Some(GroupKey::new( + key.deserialize(group_key_types)?, + Some(this.group_key_table_pk_projection.clone()), + )), + &this.agg_calls, + &this.agg_funcs, + &this.storages, + &this.intermediate_state_table, + &this.input_pk_indices, + this.row_count_index, + this.extreme_cache_size, + &this.input_schema, + ) + .await?; + Ok::<_, StreamExecutorError>((key.clone(), Box::new(agg_group))) + }) + } } }) .collect_vec(); // collect is necessary to avoid lifetime issue of `agg_group_cache` - stats.chunk_total_lookup_count += 1; + vars.stats.chunk_total_lookup_count += 1; if !futs.is_empty() { // If not all the required states/keys are in the cache, it's a chunk-level cache miss. - stats.chunk_lookup_miss_count += 1; + vars.stats.chunk_lookup_miss_count += 1; let mut buffered = stream::iter(futs).buffer_unordered(10).fuse(); while let Some(result) = buffered.next().await { let (key, agg_group) = result?; - cache.put(key, agg_group); + let none = vars.dirty_groups.insert(key, agg_group); + debug_assert!(none.is_none()); } } Ok(()) @@ -335,20 +362,13 @@ impl HashAggExecutor { let keys = K::build(&this.group_key_indices, chunk.data_chunk())?; let group_visibilities = Self::get_group_visibilities(keys, chunk.visibility()); - // Create `AggGroup` for each group if not exists. - Self::ensure_keys_in_cache( - this, - &mut vars.agg_group_cache, - group_visibilities.iter().map(|(k, _)| k), - &mut vars.stats, - ) - .await?; + // Ensure all `AggGroup`s are in `dirty_groups`. + Self::touch_agg_groups(this, vars, group_visibilities.iter().map(|(k, _)| k)).await?; // Calculate the row visibility for every agg call. 
let mut call_visibilities = Vec::with_capacity(this.agg_calls.len()); for agg_call in &this.agg_calls { - let agg_call_filter_res = - agg_call_filter_res(&this.actor_ctx, &this.info.identity, agg_call, &chunk).await?; + let agg_call_filter_res = agg_call_filter_res(agg_call, &chunk).await?; call_visibilities.push(agg_call_filter_res); } @@ -366,15 +386,7 @@ impl HashAggExecutor { // Apply chunk to each of the state (per agg_call), for each group. for (key, visibility) in group_visibilities { - let mut agg_group = vars.agg_group_cache.get_mut(&key).unwrap(); - - // Mark the group as changed. - let key_size = key.estimated_size(); - let old_group_size = if vars.group_change_set.insert(key) { - None - } else { - Some(agg_group.estimated_size()) - }; + let agg_group: &mut BoxedAggGroup<_> = &mut vars.dirty_groups.get_mut(&key).unwrap(); let visibilities = call_visibilities .iter() @@ -403,84 +415,30 @@ impl HashAggExecutor { agg_group .apply_chunk(&chunk, &this.agg_calls, &this.agg_funcs, visibilities) .await?; - - // Update the metrics. - let actor_id_str = this.actor_ctx.id.to_string(); - let table_id_str = this.intermediate_state_table.table_id().to_string(); - let metric_dirty_count = this - .metrics - .agg_dirty_group_count - .with_label_values(&[&table_id_str, &actor_id_str]); - let metric_dirty_heap_size = this - .metrics - .agg_dirty_group_heap_size - .with_label_values(&[&table_id_str, &actor_id_str]); - let new_group_size = agg_group.estimated_size(); - if let Some(old_group_size) = old_group_size { - match new_group_size.cmp(&old_group_size) { - std::cmp::Ordering::Greater => { - let inc_size = new_group_size - old_group_size; - vars.dirty_groups_heap_size.add_size(inc_size); - metric_dirty_heap_size.add(inc_size as i64); - } - std::cmp::Ordering::Less => { - let dec_size = old_group_size - new_group_size; - vars.dirty_groups_heap_size.sub_size(dec_size); - metric_dirty_heap_size.sub(dec_size as i64); - } - std::cmp::Ordering::Equal => {} - } - } else { - let inc_size = key_size * 2 + new_group_size; - vars.dirty_groups_heap_size.add_size(inc_size); - metric_dirty_heap_size.add(inc_size as i64); - metric_dirty_count.set(vars.group_change_set.len() as i64); - } } - Ok(()) - } - - #[try_stream(ok = StreamChunk, error = StreamExecutorError)] - async fn flush_data<'a>( - this: &'a mut ExecutorInner, - vars: &'a mut ExecutionVars, - epoch: EpochPair, - ) { - // Update metrics. + // Update the metrics. 
let actor_id_str = this.actor_ctx.id.to_string(); + let fragment_id_str = this.actor_ctx.fragment_id.to_string(); let table_id_str = this.intermediate_state_table.table_id().to_string(); this.metrics - .agg_lookup_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) - .inc_by(vars.stats.lookup_miss_count); - vars.stats.lookup_miss_count = 0; + .agg_dirty_groups_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .set(vars.dirty_groups.len() as i64); this.metrics - .agg_total_lookup_count - .with_label_values(&[&table_id_str, &actor_id_str]) - .inc_by(vars.stats.total_lookup_count); - vars.stats.total_lookup_count = 0; - this.metrics - .agg_cached_keys - .with_label_values(&[&table_id_str, &actor_id_str]) - .set(vars.agg_group_cache.len() as i64); - this.metrics - .agg_chunk_lookup_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) - .inc_by(vars.stats.chunk_lookup_miss_count); - vars.stats.chunk_lookup_miss_count = 0; - this.metrics - .agg_chunk_total_lookup_count - .with_label_values(&[&table_id_str, &actor_id_str]) - .inc_by(vars.stats.chunk_total_lookup_count); - vars.stats.chunk_total_lookup_count = 0; + .agg_dirty_groups_heap_size + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .set(vars.dirty_groups.estimated_heap_size() as i64); + Ok(()) + } + + #[try_stream(ok = StreamChunk, error = StreamExecutorError)] + async fn flush_data<'a>(this: &'a mut ExecutorInner, vars: &'a mut ExecutionVars) { let window_watermark = vars.window_watermark.take(); - let n_dirty_group = vars.group_change_set.len(); // flush changed states into intermediate state table - for key in &vars.group_change_set { - let agg_group = vars.agg_group_cache.get_mut(key).unwrap(); + for agg_group in vars.dirty_groups.values() { let encoded_states = agg_group.encode_states(&this.agg_funcs)?; if this.emit_on_window_close { vars.buffer @@ -535,8 +493,8 @@ impl HashAggExecutor { } else { // emit on update // TODO(wrj,rc): we may need to parallelize it and set a reasonable concurrency limit. - for group_key in &vars.group_change_set { - let mut agg_group = vars.agg_group_cache.get_mut(group_key).unwrap(); + for mut agg_group in vars.dirty_groups.values_mut() { + let agg_group = agg_group.as_mut(); let change = agg_group .build_change(&this.storages, &this.agg_funcs) .await?; @@ -548,40 +506,20 @@ impl HashAggExecutor { } } - // clear the change set - vars.group_change_set.clear(); - vars.dirty_groups_heap_size.set(0); - this.metrics - .agg_dirty_group_count - .with_label_values(&[&table_id_str, &actor_id_str]) - .set(0); - this.metrics - .agg_dirty_group_heap_size - .with_label_values(&[&table_id_str, &actor_id_str]) - .set(0); + // move dirty groups back to cache + for (key, agg_group) in vars.dirty_groups.drain() { + vars.agg_group_cache.put(key, Some(agg_group)); + } // Yield the remaining rows in chunk builder. if let Some(chunk) = vars.chunk_builder.take() { yield chunk; } - if n_dirty_group == 0 && window_watermark.is_none() { - // Nothing is expected to be changed. - this.all_state_tables_mut().for_each(|table| { - table.commit_no_data_expected(epoch); - }); - } else { - if let Some(watermark) = window_watermark { - // Update watermark of state tables, for state cleaning. - this.all_state_tables_mut() - .for_each(|table| table.update_watermark(watermark.clone(), false)); - } - // Commit all state tables. 
- futures::future::try_join_all( - this.all_state_tables_mut() - .map(|table| async { table.commit(epoch).await }), - ) - .await?; + if let Some(watermark) = window_watermark { + // Update watermark of state tables, for state cleaning. + this.all_state_tables_mut() + .for_each(|table| table.update_watermark(watermark.clone(), false)); } // Flush distinct dedup state. @@ -592,6 +530,44 @@ impl HashAggExecutor { vars.agg_group_cache.evict(); } + fn update_metrics(this: &ExecutorInner, vars: &mut ExecutionVars) { + let actor_id_str = this.actor_ctx.id.to_string(); + let fragment_id_str = this.actor_ctx.fragment_id.to_string(); + let table_id_str = this.intermediate_state_table.table_id().to_string(); + this.metrics + .agg_lookup_miss_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .inc_by(std::mem::take(&mut vars.stats.lookup_miss_count)); + this.metrics + .agg_total_lookup_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .inc_by(std::mem::take(&mut vars.stats.total_lookup_count)); + this.metrics + .agg_cached_entry_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .set(vars.agg_group_cache.len() as i64); + this.metrics + .agg_chunk_lookup_miss_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .inc_by(std::mem::take(&mut vars.stats.chunk_lookup_miss_count)); + this.metrics + .agg_chunk_total_lookup_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .inc_by(std::mem::take(&mut vars.stats.chunk_total_lookup_count)); + } + + async fn commit_state_tables( + this: &mut ExecutorInner, + epoch: EpochPair, + ) -> StreamExecutorResult<()> { + futures::future::try_join_all( + this.all_state_tables_mut() + .map(|table| async { table.commit(epoch).await }), + ) + .await?; + Ok(()) + } + #[try_stream(ok = Message, error = StreamExecutorError)] async fn execute_inner(self) { let HashAggExecutor { @@ -616,8 +592,7 @@ impl HashAggExecutor { agg_group_cache_metrics_info, PrecomputedBuildHasher, ), - group_change_set: HashSet::new(), - dirty_groups_heap_size: KvSize::default(), + dirty_groups: Default::default(), distinct_dedup: DistinctDeduplicater::new( &this.agg_calls, &this.watermark_epoch, @@ -656,7 +631,7 @@ impl HashAggExecutor { #[for_await] for msg in input { let msg = msg?; - vars.agg_group_cache.evict_except_cur_epoch(); + vars.agg_group_cache.evict(); match msg { Message::Watermark(watermark) => { let group_key_seq = group_key_invert_idx[watermark.col_idx]; @@ -670,12 +645,23 @@ impl HashAggExecutor { } Message::Chunk(chunk) => { Self::apply_chunk(&mut this, &mut vars, chunk).await?; + + if vars.dirty_groups.estimated_heap_size() >= this.max_dirty_groups_heap_size { + // flush dirty groups if heap size is too large, to better prevent from OOM + #[for_await] + for chunk in Self::flush_data(&mut this, &mut vars) { + yield Message::Chunk(chunk?); + } + } } Message::Barrier(barrier) => { + Self::update_metrics(&this, &mut vars); + #[for_await] - for chunk in Self::flush_data(&mut this, &mut vars, barrier.epoch) { + for chunk in Self::flush_data(&mut this, &mut vars) { yield Message::Chunk(chunk?); } + Self::commit_state_tables(&mut this, barrier.epoch).await?; if this.emit_on_window_close { // ignore watermarks on other columns diff --git a/src/stream/src/executor/hash_join.rs b/src/stream/src/executor/hash_join.rs index a5a39723c9319..7aed840679c82 100644 --- a/src/stream/src/executor/hash_join.rs +++ b/src/stream/src/executor/hash_join.rs @@ -310,7 
+310,6 @@ struct HashJoinChunkBuilder { ctx: &'a ActorContextRef, - identity: &'a str, side_l: &'a mut JoinSide, side_r: &'a mut JoinSide, actual_output_data_types: &'a [DataType], @@ -640,6 +639,7 @@ impl HashJoinExecutor HashJoinExecutor HashJoinExecutor HashJoinExecutor { @@ -745,7 +747,6 @@ impl HashJoinExecutor HashJoinExecutor { @@ -772,7 +773,6 @@ impl HashJoinExecutor HashJoinExecutor { @@ -814,23 +814,15 @@ impl HashJoinExecutor250ms). - // Those will result in that barrier is always ready - // in source. Since select barrier is preferred, - // chunk would never be selected. - // self.metrics - // .join_cached_rows - // .with_label_values(&[&actor_id_str, side]) - // .set(ht.cached_rows() as i64); self.metrics - .join_cached_entries - .with_label_values(&[&actor_id_str, side]) + .join_cached_entry_count + .with_label_values(&[&actor_id_str, &fragment_id_str, side]) .set(ht.entry_count() as i64); } self.metrics .join_match_duration_ns - .with_label_values(&[&actor_id_str, "barrier"]) + .with_label_values(&[&actor_id_str, &fragment_id_str, "barrier"]) .inc_by(barrier_start_time.elapsed().as_nanos() as u64); yield Message::Barrier(barrier); } @@ -990,7 +982,6 @@ impl HashJoinExecutor(args: EqJoinArgs<'_, K, S>) { let EqJoinArgs { ctx, - identity, side_l, side_r, actual_output_data_types, @@ -1000,6 +991,7 @@ impl HashJoinExecutor HashJoinExecutor HashJoinExecutor Self { HopWindowExecutor { - ctx, + _ctx: ctx, input, info, time_col_idx, @@ -90,13 +90,11 @@ impl HopWindowExecutor { #[try_stream(ok = Message, error = StreamExecutorError)] async fn execute_inner(self: Box) { let Self { - ctx, input, window_slide, window_size, output_indices, - info, time_col_idx, chunk_size, @@ -152,22 +150,14 @@ impl HopWindowExecutor { let window_start_col = if out_window_start_col_idx.is_some() { Some( self.window_start_exprs[i] - .eval_infallible(&data_chunk, |err| { - ctx.on_compute_error(err, &info.identity) - }) + .eval_infallible(&data_chunk) .await, ) } else { None }; let window_end_col = if out_window_end_col_idx.is_some() { - Some( - self.window_end_exprs[i] - .eval_infallible(&data_chunk, |err| { - ctx.on_compute_error(err, &info.identity) - }) - .await, - ) + Some(self.window_end_exprs[i].eval_infallible(&data_chunk).await) } else { None }; @@ -228,9 +218,7 @@ impl HopWindowExecutor { { let w = w .clone() - .transform_with_expr(start_expr, out_start_idx, |err| { - ctx.on_compute_error(err, &info.identity) - }) + .transform_with_expr(start_expr, out_start_idx) .await; if let Some(w) = w { yield Message::Watermark(w); @@ -239,11 +227,7 @@ impl HopWindowExecutor { if let (Some(out_end_idx), Some(end_expr)) = (out_window_end_col_idx, self.window_end_exprs.get(0)) { - let w = w - .transform_with_expr(end_expr, out_end_idx, |err| { - ctx.on_compute_error(err, &info.identity) - }) - .await; + let w = w.transform_with_expr(end_expr, out_end_idx).await; if let Some(w) = w { yield Message::Watermark(w); } @@ -290,7 +274,7 @@ mod tests { U+ 6 2 ^10:42:00 - 7 1 ^10:51:00 + 8 3 ^11:02:00" - .replace('^', "2022-2-2T"), + .replace('^', "2022-02-02T"), ); let input = MockSource::with_chunks(schema.clone(), pk_indices.clone(), vec![chunk]).boxed(); @@ -354,7 +338,7 @@ mod tests { - 7 1 ^10:51:00 ^10:45:00 ^11:15:00 + 8 3 ^11:02:00 ^10:45:00 ^11:15:00 + 8 3 ^11:02:00 ^11:00:00 ^11:30:00" - .replace('^', "2022-2-2T"), + .replace('^', "2022-02-02T"), ) ); } @@ -387,7 +371,7 @@ mod tests { - ^11:15:00 1 7 ^10:51:00 + ^11:15:00 3 8 ^11:02:00 + ^11:30:00 3 8 ^11:02:00" - .replace('^', "2022-2-2T"), + .replace('^', 
"2022-02-02T"), ) ); } diff --git a/src/stream/src/executor/integration_tests.rs b/src/stream/src/executor/integration_tests.rs index cee9b74b67b6b..a9c219a25641f 100644 --- a/src/stream/src/executor/integration_tests.rs +++ b/src/stream/src/executor/integration_tests.rs @@ -20,7 +20,7 @@ use multimap::MultiMap; use risingwave_common::array::*; use risingwave_common::catalog::{Field, Schema}; use risingwave_common::types::*; -use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_expr::expr::*; use risingwave_storage::memory::MemoryStateStore; diff --git a/src/stream/src/executor/lookup/impl_.rs b/src/stream/src/executor/lookup/impl_.rs index 463cec2d5f6bf..e7f39c0247bf9 100644 --- a/src/stream/src/executor/lookup/impl_.rs +++ b/src/stream/src/executor/lookup/impl_.rs @@ -322,7 +322,7 @@ impl LookupExecutor { .lookup_one_row(&row, self.last_barrier.as_ref().unwrap().epoch) .await? { - tracing::trace!(target: "events::stream::lookup::put", "{:?} {:?}", row, matched_row); + tracing::debug!(target: "events::stream::lookup::put", "{:?} {:?}", row, matched_row); if let Some(chunk) = builder.append_row(*op, row, &matched_row) { yield Message::Chunk(chunk); @@ -371,10 +371,11 @@ impl LookupExecutor { .into_owned_row(); let table_id_str = self.arrangement.storage_table.table_id().to_string(); let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.fragment_id.to_string(); self.ctx .streaming_metrics .lookup_total_query_cache_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); if let Some(result) = self.lookup_cache.lookup(&lookup_row) { return Ok(result.iter().cloned().collect_vec()); @@ -384,10 +385,10 @@ impl LookupExecutor { self.ctx .streaming_metrics .lookup_cache_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); - tracing::trace!(target: "events::stream::lookup::lookup_row", "{:?}", lookup_row); + tracing::debug!(target: "events::stream::lookup::lookup_row", "{:?}", lookup_row); let mut all_rows = VecWithKvSize::new(); // Drop the stream. 
@@ -426,14 +427,14 @@ impl LookupExecutor { } } - tracing::trace!(target: "events::stream::lookup::result", "{:?} => {:?}", lookup_row, all_rows.inner()); + tracing::debug!(target: "events::stream::lookup::result", "{:?} => {:?}", lookup_row, all_rows.inner()); self.lookup_cache.batch_update(lookup_row, all_rows.clone()); self.ctx .streaming_metrics .lookup_cached_entry_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.lookup_cache.len() as i64); Ok(all_rows.into_inner()) diff --git a/src/stream/src/executor/managed_state/join/mod.rs b/src/stream/src/executor/managed_state/join/mod.rs index 7ee23c06a5631..b7a81a0f75745 100644 --- a/src/stream/src/executor/managed_state/join/mod.rs +++ b/src/stream/src/executor/managed_state/join/mod.rs @@ -15,7 +15,7 @@ mod join_entry_state; use std::alloc::Global; -use std::ops::{Deref, DerefMut}; +use std::ops::{Bound, Deref, DerefMut}; use std::sync::Arc; use futures::future::try_join; @@ -40,7 +40,7 @@ use crate::common::metrics::MetricsInfo; use crate::common::table::state_table::StateTable; use crate::executor::error::StreamExecutorResult; use crate::executor::monitor::StreamingMetrics; -use crate::task::{ActorId, AtomicU64Ref}; +use crate::task::{ActorId, AtomicU64Ref, FragmentId}; type DegreeType = u64; @@ -161,6 +161,7 @@ pub struct JoinHashMapMetrics { metrics: Arc, /// Basic information actor_id: String, + fragment_id: String, join_table_id: String, degree_table_id: String, side: &'static str, @@ -175,6 +176,7 @@ impl JoinHashMapMetrics { pub fn new( metrics: Arc, actor_id: ActorId, + fragment_id: FragmentId, side: &'static str, join_table_id: u32, degree_table_id: u32, @@ -182,6 +184,7 @@ impl JoinHashMapMetrics { Self { metrics, actor_id: actor_id.to_string(), + fragment_id: fragment_id.to_string(), join_table_id: join_table_id.to_string(), degree_table_id: degree_table_id.to_string(), side, @@ -193,23 +196,25 @@ impl JoinHashMapMetrics { pub fn flush(&mut self) { self.metrics - .join_lookup_miss_count + .join_lookup_total_count .with_label_values(&[ (self.side), &self.join_table_id, &self.degree_table_id, &self.actor_id, + &self.fragment_id, ]) - .inc_by(self.lookup_miss_count as u64); + .inc_by(self.total_lookup_count as u64); self.metrics - .join_total_lookup_count + .join_lookup_miss_count .with_label_values(&[ (self.side), &self.join_table_id, &self.degree_table_id, &self.actor_id, + &self.fragment_id, ]) - .inc_by(self.total_lookup_count as u64); + .inc_by(self.lookup_miss_count as u64); self.metrics .join_insert_cache_miss_count .with_label_values(&[ @@ -217,6 +222,7 @@ impl JoinHashMapMetrics { &self.join_table_id, &self.degree_table_id, &self.actor_id, + &self.fragment_id, ]) .inc_by(self.insert_cache_miss_count as u64); self.total_lookup_count = 0; @@ -284,6 +290,7 @@ impl JoinHashMap { pk_contained_in_jk: bool, metrics: Arc, actor_id: ActorId, + fragment_id: FragmentId, side: &'static str, ) -> Self { let alloc = StatsAlloc::new(Global).shared(); @@ -335,6 +342,7 @@ impl JoinHashMap { metrics: JoinHashMapMetrics::new( metrics, actor_id, + fragment_id, side, join_table_id, degree_table_id, @@ -402,14 +410,18 @@ impl JoinHashMap { let mut entry_state = JoinEntryState::default(); if self.need_degree_table { - let table_iter_fut = self - .state - .table - .iter_row_with_pk_prefix(&key, PrefetchOptions::new_for_exhaust_iter()); - let degree_table_iter_fut = self - .degree_state - .table - .iter_row_with_pk_prefix(&key, 
PrefetchOptions::new_for_exhaust_iter()); + let sub_range: &(Bound, Bound) = + &(Bound::Unbounded, Bound::Unbounded); + let table_iter_fut = self.state.table.iter_with_prefix( + &key, + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ); + let degree_table_iter_fut = self.degree_state.table.iter_with_prefix( + &key, + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ); let (table_iter, degree_table_iter) = try_join(table_iter_fut, degree_table_iter_fut).await?; @@ -437,10 +449,12 @@ impl JoinHashMap { ); } } else { + let sub_range: &(Bound, Bound) = + &(Bound::Unbounded, Bound::Unbounded); let table_iter = self .state .table - .iter_row_with_pk_prefix(&key, PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix(&key, sub_range, PrefetchOptions::new_for_exhaust_iter()) .await?; #[for_await] diff --git a/src/stream/src/executor/merge.rs b/src/stream/src/executor/merge.rs index 4df4ae4d3e302..f2f7d84ca2a3d 100644 --- a/src/stream/src/executor/merge.rs +++ b/src/stream/src/executor/merge.rs @@ -134,11 +134,11 @@ impl MergeExecutor { Message::Chunk(chunk) => { self.metrics .actor_in_record_cnt - .with_label_values(&[&actor_id_str]) + .with_label_values(&[&actor_id_str, &fragment_id_str]) .inc_by(chunk.cardinality() as _); } Message::Barrier(barrier) => { - tracing::trace!( + tracing::debug!( target: "events::stream::barrier::path", actor_id = actor_id, "receiver receives barrier from path: {:?}", diff --git a/src/stream/src/executor/mod.rs b/src/stream/src/executor/mod.rs index 8fa7a5d818cc4..99b090e21a240 100644 --- a/src/stream/src/executor/mod.rs +++ b/src/stream/src/executor/mod.rs @@ -26,14 +26,13 @@ use risingwave_common::array::StreamChunk; use risingwave_common::buffer::Bitmap; use risingwave_common::catalog::Schema; use risingwave_common::row::OwnedRow; -use risingwave_common::types::{DataType, DefaultOrd, ScalarImpl}; +use risingwave_common::types::{DataType, Datum, DefaultOrd, ScalarImpl}; use risingwave_common::util::epoch::{Epoch, EpochPair}; use risingwave_common::util::tracing::TracingContext; -use risingwave_common::util::value_encoding::{deserialize_datum, serialize_datum}; +use risingwave_common::util::value_encoding::{DatumFromProtoExt, DatumToProtoExt}; use risingwave_connector::source::SplitImpl; use risingwave_expr::expr::BoxedExpression; -use risingwave_expr::ExprError; -use risingwave_pb::data::{PbDatum, PbEpoch}; +use risingwave_pb::data::PbEpoch; use risingwave_pb::expr::PbInputRef; use risingwave_pb::stream_plan::barrier::{BarrierKind, PbMutation}; use risingwave_pb::stream_plan::stream_message::StreamMessage; @@ -333,13 +332,7 @@ impl Barrier { } } - /// Whether this barrier is for configuration change. Used for source executor initialization. - pub fn is_update(&self) -> bool { - matches!(self.mutation.as_deref(), Some(Mutation::Update { .. })) - } - - /// Whether this barrier is for resume. Used for now executor to determine whether to yield a - /// chunk and a watermark before this barrier. + /// Whether this barrier is for resume. pub fn is_resume(&self) -> bool { matches!(self.mutation.as_deref(), Some(Mutation::Resume)) } @@ -650,7 +643,6 @@ impl Watermark { self, expr: &BoxedExpression, new_col_idx: usize, - on_err: impl Fn(ExprError), ) -> Option { let Self { col_idx, val, .. 
} = self; let row = { @@ -658,7 +650,7 @@ impl Watermark { row[col_idx] = Some(val); OwnedRow::new(row) }; - let val = expr.eval_row_infallible(&row, on_err).await?; + let val = expr.eval_row_infallible(&row).await?; Some(Self::new(new_col_idx, expr.return_type(), val)) } @@ -677,16 +669,14 @@ impl Watermark { index: self.col_idx as _, r#type: Some(self.data_type.to_protobuf()), }), - val: Some(PbDatum { - body: serialize_datum(Some(&self.val)), - }), + val: Some(&self.val).to_protobuf().into(), } } pub fn from_protobuf(prost: &PbWatermark) -> StreamExecutorResult { let col_ref = prost.get_column()?; let data_type = DataType::from(col_ref.get_type()?); - let val = deserialize_datum(prost.get_val()?.get_body().as_slice(), &data_type)? + let val = Datum::from_protobuf(prost.get_val()?, &data_type)? .expect("watermark value cannot be null"); Ok(Self::new(col_ref.get_index() as _, data_type, val)) } diff --git a/src/stream/src/executor/monitor/streaming_stats.rs b/src/stream/src/executor/monitor/streaming_stats.rs index 52c8487433c1b..a31727e76639f 100644 --- a/src/stream/src/executor/monitor/streaming_stats.rs +++ b/src/stream/src/executor/monitor/streaming_stats.rs @@ -17,23 +17,33 @@ use std::sync::OnceLock; use prometheus::core::{AtomicF64, AtomicI64, AtomicU64, GenericCounterVec, GenericGaugeVec}; use prometheus::{ exponential_buckets, histogram_opts, register_gauge_vec_with_registry, - register_histogram_vec_with_registry, register_histogram_with_registry, - register_int_counter_vec_with_registry, register_int_counter_with_registry, - register_int_gauge_vec_with_registry, register_int_gauge_with_registry, Histogram, - HistogramVec, IntCounter, IntGauge, Registry, + register_histogram_with_registry, register_int_counter_vec_with_registry, + register_int_counter_with_registry, register_int_gauge_vec_with_registry, + register_int_gauge_with_registry, Histogram, IntCounter, IntGauge, Registry, }; use risingwave_common::config::MetricLevel; -use risingwave_common::metrics::RelabeledHistogramVec; +use risingwave_common::metrics::{ + LabelGuardedHistogramVec, LabelGuardedIntCounterVec, LabelGuardedIntGaugeVec, + RelabeledGuardedHistogramVec, +}; use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY; +use risingwave_common::{ + register_guarded_histogram_vec_with_registry, register_guarded_int_counter_vec_with_registry, + register_guarded_int_gauge_vec_with_registry, +}; +use risingwave_connector::sink::SinkMetrics; #[derive(Clone)] pub struct StreamingMetrics { pub level: MetricLevel, + // Executor metrics (disabled by default) pub executor_row_count: GenericCounterVec, + + // Streaming actor metrics from tokio (disabled by default) pub actor_execution_time: GenericGaugeVec, - pub actor_output_buffer_blocking_duration_ns: GenericCounterVec, - pub actor_input_buffer_blocking_duration_ns: GenericCounterVec, + pub actor_output_buffer_blocking_duration_ns: LabelGuardedIntCounterVec<3>, + pub actor_input_buffer_blocking_duration_ns: LabelGuardedIntCounterVec<3>, pub actor_scheduled_duration: GenericGaugeVec, pub actor_scheduled_cnt: GenericGaugeVec, pub actor_fast_poll_duration: GenericGaugeVec, @@ -44,40 +54,45 @@ pub struct StreamingMetrics { pub actor_poll_cnt: GenericGaugeVec, pub actor_idle_duration: GenericGaugeVec, pub actor_idle_cnt: GenericGaugeVec, + + // Streaming actor pub actor_memory_usage: GenericGaugeVec, - pub actor_in_record_cnt: GenericCounterVec, - pub actor_out_record_cnt: GenericCounterVec, - pub actor_sampled_deserialize_duration_ns: GenericCounterVec, + pub 
actor_in_record_cnt: LabelGuardedIntCounterVec<2>, + pub actor_out_record_cnt: LabelGuardedIntCounterVec<2>, + + // Source pub source_output_row_count: GenericCounterVec, pub source_row_per_barrier: GenericCounterVec, pub source_split_change_count: GenericCounterVec, + // Sink & materialized view + pub sink_input_row_count: GenericCounterVec, + pub mview_input_row_count: GenericCounterVec, + // Exchange (see also `compute::ExchangeServiceMetrics`) pub exchange_frag_recv_size: GenericCounterVec, // Streaming Join - pub join_lookup_miss_count: GenericCounterVec, - pub join_total_lookup_count: GenericCounterVec, - pub join_insert_cache_miss_count: GenericCounterVec, - pub join_actor_input_waiting_duration_ns: GenericCounterVec, - pub join_match_duration_ns: GenericCounterVec, - pub join_barrier_align_duration: RelabeledHistogramVec, - pub join_cached_entries: GenericGaugeVec, - pub join_cached_rows: GenericGaugeVec, - pub join_cached_estimated_size: GenericGaugeVec, - pub join_matched_join_keys: RelabeledHistogramVec, + pub join_lookup_miss_count: LabelGuardedIntCounterVec<5>, + pub join_lookup_total_count: LabelGuardedIntCounterVec<5>, + pub join_insert_cache_miss_count: LabelGuardedIntCounterVec<5>, + pub join_actor_input_waiting_duration_ns: LabelGuardedIntCounterVec<2>, + pub join_match_duration_ns: LabelGuardedIntCounterVec<3>, + pub join_barrier_align_duration: RelabeledGuardedHistogramVec<3>, + pub join_cached_entry_count: LabelGuardedIntGaugeVec<3>, + pub join_matched_join_keys: RelabeledGuardedHistogramVec<3>, // Streaming Aggregation pub agg_lookup_miss_count: GenericCounterVec, pub agg_total_lookup_count: GenericCounterVec, - pub agg_cached_keys: GenericGaugeVec, + pub agg_cached_entry_count: GenericGaugeVec, pub agg_chunk_lookup_miss_count: GenericCounterVec, pub agg_chunk_total_lookup_count: GenericCounterVec, pub agg_distinct_cache_miss_count: GenericCounterVec, pub agg_distinct_total_cache_count: GenericCounterVec, pub agg_distinct_cached_entry_count: GenericGaugeVec, - pub agg_dirty_group_count: GenericGaugeVec, - pub agg_dirty_group_heap_size: GenericGaugeVec, + pub agg_dirty_groups_count: GenericGaugeVec, + pub agg_dirty_groups_heap_size: GenericGaugeVec, // Streaming TopN pub group_top_n_cache_miss_count: GenericCounterVec, @@ -87,7 +102,7 @@ pub struct StreamingMetrics { pub group_top_n_appendonly_total_query_cache_count: GenericCounterVec, pub group_top_n_appendonly_cached_entry_count: GenericGaugeVec, - // look up + // Lookup executor pub lookup_cache_miss_count: GenericCounterVec, pub lookup_total_query_cache_count: GenericCounterVec, pub lookup_cached_entry_count: GenericGaugeVec, @@ -119,7 +134,18 @@ pub struct StreamingMetrics { /// The progress made by the earliest in-flight barriers in the local barrier manager. 
pub barrier_manager_progress: IntCounter, - pub sink_commit_duration: HistogramVec, + // Sink related metrics + pub sink_commit_duration: LabelGuardedHistogramVec<3>, + pub connector_sink_rows_received: LabelGuardedIntCounterVec<2>, + pub log_store_first_write_epoch: LabelGuardedIntGaugeVec<3>, + pub log_store_latest_write_epoch: LabelGuardedIntGaugeVec<3>, + pub log_store_write_rows: LabelGuardedIntCounterVec<3>, + pub log_store_latest_read_epoch: LabelGuardedIntGaugeVec<3>, + pub log_store_read_rows: LabelGuardedIntCounterVec<3>, + pub kv_log_store_storage_write_count: LabelGuardedIntCounterVec<3>, + pub kv_log_store_storage_write_size: LabelGuardedIntCounterVec<3>, + pub kv_log_store_storage_read_count: LabelGuardedIntCounterVec<4>, + pub kv_log_store_storage_read_size: LabelGuardedIntCounterVec<4>, // Memory management // FIXME(yuhao): use u64 here @@ -130,6 +156,8 @@ pub struct StreamingMetrics { pub lru_evicted_watermark_time_ms: GenericGaugeVec, pub jemalloc_allocated_bytes: IntGauge, pub jemalloc_active_bytes: IntGauge, + pub jvm_allocated_bytes: IntGauge, + pub jvm_active_bytes: IntGauge, /// User compute error reporting pub user_compute_error_count: GenericCounterVec, @@ -158,7 +186,7 @@ impl StreamingMetrics { let executor_row_count = register_int_counter_vec_with_registry!( "stream_executor_row_count", "Total number of rows that have been output from each executor", - &["actor_id", "executor_identity"], + &["actor_id", "fragment_id", "executor_identity"], registry ) .unwrap(); @@ -187,30 +215,48 @@ impl StreamingMetrics { ) .unwrap(); - let actor_execution_time = register_gauge_vec_with_registry!( - "stream_actor_actor_execution_time", - "Total execution time (s) of an actor", - &["actor_id"], + let sink_input_row_count = register_int_counter_vec_with_registry!( + "stream_sink_input_row_count", + "Total number of rows streamed into sink executors", + &["sink_id", "actor_id", "fragment_id"], registry ) .unwrap(); - let actor_output_buffer_blocking_duration_ns = register_int_counter_vec_with_registry!( - "stream_actor_output_buffer_blocking_duration_ns", - "Total blocking duration (ns) of output buffer", - &["actor_id", "fragment_id", "downstream_fragment_id"], + let mview_input_row_count = register_int_counter_vec_with_registry!( + "stream_mview_input_row_count", + "Total number of rows streamed into materialize executors", + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); - let actor_input_buffer_blocking_duration_ns = register_int_counter_vec_with_registry!( - "stream_actor_input_buffer_blocking_duration_ns", - "Total blocking duration (ns) of input buffer", - &["actor_id", "fragment_id", "upstream_fragment_id"], + let actor_execution_time = register_gauge_vec_with_registry!( + "stream_actor_actor_execution_time", + "Total execution time (s) of an actor", + &["actor_id"], registry ) .unwrap(); + let actor_output_buffer_blocking_duration_ns = + register_guarded_int_counter_vec_with_registry!( + "stream_actor_output_buffer_blocking_duration_ns", + "Total blocking duration (ns) of output buffer", + &["actor_id", "fragment_id", "downstream_fragment_id"], + registry + ) + .unwrap(); + + let actor_input_buffer_blocking_duration_ns = + register_guarded_int_counter_vec_with_registry!( + "stream_actor_input_buffer_blocking_duration_ns", + "Total blocking duration (ns) of input buffer", + &["actor_id", "fragment_id", "upstream_fragment_id"], + registry + ) + .unwrap(); + let exchange_frag_recv_size = register_int_counter_vec_with_registry!( 
"stream_exchange_frag_recv_size", "Total size of messages that have been received from upstream Fragment", @@ -299,15 +345,15 @@ impl StreamingMetrics { ) .unwrap(); - let actor_in_record_cnt = register_int_counter_vec_with_registry!( + let actor_in_record_cnt = register_guarded_int_counter_vec_with_registry!( "stream_actor_in_record_cnt", "Total number of rows actor received", - &["actor_id"], + &["actor_id", "fragment_id"], registry ) .unwrap(); - let actor_out_record_cnt = register_int_counter_vec_with_registry!( + let actor_out_record_cnt = register_guarded_int_counter_vec_with_registry!( "stream_actor_out_record_cnt", "Total number of rows actor sent", &["actor_id", "fragment_id"], @@ -315,58 +361,68 @@ impl StreamingMetrics { ) .unwrap(); - let actor_sampled_deserialize_duration_ns = register_int_counter_vec_with_registry!( - "actor_sampled_deserialize_duration_ns", - "Duration (ns) of sampled chunk deserialization", - &["actor_id"], - registry - ) - .unwrap(); - let actor_memory_usage = register_int_gauge_vec_with_registry!( "actor_memory_usage", "Memory usage (bytes)", - &["actor_id"], + &["actor_id", "fragment_id"], registry, ) .unwrap(); - let join_lookup_miss_count = register_int_counter_vec_with_registry!( + let join_lookup_miss_count = register_guarded_int_counter_vec_with_registry!( "stream_join_lookup_miss_count", "Join executor lookup miss duration", - &["side", "join_table_id", "degree_table_id", "actor_id"], + &[ + "side", + "join_table_id", + "degree_table_id", + "actor_id", + "fragment_id" + ], registry ) .unwrap(); - let join_total_lookup_count = register_int_counter_vec_with_registry!( + let join_lookup_total_count = register_guarded_int_counter_vec_with_registry!( "stream_join_lookup_total_count", "Join executor lookup total operation", - &["side", "join_table_id", "degree_table_id", "actor_id"], + &[ + "side", + "join_table_id", + "degree_table_id", + "actor_id", + "fragment_id" + ], registry ) .unwrap(); - let join_insert_cache_miss_count = register_int_counter_vec_with_registry!( + let join_insert_cache_miss_count = register_guarded_int_counter_vec_with_registry!( "stream_join_insert_cache_miss_count", "Join executor cache miss when insert operation", - &["side", "join_table_id", "degree_table_id", "actor_id"], + &[ + "side", + "join_table_id", + "degree_table_id", + "actor_id", + "fragment_id" + ], registry ) .unwrap(); - let join_actor_input_waiting_duration_ns = register_int_counter_vec_with_registry!( + let join_actor_input_waiting_duration_ns = register_guarded_int_counter_vec_with_registry!( "stream_join_actor_input_waiting_duration_ns", "Total waiting duration (ns) of input buffer of join actor", - &["actor_id"], + &["actor_id", "fragment_id"], registry ) .unwrap(); - let join_match_duration_ns = register_int_counter_vec_with_registry!( + let join_match_duration_ns = register_guarded_int_counter_vec_with_registry!( "stream_join_match_duration_ns", "Matching duration for each side", - &["actor_id", "side"], + &["actor_id", "fragment_id", "side"], registry ) .unwrap(); @@ -376,40 +432,24 @@ impl StreamingMetrics { "Duration of join align barrier", exponential_buckets(0.0001, 2.0, 21).unwrap() // max 104s ); - let join_barrier_align_duration = register_histogram_vec_with_registry!( + let join_barrier_align_duration = register_guarded_histogram_vec_with_registry!( opts, &["actor_id", "fragment_id", "wait_side"], registry ) .unwrap(); - let join_barrier_align_duration = RelabeledHistogramVec::with_metric_level_relabel_n( + let join_barrier_align_duration = 
RelabeledGuardedHistogramVec::with_metric_level_relabel_n( MetricLevel::Debug, join_barrier_align_duration, level, 1, ); - let join_cached_entries = register_int_gauge_vec_with_registry!( - "stream_join_cached_entries", + let join_cached_entry_count = register_guarded_int_gauge_vec_with_registry!( + "stream_join_cached_entry_count", "Number of cached entries in streaming join operators", - &["actor_id", "side"], - registry - ) - .unwrap(); - - let join_cached_rows = register_int_gauge_vec_with_registry!( - "stream_join_cached_rows", - "Number of cached rows in streaming join operators", - &["actor_id", "side"], - registry - ) - .unwrap(); - - let join_cached_estimated_size = register_int_gauge_vec_with_registry!( - "stream_join_cached_estimated_size", - "Estimated size of all cached entries in streaming join operators", - &["actor_id", "side"], + &["actor_id", "fragment_id", "side"], registry ) .unwrap(); @@ -420,14 +460,14 @@ impl StreamingMetrics { exponential_buckets(16.0, 2.0, 28).unwrap() // max 2^31 ); - let join_matched_join_keys = register_histogram_vec_with_registry!( + let join_matched_join_keys = register_guarded_histogram_vec_with_registry!( join_matched_join_keys_opts, &["actor_id", "fragment_id", "table_id"], registry ) .unwrap(); - let join_matched_join_keys = RelabeledHistogramVec::with_metric_level_relabel_n( + let join_matched_join_keys = RelabeledGuardedHistogramVec::with_metric_level_relabel_n( MetricLevel::Debug, join_matched_join_keys, level, @@ -437,7 +477,7 @@ impl StreamingMetrics { let agg_lookup_miss_count = register_int_counter_vec_with_registry!( "stream_agg_lookup_miss_count", "Aggregation executor lookup miss duration", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -445,7 +485,7 @@ impl StreamingMetrics { let agg_total_lookup_count = register_int_counter_vec_with_registry!( "stream_agg_lookup_total_count", "Aggregation executor lookup total operation", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -453,7 +493,7 @@ impl StreamingMetrics { let agg_distinct_cache_miss_count = register_int_counter_vec_with_registry!( "stream_agg_distinct_cache_miss_count", "Aggregation executor dinsinct miss duration", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -461,7 +501,7 @@ impl StreamingMetrics { let agg_distinct_total_cache_count = register_int_counter_vec_with_registry!( "stream_agg_distinct_total_cache_count", "Aggregation executor distinct total operation", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -469,23 +509,23 @@ impl StreamingMetrics { let agg_distinct_cached_entry_count = register_int_gauge_vec_with_registry!( "stream_agg_distinct_cached_entry_count", "Total entry counts in distinct aggregation executor cache", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); - let agg_dirty_group_count = register_int_gauge_vec_with_registry!( - "stream_agg_dirty_group_count", + let agg_dirty_groups_count = register_int_gauge_vec_with_registry!( + "stream_agg_dirty_groups_count", "Total dirty group counts in aggregation executor", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); - let agg_dirty_group_heap_size = register_int_gauge_vec_with_registry!( - "stream_agg_dirty_group_heap_size", + let agg_dirty_groups_heap_size = register_int_gauge_vec_with_registry!( + 
"stream_agg_dirty_groups_heap_size", "Total dirty group heap size in aggregation executor", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -493,7 +533,7 @@ impl StreamingMetrics { let group_top_n_cache_miss_count = register_int_counter_vec_with_registry!( "stream_group_top_n_cache_miss_count", "Group top n executor cache miss count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -501,7 +541,7 @@ impl StreamingMetrics { let group_top_n_total_query_cache_count = register_int_counter_vec_with_registry!( "stream_group_top_n_total_query_cache_count", "Group top n executor query cache total count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -509,7 +549,7 @@ impl StreamingMetrics { let group_top_n_cached_entry_count = register_int_gauge_vec_with_registry!( "stream_group_top_n_cached_entry_count", "Total entry counts in group top n executor cache", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -517,7 +557,7 @@ impl StreamingMetrics { let group_top_n_appendonly_cache_miss_count = register_int_counter_vec_with_registry!( "stream_group_top_n_appendonly_cache_miss_count", "Group top n appendonly executor cache miss count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -526,7 +566,7 @@ impl StreamingMetrics { register_int_counter_vec_with_registry!( "stream_group_top_n_appendonly_total_query_cache_count", "Group top n appendonly executor total cache count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -534,7 +574,7 @@ impl StreamingMetrics { let group_top_n_appendonly_cached_entry_count = register_int_gauge_vec_with_registry!( "stream_group_top_n_appendonly_cached_entry_count", "Total entry counts in group top n appendonly executor cache", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -542,7 +582,7 @@ impl StreamingMetrics { let lookup_cache_miss_count = register_int_counter_vec_with_registry!( "stream_lookup_cache_miss_count", "Lookup executor cache miss count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -550,7 +590,7 @@ impl StreamingMetrics { let lookup_total_query_cache_count = register_int_counter_vec_with_registry!( "stream_lookup_total_query_cache_count", "Lookup executor query cache total count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -558,7 +598,7 @@ impl StreamingMetrics { let lookup_cached_entry_count = register_int_gauge_vec_with_registry!( "stream_lookup_cached_entry_count", "Total entry counts in lookup executor cache", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -566,7 +606,7 @@ impl StreamingMetrics { let temporal_join_cache_miss_count = register_int_counter_vec_with_registry!( "stream_temporal_join_cache_miss_count", "Temporal join executor cache miss count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -574,7 +614,7 @@ impl StreamingMetrics { let temporal_join_total_query_cache_count = register_int_counter_vec_with_registry!( "stream_temporal_join_total_query_cache_count", "Temporal join executor query cache total count", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) 
.unwrap(); @@ -582,15 +622,15 @@ impl StreamingMetrics { let temporal_join_cached_entry_count = register_int_gauge_vec_with_registry!( "stream_temporal_join_cached_entry_count", "Total entry count in temporal join executor cache", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); - let agg_cached_keys = register_int_gauge_vec_with_registry!( - "stream_agg_cached_keys", + let agg_cached_entry_count = register_int_gauge_vec_with_registry!( + "stream_agg_cached_entry_count", "Number of cached keys in streaming aggregation operators", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -598,7 +638,7 @@ impl StreamingMetrics { let agg_chunk_lookup_miss_count = register_int_counter_vec_with_registry!( "stream_agg_chunk_lookup_miss_count", "Aggregation executor chunk-level lookup miss duration", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -606,7 +646,7 @@ impl StreamingMetrics { let agg_chunk_total_lookup_count = register_int_counter_vec_with_registry!( "stream_agg_chunk_lookup_total_count", "Aggregation executor chunk-level lookup total operation", - &["table_id", "actor_id"], + &["table_id", "actor_id", "fragment_id"], registry ) .unwrap(); @@ -689,10 +729,90 @@ impl StreamingMetrics { ) .unwrap(); - let sink_commit_duration = register_histogram_vec_with_registry!( + let sink_commit_duration = register_guarded_histogram_vec_with_registry!( "sink_commit_duration", "Duration of commit op in sink", - &["executor_id", "connector"], + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let connector_sink_rows_received = register_guarded_int_counter_vec_with_registry!( + "connector_sink_rows_received", + "Number of rows received by sink", + &["connector_type", "sink_id"], + registry + ) + .unwrap(); + + let log_store_first_write_epoch = register_guarded_int_gauge_vec_with_registry!( + "log_store_first_write_epoch", + "The first write epoch of log store", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let log_store_latest_write_epoch = register_guarded_int_gauge_vec_with_registry!( + "log_store_latest_write_epoch", + "The latest write epoch of log store", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let log_store_write_rows = register_guarded_int_counter_vec_with_registry!( + "log_store_write_rows", + "The write rate of rows", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let log_store_latest_read_epoch = register_guarded_int_gauge_vec_with_registry!( + "log_store_latest_read_epoch", + "The latest read epoch of log store", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let log_store_read_rows = register_guarded_int_counter_vec_with_registry!( + "log_store_read_rows", + "The read rate of rows", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let kv_log_store_storage_write_count = register_guarded_int_counter_vec_with_registry!( + "kv_log_store_storage_write_count", + "Write row count throughput of kv log store", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let kv_log_store_storage_write_size = register_guarded_int_counter_vec_with_registry!( + "kv_log_store_storage_write_size", + "Write size throughput of kv log store", + &["executor_id", "connector", "sink_id"], + registry + ) + .unwrap(); + + let kv_log_store_storage_read_count = 
register_guarded_int_counter_vec_with_registry!( + "kv_log_store_storage_read_count", + "Read row count throughput of kv log store", + &["executor_id", "connector", "sink_id", "read_type"], + registry + ) + .unwrap(); + + let kv_log_store_storage_read_size = register_guarded_int_counter_vec_with_registry!( + "kv_log_store_storage_read_size", + "Read size throughput of kv log store", + &["executor_id", "connector", "sink_id", "read_type"], registry ) .unwrap(); @@ -747,6 +867,20 @@ impl StreamingMetrics { ) .unwrap(); + let jvm_allocated_bytes = register_int_gauge_with_registry!( + "jvm_allocated_bytes", + "The allocated jvm memory", + registry + ) + .unwrap(); + + let jvm_active_bytes = register_int_gauge_with_registry!( + "jvm_active_bytes", + "The active jvm memory", + registry + ) + .unwrap(); + let user_compute_error_count = register_int_counter_vec_with_registry!( "user_compute_error_count", "Compute errors in the system, queryable by tags", @@ -812,31 +946,30 @@ impl StreamingMetrics { actor_memory_usage, actor_in_record_cnt, actor_out_record_cnt, - actor_sampled_deserialize_duration_ns, source_output_row_count, source_row_per_barrier, source_split_change_count, + sink_input_row_count, + mview_input_row_count, exchange_frag_recv_size, join_lookup_miss_count, - join_total_lookup_count, + join_lookup_total_count, join_insert_cache_miss_count, join_actor_input_waiting_duration_ns, join_match_duration_ns, join_barrier_align_duration, - join_cached_entries, - join_cached_rows, - join_cached_estimated_size, + join_cached_entry_count, join_matched_join_keys, agg_lookup_miss_count, agg_total_lookup_count, - agg_cached_keys, + agg_cached_entry_count, agg_chunk_lookup_miss_count, agg_chunk_total_lookup_count, agg_distinct_cache_miss_count, agg_distinct_total_cache_count, agg_distinct_cached_entry_count, - agg_dirty_group_count, - agg_dirty_group_heap_size, + agg_dirty_groups_count, + agg_dirty_groups_heap_size, group_top_n_cache_miss_count, group_top_n_total_query_cache_count, group_top_n_cached_entry_count, @@ -860,6 +993,16 @@ impl StreamingMetrics { barrier_sync_latency, barrier_manager_progress, sink_commit_duration, + connector_sink_rows_received, + log_store_first_write_epoch, + log_store_latest_write_epoch, + log_store_write_rows, + log_store_latest_read_epoch, + log_store_read_rows, + kv_log_store_storage_write_count, + kv_log_store_storage_write_size, + kv_log_store_storage_read_count, + kv_log_store_storage_read_size, lru_current_watermark_time_ms, lru_physical_now_ms, lru_runtime_loop_count, @@ -867,6 +1010,8 @@ impl StreamingMetrics { lru_evicted_watermark_time_ms, jemalloc_allocated_bytes, jemalloc_active_bytes, + jvm_allocated_bytes, + jvm_active_bytes, user_compute_error_count, user_source_reader_error_count, materialize_cache_hit_count, @@ -879,4 +1024,42 @@ impl StreamingMetrics { pub fn unused() -> Self { global_streaming_metrics(MetricLevel::Disabled) } + + pub fn new_sink_metrics( + &self, + identity: &str, + sink_id_str: &str, + connector: &str, + ) -> SinkMetrics { + let label_list = [identity, connector, sink_id_str]; + let sink_commit_duration_metrics = self.sink_commit_duration.with_label_values(&label_list); + let connector_sink_rows_received = self + .connector_sink_rows_received + .with_label_values(&[connector, sink_id_str]); + + let log_store_latest_read_epoch = self + .log_store_latest_read_epoch + .with_label_values(&label_list); + + let log_store_latest_write_epoch = self + .log_store_latest_write_epoch + .with_label_values(&label_list); + + let
log_store_first_write_epoch = self + .log_store_first_write_epoch + .with_label_values(&label_list); + + let log_store_write_rows = self.log_store_write_rows.with_label_values(&label_list); + let log_store_read_rows = self.log_store_read_rows.with_label_values(&label_list); + + SinkMetrics { + sink_commit_duration_metrics, + connector_sink_rows_received, + log_store_first_write_epoch, + log_store_latest_write_epoch, + log_store_write_rows, + log_store_latest_read_epoch, + log_store_read_rows, + } + } } diff --git a/src/stream/src/executor/mview/materialize.rs b/src/stream/src/executor/mview/materialize.rs index caabe213e146c..cfb02ec34c481 100644 --- a/src/stream/src/executor/mview/materialize.rs +++ b/src/stream/src/executor/mview/materialize.rs @@ -52,16 +52,15 @@ use crate::task::AtomicU64Ref; /// `MaterializeExecutor` materializes changes in stream into a materialized view on storage. pub struct MaterializeExecutor { input: BoxedExecutor, + info: ExecutorInfo, state_table: StateTableInner, /// Columns of arrange keys (including pk, group keys, join keys, etc.) - arrange_columns: Vec, + arrange_key_indices: Vec, actor_context: ActorContextRef, - info: ExecutorInfo, - materialize_cache: MaterializeCache, conflict_behavior: ConflictBehavior, @@ -74,9 +73,9 @@ impl MaterializeExecutor { #[allow(clippy::too_many_arguments)] pub async fn new( input: BoxedExecutor, + info: ExecutorInfo, store: S, - key: Vec, - executor_id: u64, + arrange_key: Vec, actor_context: ActorContextRef, vnodes: Option>, table_catalog: &Table, @@ -84,9 +83,7 @@ impl MaterializeExecutor { conflict_behavior: ConflictBehavior, metrics: Arc, ) -> Self { - let arrange_columns: Vec = key.iter().map(|k| k.column_index).collect(); - - let schema = input.schema().clone(); + let arrange_key_indices: Vec = arrange_key.iter().map(|k| k.column_index).collect(); let state_table = if table_catalog.version.is_some() { // TODO: If we do some `Delete` after schema change, we cannot ensure the encoded value @@ -104,14 +101,10 @@ impl MaterializeExecutor { Self { input, + info, state_table, - arrange_columns: arrange_columns.clone(), + arrange_key_indices, actor_context, - info: ExecutorInfo { - schema, - pk_indices: arrange_columns, - identity: format!("MaterializeExecutor {:X}", executor_id), - }, materialize_cache: MaterializeCache::new(watermark_epoch, metrics_info), conflict_behavior, } @@ -119,6 +112,11 @@ impl MaterializeExecutor { #[try_stream(ok = Message, error = StreamExecutorError)] async fn execute_inner(mut self) { + // for metrics + let table_id_str = self.state_table.table_id().to_string(); + let actor_id_str = self.actor_context.id.to_string(); + let fragment_id_str = self.actor_context.fragment_id.to_string(); + let data_types = self.schema().data_types().clone(); let mut input = self.input.execute(); @@ -136,6 +134,12 @@ impl MaterializeExecutor { yield match msg { Message::Watermark(w) => Message::Watermark(w), Message::Chunk(chunk) => { + self.actor_context + .streaming_metrics + .mview_input_row_count + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) + .inc_by(chunk.cardinality() as u64); + match self.conflict_behavior { ConflictBehavior::Overwrite | ConflictBehavior::IgnoreConflict => { // create MaterializeBuffer from chunk @@ -231,7 +235,7 @@ impl MaterializeExecutor { Self { input, state_table, - arrange_columns: arrange_columns.clone(), + arrange_key_indices: arrange_columns.clone(), actor_context: ActorContext::create(0), info: ExecutorInfo { schema, @@ -412,8 +416,8 @@ impl Executor for 
MaterializeExecutor { impl std::fmt::Debug for MaterializeExecutor { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MaterializeExecutor") - .field("input info", &self.info()) - .field("arrange_columns", &self.arrange_columns) + .field("info", &self.info()) + .field("arrange_key_indices", &self.arrange_key_indices) .finish() } } diff --git a/src/stream/src/executor/now.rs b/src/stream/src/executor/now.rs index d2cbf05d71f80..2ee5468ff5ad9 100644 --- a/src/stream/src/executor/now.rs +++ b/src/stream/src/executor/now.rs @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::ops::Bound; +use std::ops::Bound::Unbounded; + use futures::{pin_mut, StreamExt}; use futures_async_stream::try_stream; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::catalog::{Field, Schema}; -use risingwave_common::row; +use risingwave_common::row::{self, OwnedRow}; use risingwave_common::types::{DataType, Datum}; use risingwave_storage::StateStore; use tokio::sync::mpsc::UnboundedReceiver; @@ -79,9 +82,11 @@ impl NowExecutor { if !initialized { // Handle the first barrier. state_table.init_epoch(barrier.epoch); - let state_row = { - let data_iter = state_table.iter_row(Default::default()).await?; + let sub_range: &(Bound, Bound) = &(Unbounded, Unbounded); + let data_iter = state_table + .iter_with_prefix(row::empty(), sub_range, Default::default()) + .await?; pin_mut!(data_iter); if let Some(keyed_row) = data_iter.next().await { Some(keyed_row?) @@ -90,6 +95,7 @@ impl NowExecutor { } }; last_timestamp = state_row.and_then(|row| row[0].clone()); + paused = barrier.is_pause_on_startup(); initialized = true; } else if paused { // Assert that no data is updated. @@ -104,7 +110,7 @@ impl NowExecutor { // Update paused state. if let Some(mutation) = barrier.mutation.as_deref() { match mutation { - Mutation::Pause | Mutation::Update { .. } => paused = true, + Mutation::Pause => paused = true, Mutation::Resume => paused = false, _ => {} } diff --git a/src/stream/src/executor/over_window/eowc.rs b/src/stream/src/executor/over_window/eowc.rs index d483b0c345c0f..b5da45edd47e5 100644 --- a/src/stream/src/executor/over_window/eowc.rs +++ b/src/stream/src/executor/over_window/eowc.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::marker::PhantomData; +use std::ops::Bound; use futures::StreamExt; use futures_async_stream::{for_await, try_stream}; @@ -195,10 +196,15 @@ impl EowcOverWindowExecutor { curr_row_buffer: Default::default(), }; + let sub_range: &(Bound, Bound) = &(Bound::Unbounded, Bound::Unbounded); // Recover states from state table. 
let table_iter = this .state_table - .iter_row_with_pk_prefix(partition_key, PrefetchOptions::new_for_exhaust_iter()) + .iter_with_prefix( + partition_key, + sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) .await?; #[for_await] diff --git a/src/stream/src/executor/over_window/general.rs b/src/stream/src/executor/over_window/general.rs index 091e199d7b52a..9e66835b54b05 100644 --- a/src/stream/src/executor/over_window/general.rs +++ b/src/stream/src/executor/over_window/general.rs @@ -21,7 +21,7 @@ use futures::StreamExt; use futures_async_stream::try_stream; use itertools::Itertools; use risingwave_common::array::stream_record::Record; -use risingwave_common::array::{RowRef, StreamChunk}; +use risingwave_common::array::{Op, RowRef, StreamChunk}; use risingwave_common::catalog::Field; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::session_config::OverWindowCachePolicy as CachePolicy; @@ -225,26 +225,25 @@ impl OverWindowExecutor { chunk: &'a StreamChunk, ) -> impl Iterator>> { let mut changes_merged = BTreeMap::new(); - for record in chunk.records() { - match record { - Record::Insert { new_row } => { - let pk = DefaultOrdered(this.get_input_pk(new_row)); + for (op, row) in chunk.rows() { + let pk = DefaultOrdered(this.get_input_pk(row)); + match op { + Op::Insert | Op::UpdateInsert => { if let Some(prev_change) = changes_merged.get_mut(&pk) { match prev_change { Record::Delete { old_row } => { *prev_change = Record::Update { old_row: *old_row, - new_row, + new_row: row, }; } _ => panic!("inconsistent changes in input chunk"), } } else { - changes_merged.insert(pk, record); + changes_merged.insert(pk, Record::Insert { new_row: row }); } } - Record::Delete { old_row } => { - let pk = DefaultOrdered(this.get_input_pk(old_row)); + Op::Delete | Op::UpdateDelete => { if let Some(prev_change) = changes_merged.get_mut(&pk) { match prev_change { Record::Insert { .. } => { @@ -261,29 +260,7 @@ impl OverWindowExecutor { _ => panic!("inconsistent changes in input chunk"), } } else { - changes_merged.insert(pk, record); - } - } - Record::Update { old_row, new_row } => { - let pk = DefaultOrdered(this.get_input_pk(old_row)); - if let Some(prev_change) = changes_merged.get_mut(&pk) { - match prev_change { - Record::Insert { .. } => { - *prev_change = Record::Insert { new_row }; - } - Record::Update { - old_row: real_old_row, - .. 
- } => { - *prev_change = Record::Update { - old_row: *real_old_row, - new_row, - }; - } - _ => panic!("inconsistent changes in input chunk"), - } - } else { - changes_merged.insert(pk, record); + changes_merged.insert(pk, Record::Delete { old_row: row }); } } } @@ -368,7 +345,6 @@ impl OverWindowExecutor { &mut cache, this.cache_policy, &this.calls, - &this.partition_key_indices, &this.order_key_data_types, &this.order_key_order_types, &this.order_key_indices, diff --git a/src/stream/src/executor/over_window/over_partition.rs b/src/stream/src/executor/over_window/over_partition.rs index ab785acd9b681..42529a1c80587 100644 --- a/src/stream/src/executor/over_window/over_partition.rs +++ b/src/stream/src/executor/over_window/over_partition.rs @@ -19,11 +19,8 @@ use std::collections::{BTreeMap, HashSet, VecDeque}; use std::marker::PhantomData; use std::ops::{Bound, RangeInclusive}; -use futures::stream::select_all; -use futures::{stream, StreamExt, TryStreamExt}; use futures_async_stream::for_await; use risingwave_common::array::stream_record::Record; -use risingwave_common::hash::VnodeBitmapExt; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::session_config::OverWindowCachePolicy as CachePolicy; use risingwave_common::types::DataType; @@ -31,7 +28,6 @@ use risingwave_common::util::memcmp_encoding; use risingwave_common::util::sort_util::OrderType; use risingwave_expr::window_function::{FrameBounds, StateKey, WindowFuncCall}; use risingwave_storage::store::PrefetchOptions; -use risingwave_storage::table::merge_sort::merge_sort; use risingwave_storage::StateStore; use super::delta_btree_map::Change; @@ -75,7 +71,7 @@ pub(super) fn shrink_partition_cache( cache_policy: CachePolicy, recently_accessed_range: RangeInclusive, ) { - tracing::debug!( + tracing::trace!( this_partition_key=?this_partition_key, cache_policy=?cache_policy, recently_accessed_range=?recently_accessed_range, @@ -199,7 +195,7 @@ pub(super) fn shrink_partition_cache( } }; - tracing::debug!( + tracing::trace!( this_partition_key=?this_partition_key, retain_range=?(&start..=&end), "retain range in the range cache" @@ -230,12 +226,11 @@ pub(super) struct OverPartition<'a, S: StateStore> { cache_policy: CachePolicy, calls: &'a [WindowFuncCall], - partition_key_indices: &'a [usize], order_key_data_types: &'a [DataType], order_key_order_types: &'a [OrderType], order_key_indices: &'a [usize], input_pk_indices: &'a [usize], - state_key_to_table_pk_proj: Vec, + state_key_to_table_sub_pk_proj: Vec, _phantom: PhantomData, } @@ -248,20 +243,16 @@ impl<'a, S: StateStore> OverPartition<'a, S> { cache: &'a mut PartitionCache, cache_policy: CachePolicy, calls: &'a [WindowFuncCall], - partition_key_indices: &'a [usize], order_key_data_types: &'a [DataType], order_key_order_types: &'a [OrderType], order_key_indices: &'a [usize], input_pk_indices: &'a [usize], ) -> Self { // TODO(rc): move the calculation to executor? 
- let mut projection = Vec::with_capacity( - partition_key_indices.len() + order_key_indices.len() + input_pk_indices.len(), - ); + let mut projection = Vec::with_capacity(order_key_indices.len() + input_pk_indices.len()); let mut col_dedup = HashSet::new(); - for (proj_idx, key_idx) in partition_key_indices + for (proj_idx, key_idx) in order_key_indices .iter() - .chain(order_key_indices.iter()) .chain(input_pk_indices.iter()) .enumerate() { @@ -277,12 +268,11 @@ impl<'a, S: StateStore> OverPartition<'a, S> { cache_policy, calls, - partition_key_indices, order_key_data_types, order_key_order_types, order_key_indices, input_pk_indices, - state_key_to_table_pk_proj: projection, + state_key_to_table_sub_pk_proj: projection, _phantom: PhantomData, } } @@ -431,16 +421,16 @@ impl<'a, S: StateStore> OverPartition<'a, S> { if left_reached_sentinel { // TODO(rc): should count cache miss for this, and also the below - tracing::debug!(partition=?self.this_partition_key, "partition cache left extension triggered"); + tracing::trace!(partition=?self.this_partition_key, "partition cache left extension triggered"); let left_most = self.cache_real_first_key().unwrap_or(delta_first).clone(); self.extend_cache_leftward_by_n(table, &left_most).await?; } if right_reached_sentinel { - tracing::debug!(partition=?self.this_partition_key, "partition cache right extension triggered"); + tracing::trace!(partition=?self.this_partition_key, "partition cache right extension triggered"); let right_most = self.cache_real_last_key().unwrap_or(delta_last).clone(); self.extend_cache_rightward_by_n(table, &right_most).await?; } - tracing::debug!(partition=?self.this_partition_key, "partition cache extended"); + tracing::trace!(partition=?self.this_partition_key, "partition cache extended"); } } @@ -453,12 +443,14 @@ impl<'a, S: StateStore> OverPartition<'a, S> { return Ok(()); } - tracing::debug!(partition=?self.this_partition_key, "loading the whole partition into cache"); + tracing::trace!(partition=?self.this_partition_key, "loading the whole partition into cache"); let mut new_cache = PartitionCache::new(); // shouldn't use `new_empty_partition_cache` here because we don't want sentinels + let sub_range: &(Bound, Bound) = &(Bound::Unbounded, Bound::Unbounded); let table_iter = table - .iter_row_with_pk_prefix( + .iter_with_prefix( self.this_partition_key, + sub_range, PrefetchOptions::new_for_exhaust_iter(), ) .await?; @@ -506,17 +498,17 @@ impl<'a, S: StateStore> OverPartition<'a, S> { if self.cache_real_len() == 0 { // no normal entry in the cache, just load the given range - let table_pk_range = ( - Bound::Included(self.state_key_to_table_pk(range.start())?), - Bound::Included(self.state_key_to_table_pk(range.end())?), + let table_sub_range = ( + Bound::Included(self.state_key_to_table_sub_pk(range.start())?), + Bound::Included(self.state_key_to_table_sub_pk(range.end())?), ); tracing::debug!( partition=?self.this_partition_key, - table_pk_range=?table_pk_range, + table_sub_range=?table_sub_range, "cache is empty, just loading the given range" ); return self - .extend_cache_by_range_inner(table, table_pk_range) + .extend_cache_by_range_inner(table, table_sub_range) .await; } @@ -526,33 +518,33 @@ impl<'a, S: StateStore> OverPartition<'a, S> { if self.cache_left_is_sentinel() && *range.start() < cache_real_first_key { // extend leftward only if there's smallest sentinel - let table_pk_range = ( - Bound::Included(self.state_key_to_table_pk(range.start())?), - 
Bound::Excluded(self.state_key_to_table_pk(cache_real_first_key)?), + let table_sub_range = ( + Bound::Included(self.state_key_to_table_sub_pk(range.start())?), + Bound::Excluded(self.state_key_to_table_sub_pk(cache_real_first_key)?), ); - tracing::debug!( + tracing::trace!( partition=?self.this_partition_key, - table_pk_range=?table_pk_range, + table_sub_range=?table_sub_range, "loading the left half of given range" ); return self - .extend_cache_by_range_inner(table, table_pk_range) + .extend_cache_by_range_inner(table, table_sub_range) .await; } if self.cache_right_is_sentinel() && *range.end() > cache_real_last_key { // extend rightward only if there's largest sentinel - let table_pk_range = ( - Bound::Excluded(self.state_key_to_table_pk(cache_real_last_key)?), - Bound::Included(self.state_key_to_table_pk(range.end())?), + let table_sub_range = ( + Bound::Excluded(self.state_key_to_table_sub_pk(cache_real_last_key)?), + Bound::Included(self.state_key_to_table_sub_pk(range.end())?), ); - tracing::debug!( + tracing::trace!( partition=?self.this_partition_key, - table_pk_range=?table_pk_range, + table_sub_range=?table_sub_range, "loading the right half of given range" ); return self - .extend_cache_by_range_inner(table, table_pk_range) + .extend_cache_by_range_inner(table, table_sub_range) .await; } @@ -567,24 +559,18 @@ impl<'a, S: StateStore> OverPartition<'a, S> { async fn extend_cache_by_range_inner( &mut self, table: &StateTable, - table_pk_range: (Bound, Bound), + table_sub_range: (Bound, Bound), ) -> StreamExecutorResult<()> { - let streams = stream::iter(table.vnode_bitmap().iter_vnodes()) - .map(|vnode| { - table.iter_row_with_pk_range( - &table_pk_range, - vnode, - PrefetchOptions::new_for_exhaust_iter(), - ) - }) - .buffer_unordered(10) - .try_collect::>() - .await? - .into_iter() - .map(Box::pin); + let stream = table + .iter_with_prefix( + self.this_partition_key, + &table_sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) + .await?; #[for_await] - for row in select_all(streams) { + for row in stream { let row: OwnedRow = row?.into_owned_row(); let key = self.row_to_state_key(&row)?; self.range_cache.insert(CacheKey::from(key), row); @@ -645,25 +631,20 @@ impl<'a, S: StateStore> OverPartition<'a, S> { ) -> StreamExecutorResult<()> { let mut to_extend: VecDeque = VecDeque::with_capacity(MAGIC_BATCH_SIZE); { - let pk_range = ( - Bound::Included(self.this_partition_key.into_owned_row()), - Bound::Excluded(self.state_key_to_table_pk(range_to_exclusive)?), + let sub_range = ( + Bound::::Unbounded, + Bound::Excluded(self.state_key_to_table_sub_pk(range_to_exclusive)?), ); - let streams: Vec<_> = - futures::future::try_join_all(table.vnode_bitmap().iter_vnodes().map(|vnode| { - table.iter_row_with_pk_range( - &pk_range, - vnode, - PrefetchOptions::new_for_exhaust_iter(), - ) - })) - .await? 
- .into_iter() - .map(Box::pin) - .collect(); + let stream = table + .iter_with_prefix( + self.this_partition_key, + &sub_range, + PrefetchOptions::new_for_exhaust_iter(), + ) + .await?; #[for_await] - for row in merge_sort(streams) { + for row in stream { let row: OwnedRow = row?.into_owned_row(); // For leftward extension, we now must iterate the table in order from the beginning @@ -741,33 +722,22 @@ impl<'a, S: StateStore> OverPartition<'a, S> { ) -> StreamExecutorResult<()> { let mut n_extended = 0usize; { - let pk_range = ( - Bound::Excluded(self.state_key_to_table_pk(range_from_exclusive)?), - // currently we can't get the first possible key after this partition, so use - // `Unbounded` plus manual check for workaround + let sub_range = ( + Bound::Excluded(self.state_key_to_table_sub_pk(range_from_exclusive)?), Bound::::Unbounded, ); - let streams: Vec<_> = - futures::future::try_join_all(table.vnode_bitmap().iter_vnodes().map(|vnode| { - table.iter_row_with_pk_range(&pk_range, vnode, PrefetchOptions::default()) - })) - .await? - .into_iter() - .map(Box::pin) - .collect(); + let stream = table + .iter_with_prefix( + self.this_partition_key, + &sub_range, + PrefetchOptions::default(), + ) + .await?; #[for_await] - for row in merge_sort(streams) { + for row in stream { let row: OwnedRow = row?.into_owned_row(); - if !Row::eq( - self.this_partition_key, - (&row).project(self.partition_key_indices), - ) { - // we've reached the end of this partition - break; - } - let key = self.row_to_state_key(&row)?; self.range_cache.insert(CacheKey::from(key), row); @@ -786,17 +756,16 @@ impl<'a, S: StateStore> OverPartition<'a, S> { Ok(()) } - fn state_key_to_table_pk(&self, key: &StateKey) -> StreamExecutorResult { - Ok(self - .this_partition_key - .chain(memcmp_encoding::decode_row( - &key.order_key, - self.order_key_data_types, - self.order_key_order_types, - )?) - .chain(key.pk.as_inner()) - .project(&self.state_key_to_table_pk_proj) - .into_owned_row()) + /// Convert [`StateKey`] to sub pk (pk without partition key) as [`OwnedRow`]. + fn state_key_to_table_sub_pk(&self, key: &StateKey) -> StreamExecutorResult { + Ok(memcmp_encoding::decode_row( + &key.order_key, + self.order_key_data_types, + self.order_key_order_types, + )? + .chain(key.pk.as_inner()) + .project(&self.state_key_to_table_sub_pk_proj) + .into_owned_row()) } fn row_to_state_key(&self, full_row: impl Row + Copy) -> StreamExecutorResult { @@ -986,7 +955,7 @@ mod find_affected_ranges_tests { use itertools::Itertools; use risingwave_common::types::{DataType, ScalarImpl}; - use risingwave_expr::agg::{AggArgs, AggKind}; + use risingwave_expr::aggregate::{AggArgs, AggKind}; use risingwave_expr::window_function::{Frame, FrameBound, WindowFuncKind}; use super::*; diff --git a/src/stream/src/executor/project.rs b/src/stream/src/executor/project.rs index 4da00e7c9d94c..56a31bde901b9 100644 --- a/src/stream/src/executor/project.rs +++ b/src/stream/src/executor/project.rs @@ -34,7 +34,7 @@ pub struct ProjectExecutor { } struct Inner { - ctx: ActorContextRef, + _ctx: ActorContextRef, info: ExecutorInfo, /// Expressions of the current projection. 
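The over-partition changes above replace the per-vnode `iter_row_with_pk_range` scans (previously merged with `merge_sort`) with a single `iter_with_prefix` call that takes the partition key as a prefix plus a `(Bound, Bound)` sub-range over the remaining key columns. The following standalone sketch only illustrates that prefix-plus-sub-range semantics with a nested `BTreeMap` standing in for the state table; it is not RisingWave's `StateTable` API, and the keys and values are invented for the example.

use std::collections::BTreeMap;
use std::ops::Bound;

fn main() {
    // Toy model of a state table: partition key -> rows ordered by the rest of
    // the table pk (order key + input pk), here collapsed into a single u32.
    let mut table: BTreeMap<&str, BTreeMap<u32, &str>> = BTreeMap::new();
    table.entry("p1").or_default().insert(10, "row-a");
    table.entry("p1").or_default().insert(20, "row-b");
    table.entry("p1").or_default().insert(30, "row-c");
    table.entry("p2").or_default().insert(5, "row-d");

    // "Load the whole partition": prefix = "p1", sub-range = (Unbounded, Unbounded).
    let whole: Vec<_> = table["p1"]
        .range::<u32, _>((Bound::Unbounded, Bound::Unbounded))
        .map(|(_, v)| *v)
        .collect();
    assert_eq!(whole, vec!["row-a", "row-b", "row-c"]);

    // "Extend rightward from an exclusive key": prefix = "p1",
    // sub-range = (Excluded(20), Unbounded). The scan can never leave the
    // partition, so no manual end-of-partition check is needed.
    let right: Vec<_> = table["p1"]
        .range((Bound::Excluded(20u32), Bound::Unbounded))
        .map(|(_, v)| *v)
        .collect();
    assert_eq!(right, vec!["row-c"]);
}

Scoping the scan to the prefix is also why the old manual "have we left this partition yet" check could be dropped in `extend_cache_rightward_by_n`.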
@@ -82,7 +82,7 @@ impl ProjectExecutor { Self { input, inner: Inner { - ctx, + _ctx: ctx, info: ExecutorInfo { schema, pk_indices: info.pk_indices, @@ -138,11 +138,7 @@ impl Inner { let mut projected_columns = Vec::new(); for expr in &self.exprs { - let evaluated_expr = expr - .eval_infallible(&data_chunk, |err| { - self.ctx.on_compute_error(err, &self.info.identity) - }) - .await; + let evaluated_expr = expr.eval_infallible(&data_chunk).await; projected_columns.push(evaluated_expr); } let (_, vis) = data_chunk.into_parts(); @@ -160,12 +156,7 @@ impl Inner { let out_col_idx = *out_col_idx; let derived_watermark = watermark .clone() - .transform_with_expr(&self.exprs[out_col_idx], out_col_idx, |err| { - self.ctx.on_compute_error( - err, - &(self.info.identity.to_string() + "(when computing watermark)"), - ) - }) + .transform_with_expr(&self.exprs[out_col_idx], out_col_idx) .await; if let Some(derived_watermark) = derived_watermark { ret.push(derived_watermark); diff --git a/src/stream/src/executor/project_set.rs b/src/stream/src/executor/project_set.rs index f1962d456b2e1..6867e3d55bfde 100644 --- a/src/stream/src/executor/project_set.rs +++ b/src/stream/src/executor/project_set.rs @@ -45,7 +45,7 @@ pub struct ProjectSetExecutor { struct Inner { info: ExecutorInfo, - ctx: ActorContextRef, + _ctx: ActorContextRef, /// Expressions of the current project_section. select_list: Vec, chunk_size: usize, @@ -83,7 +83,7 @@ impl ProjectSetExecutor { let inner = Inner { info, - ctx, + _ctx: ctx, select_list, chunk_size, watermark_derivations, @@ -260,12 +260,7 @@ impl Inner { ProjectSetSelectItem::Expr(expr) => { watermark .clone() - .transform_with_expr(expr, expr_idx + PROJ_ROW_ID_OFFSET, |err| { - self.ctx.on_compute_error( - err, - &(self.info.identity.to_string() + "(when computing watermark)"), - ) - }) + .transform_with_expr(expr, expr_idx + PROJ_ROW_ID_OFFSET) .await } ProjectSetSelectItem::TableFunction(_) => { diff --git a/src/stream/src/executor/rearranged_chain.rs b/src/stream/src/executor/rearranged_chain.rs index 1ad43de432551..d2aaae9fd5025 100644 --- a/src/stream/src/executor/rearranged_chain.rs +++ b/src/stream/src/executor/rearranged_chain.rs @@ -135,6 +135,8 @@ impl RearrangedChainExecutor { .unbounded_send(RearrangedMessage::PhantomBarrier(first_barrier)) .unwrap(); + let mut processed_rows: u64 = 0; + { // 3. Rearrange stream, will yield the barriers polled from upstream to rearrange. let rearranged_barrier = @@ -162,8 +164,6 @@ impl RearrangedChainExecutor { let mut last_rearranged_epoch = create_epoch; let mut stop_rearrange_tx = Some(stop_rearrange_tx); - let mut processed_rows: u64 = 0; - #[for_await] for rearranged_msg in &mut rearranged { match rearranged_msg? 
{ @@ -223,7 +223,7 @@ impl RearrangedChainExecutor { continue; }; if let Some(barrier) = msg.as_barrier() { - self.progress.finish(barrier.epoch.curr); + self.progress.finish(barrier.epoch.curr, processed_rows); } yield msg; } @@ -236,7 +236,7 @@ impl RearrangedChainExecutor { for msg in upstream { let msg: Message = msg?; if let Some(barrier) = msg.as_barrier() { - self.progress.finish(barrier.epoch.curr); + self.progress.finish(barrier.epoch.curr, processed_rows); } yield msg; } diff --git a/src/stream/src/executor/receiver.rs b/src/stream/src/executor/receiver.rs index 64c4bfabc1b58..5b96cf6f9f8d8 100644 --- a/src/stream/src/executor/receiver.rs +++ b/src/stream/src/executor/receiver.rs @@ -140,11 +140,11 @@ impl Executor for ReceiverExecutor { Message::Chunk(chunk) => { self.metrics .actor_in_record_cnt - .with_label_values(&[&actor_id_str]) + .with_label_values(&[&actor_id_str, &fragment_id_str]) .inc_by(chunk.cardinality() as _); } Message::Barrier(barrier) => { - tracing::trace!( + tracing::debug!( target: "events::stream::barrier::path", actor_id = actor_id, "receiver receives barrier from path: {:?}", diff --git a/src/stream/src/executor/simple_agg.rs b/src/stream/src/executor/simple_agg.rs index b50f53977dc84..6e88241f48433 100644 --- a/src/stream/src/executor/simple_agg.rs +++ b/src/stream/src/executor/simple_agg.rs @@ -17,7 +17,7 @@ use futures_async_stream::try_stream; use risingwave_common::array::StreamChunk; use risingwave_common::catalog::Schema; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_expr::agg::{build_retractable, AggCall, BoxedAggregateFunction}; +use risingwave_expr::aggregate::{build_retractable, AggCall, BoxedAggregateFunction}; use risingwave_storage::StateStore; use super::agg_common::{AggExecutorArgs, SimpleAggExecutorExtraArgs}; @@ -169,8 +169,7 @@ impl SimpleAggExecutor { // Calculate the row visibility for every agg call. let mut call_visibilities = Vec::with_capacity(this.agg_calls.len()); for agg_call in &this.agg_calls { - let vis = - agg_call_filter_res(&this.actor_ctx, &this.info.identity, agg_call, &chunk).await?; + let vis = agg_call_filter_res(agg_call, &chunk).await?; call_visibilities.push(vis); } @@ -318,7 +317,7 @@ mod tests { use risingwave_common::array::stream_chunk::StreamChunkTestExt; use risingwave_common::catalog::Field; use risingwave_common::types::*; - use risingwave_expr::agg::AggCall; + use risingwave_expr::aggregate::AggCall; use risingwave_storage::memory::MemoryStateStore; use risingwave_storage::StateStore; diff --git a/src/stream/src/executor/sink.rs b/src/stream/src/executor/sink.rs index b36a5613ba7bb..70e63b4b33cd0 100644 --- a/src/stream/src/executor/sink.rs +++ b/src/stream/src/executor/sink.rs @@ -12,35 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::sync::Arc; -use std::time::Instant; +use std::mem; +use anyhow::anyhow; use futures::stream::select; use futures::{FutureExt, StreamExt}; use futures_async_stream::try_stream; use itertools::Itertools; -use prometheus::Histogram; use risingwave_common::array::stream_chunk::StreamChunkMut; -use risingwave_common::array::{merge_chunk_row, Op, StreamChunk}; +use risingwave_common::array::{merge_chunk_row, Op, StreamChunk, StreamChunkCompactor}; use risingwave_common::catalog::{ColumnCatalog, Field, Schema}; -use risingwave_common::util::epoch::EpochPair; use risingwave_connector::dispatch_sink; -use risingwave_connector::sink::catalog::SinkType; +use risingwave_connector::sink::catalog::{SinkId, SinkType}; +use risingwave_connector::sink::log_store::{ + LogReader, LogReaderExt, LogStoreFactory, LogWriter, LogWriterExt, +}; use risingwave_connector::sink::{ - build_sink, Sink, SinkImpl, SinkParam, SinkWriter, SinkWriterParam, + build_sink, LogSinker, Sink, SinkImpl, SinkParam, SinkWriterParam, }; use super::error::{StreamExecutorError, StreamExecutorResult}; use super::{BoxedExecutor, Executor, Message, PkIndices}; -use crate::common::log_store::{ - LogReader, LogStoreFactory, LogStoreReadItem, LogWriter, TruncateOffset, -}; -use crate::executor::monitor::StreamingMetrics; use crate::executor::{expect_first_barrier, ActorContextRef, BoxedMessageStream}; pub struct SinkExecutor { input: BoxedExecutor, - metrics: Arc, sink: SinkImpl, identity: String, pk_indices: PkIndices, @@ -53,10 +49,6 @@ pub struct SinkExecutor { sink_writer_param: SinkWriterParam, } -struct SinkMetrics { - sink_commit_duration_metrics: Histogram, -} - // Drop all the DELETE messages in this chunk and convert UPDATE INSERT into INSERT. fn force_append_only(c: StreamChunk) -> StreamChunk { let mut c: StreamChunkMut = c.into(); @@ -70,11 +62,23 @@ fn force_append_only(c: StreamChunk) -> StreamChunk { c.into() } +// Drop all the INSERT messages in this chunk and convert UPDATE DELETE into DELETE. 
+fn force_delete_only(c: StreamChunk) -> StreamChunk { + let mut c: StreamChunkMut = c.into(); + for (_, mut r) in c.to_rows_mut() { + match r.op() { + Op::Delete => {} + Op::Insert | Op::UpdateInsert => r.set_vis(false), + Op::UpdateDelete => r.set_op(Op::Delete), + } + } + c.into() +} + impl SinkExecutor { #[allow(clippy::too_many_arguments)] pub async fn new( input: BoxedExecutor, - metrics: Arc, sink_writer_param: SinkWriterParam, sink_param: SinkParam, columns: Vec, @@ -91,7 +95,6 @@ impl SinkExecutor { .collect(); Ok(Self { input, - metrics, sink, identity: format!("SinkExecutor {:X?}", sink_writer_param.executor_id), pk_indices, @@ -106,21 +109,23 @@ impl SinkExecutor { } fn execute_inner(self) -> BoxedMessageStream { - let sink_commit_duration_metrics = self - .metrics - .sink_commit_duration - .with_label_values(&[self.identity.as_str(), self.sink.get_connector()]); + let stream_key = self.pk_indices; - let sink_metrics = SinkMetrics { - sink_commit_duration_metrics, + let stream_key_sink_pk_mismatch = { + stream_key + .iter() + .any(|i| !self.sink_param.downstream_pk.contains(i)) }; let write_log_stream = Self::execute_write_log( self.input, - self.pk_indices, - self.log_writer, + stream_key, + self.log_writer + .monitored(self.sink_writer_param.sink_metrics.clone()), + self.sink_param.sink_id, self.sink_param.sink_type, self.actor_context, + stream_key_sink_pk_mismatch, ); dispatch_sink!(self.sink, sink, { @@ -128,7 +133,6 @@ impl SinkExecutor { sink, self.log_reader, self.input_columns, - sink_metrics, self.sink_writer_param, ); select(consume_log_stream.into_stream(), write_log_stream).boxed() @@ -140,8 +144,10 @@ impl SinkExecutor { input: BoxedExecutor, stream_key: PkIndices, mut log_writer: impl LogWriter, + sink_id: SinkId, sink_type: SinkType, actor_context: ActorContextRef, + stream_key_sink_pk_mismatch: bool, ) { let mut input = input.execute(); @@ -149,43 +155,131 @@ impl SinkExecutor { let epoch_pair = barrier.epoch; - log_writer - .init(EpochPair::new_test_epoch(epoch_pair.curr)) - .await?; + log_writer.init(epoch_pair).await?; // Propagate the first barrier yield Message::Barrier(barrier); - #[for_await] - for msg in input { - match msg? { - Message::Watermark(w) => yield Message::Watermark(w), - Message::Chunk(chunk) => { - // Compact the chunk to eliminate any useless intermediate result (e.g. UPDATE - // V->V). - let chunk = merge_chunk_row(chunk, &stream_key); - let chunk = if sink_type == SinkType::ForceAppendOnly { - // Force append-only by dropping UPDATE/DELETE messages. We do this when the - // user forces the sink to be append-only while it is actually not based on - // the frontend derivation result. - force_append_only(chunk) - } else { - chunk - }; - - log_writer.write_chunk(chunk.clone()).await?; - - // Use original chunk instead of the reordered one as the executor output. - yield Message::Chunk(chunk); + // for metrics + let sink_id_str = sink_id.to_string(); + let actor_id_str = actor_context.id.to_string(); + let fragment_id_str = actor_context.fragment_id.to_string(); + + // When stream key is different from the user defined primary key columns for sinks. 
The operations could be out of order. For example: + // stream key: a,b + // sink pk: a + + // original: + // (1,1) -> (1,2) + // (1,2) -> (1,3) + + // mv fragment 1: + // delete (1,1) + + // mv fragment 2: + // insert (1,2) + // delete (1,2) + + // mv fragment 3: + // insert (1,3) + + // merge to sink fragment: + // insert (1,3) + // insert (1,2) + // delete (1,2) + // delete (1,1) + // So we do additional compaction in the sink executor per barrier. + + // 1. compact all the changes with the stream key. + // 2. sink all the delete events and then sink all the insert events. + + // After compacting with the stream key, any two events with the same user-defined sink pk must have different stream keys. + // So under our internal streaming SQL semantics, the delete event does not target the record just inserted. + if stream_key_sink_pk_mismatch && sink_type != SinkType::AppendOnly { + let mut chunk_buffer = StreamChunkCompactor::new(stream_key.clone()); + let mut watermark = None; + #[for_await] + for msg in input { + match msg? { + Message::Watermark(w) => watermark = Some(w), + Message::Chunk(c) => { + actor_context + .streaming_metrics + .sink_input_row_count + .with_label_values(&[&sink_id_str, &actor_id_str, &fragment_id_str]) + .inc_by(c.capacity() as u64); + + chunk_buffer.push_chunk(c); + } + Message::Barrier(barrier) => { + let mut delete_chunks = vec![]; + let mut insert_chunks = vec![]; + for c in mem::replace( + &mut chunk_buffer, + StreamChunkCompactor::new(stream_key.clone()), + ) + .into_compacted_chunks() + { + if sink_type != SinkType::ForceAppendOnly { + // A force-append-only sink must not receive any delete events, so the + // delete chunks are only built for sinks that are not force-append-only. + delete_chunks.push(force_delete_only(c.clone())); + } + insert_chunks.push(force_append_only(c)); + } + + for c in delete_chunks.into_iter().chain(insert_chunks.into_iter()) { + log_writer.write_chunk(c.clone()).await?; + yield Message::Chunk(c); + } + if let Some(w) = mem::take(&mut watermark) { + yield Message::Watermark(w) + } + log_writer + .flush_current_epoch(barrier.epoch.curr, barrier.kind.is_checkpoint()) + .await?; + if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(actor_context.id) + { + log_writer.update_vnode_bitmap(vnode_bitmap); + } + yield Message::Barrier(barrier); + } + } - Message::Barrier(barrier) => { - log_writer - .flush_current_epoch(barrier.epoch.curr, barrier.kind.is_checkpoint()) - .await?; - if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(actor_context.id) { - log_writer.update_vnode_bitmap(vnode_bitmap); + } + } else { + #[for_await] + for msg in input { + match msg? { + Message::Watermark(w) => yield Message::Watermark(w), + Message::Chunk(chunk) => { + // Compact the chunk to eliminate any useless intermediate result (e.g. UPDATE + // V->V). + let chunk = merge_chunk_row(chunk, &stream_key); + let chunk = if sink_type == SinkType::ForceAppendOnly { + // Force append-only by dropping UPDATE/DELETE messages. We do this when the + // user forces the sink to be append-only while it is actually not based on + // the frontend derivation result. + force_append_only(chunk) + } else { + chunk + }; + + log_writer.write_chunk(chunk.clone()).await?; + + // Use original chunk instead of the reordered one as the executor output.
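The per-barrier compaction above flushes deletes before inserts because, after compacting by stream key, a delete and an insert sharing the same user-defined sink pk can still belong to different stream keys, and an upsert-style sink that matches rows only by its pk would otherwise wipe out the freshly inserted row. The toy program below walks through the `delete (1,1)` / `insert (1,2)` case from the comment; it is only a sketch, with a plain `HashMap` standing in for the downstream sink, and `Op` and `apply` invented for the illustration.

use std::collections::HashMap;

// Toy upsert sink keyed by the user-defined sink pk (column `a`), while the
// upstream stream key is the pair (`a`, `b`).
#[derive(Clone, Copy)]
enum Op {
    Insert((i64, i64)),
    Delete((i64, i64)),
}

fn apply(ops: &[Op]) -> HashMap<i64, (i64, i64)> {
    let mut sink = HashMap::new();
    for op in ops {
        match *op {
            // The sink matches rows only by `a` (= row.0).
            Op::Insert(row) => {
                sink.insert(row.0, row);
            }
            Op::Delete(row) => {
                sink.remove(&row.0);
            }
        }
    }
    sink
}

fn main() {
    // One barrier's compacted changes: delete (1,1) and insert (1,2) have
    // different stream keys but the same sink pk `a = 1`.

    // Deletes first, then inserts: the sink ends up holding (1, 2).
    let deletes_first = apply(&[Op::Delete((1, 1)), Op::Insert((1, 2))]);
    assert_eq!(deletes_first.get(&1), Some(&(1, 2)));

    // Inserts first, then deletes: the delete for the old row (1,1) also
    // removes the freshly written (1,2), leaving the sink empty.
    let inserts_first = apply(&[Op::Insert((1, 2)), Op::Delete((1, 1))]);
    assert_eq!(inserts_first.get(&1), None);
}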
+ yield Message::Chunk(chunk); + } + Message::Barrier(barrier) => { + log_writer + .flush_current_epoch(barrier.epoch.curr, barrier.kind.is_checkpoint()) + .await?; + if let Some(vnode_bitmap) = barrier.as_update_vnode_bitmap(actor_context.id) + { + log_writer.update_vnode_bitmap(vnode_bitmap); + } + yield Message::Barrier(barrier); } - yield Message::Barrier(barrier); } } } @@ -193,13 +287,12 @@ impl SinkExecutor { async fn execute_consume_log( sink: S, - mut log_reader: R, + log_reader: R, columns: Vec, - sink_metrics: SinkMetrics, sink_writer_param: SinkWriterParam, ) -> StreamExecutorResult { - log_reader.init().await?; - let mut sink_writer = sink.new_writer(sink_writer_param).await?; + let metrics = sink_writer_param.sink_metrics.clone(); + let log_sinker = sink.new_log_sinker(sink_writer_param).await?; let visible_columns = columns .iter() @@ -207,96 +300,20 @@ impl SinkExecutor { .filter_map(|(idx, column)| (!column.is_hidden).then_some(idx)) .collect_vec(); - #[derive(Debug)] - enum LogConsumerState { - /// Mark that the log consumer is not initialized yet - Uninitialized, - - /// Mark that a new epoch has begun. - EpochBegun { curr_epoch: u64 }, - - /// Mark that the consumer has just received a barrier - BarrierReceived { prev_epoch: u64 }, - } - - let mut state = LogConsumerState::Uninitialized; - - loop { - let (epoch, item): (u64, LogStoreReadItem) = log_reader.next_item().await?; - if let LogStoreReadItem::UpdateVnodeBitmap(_) = &item { - match &state { - LogConsumerState::BarrierReceived { .. } => {} - _ => unreachable!( - "update vnode bitmap can be accepted only right after \ - barrier, but current state is {:?}", - state - ), - } - } - // begin_epoch when not previously began - state = match state { - LogConsumerState::Uninitialized => { - sink_writer.begin_epoch(epoch).await?; - LogConsumerState::EpochBegun { curr_epoch: epoch } - } - LogConsumerState::EpochBegun { curr_epoch } => { - assert!( - epoch >= curr_epoch, - "new epoch {} should not be below the current epoch {}", - epoch, - curr_epoch - ); - LogConsumerState::EpochBegun { curr_epoch: epoch } - } - LogConsumerState::BarrierReceived { prev_epoch } => { - assert!( - epoch > prev_epoch, - "new epoch {} should be greater than prev epoch {}", - epoch, - prev_epoch - ); - sink_writer.begin_epoch(epoch).await?; - LogConsumerState::EpochBegun { curr_epoch: epoch } - } - }; - match item { - LogStoreReadItem::StreamChunk { chunk, .. } => { - let chunk = if visible_columns.len() != columns.len() { - // Do projection here because we may have columns that aren't visible to - // the downstream. 
- chunk.project(&visible_columns) - } else { - chunk - }; - if let Err(e) = sink_writer.write_batch(chunk).await { - sink_writer.abort().await?; - return Err(e.into()); - } - } - LogStoreReadItem::Barrier { is_checkpoint } => { - let prev_epoch = match state { - LogConsumerState::EpochBegun { curr_epoch } => curr_epoch, - _ => unreachable!("epoch must have begun before handling barrier"), - }; - if is_checkpoint { - let start_time = Instant::now(); - sink_writer.barrier(true).await?; - sink_metrics - .sink_commit_duration_metrics - .observe(start_time.elapsed().as_millis() as f64); - log_reader - .truncate(TruncateOffset::Barrier { epoch: prev_epoch }) - .await?; - } else { - sink_writer.barrier(false).await?; - } - state = LogConsumerState::BarrierReceived { prev_epoch } - } - LogStoreReadItem::UpdateVnodeBitmap(vnode_bitmap) => { - sink_writer.update_vnode_bitmap(vnode_bitmap).await?; + let log_reader = log_reader + .transform_chunk(move |chunk| { + if visible_columns.len() != columns.len() { + // Do projection here because we may have columns that aren't visible to + // the downstream. + chunk.project(&visible_columns) + } else { + chunk } - } - } + }) + .monitored(metrics); + + log_sinker.consume_log_and_sink(log_reader).await?; + Err(anyhow!("end of stream").into()) } } @@ -323,7 +340,7 @@ mod test { use risingwave_common::catalog::{ColumnDesc, ColumnId}; use super::*; - use crate::common::log_store::in_mem::BoundedInMemLogStoreFactory; + use crate::common::log_store_impl::in_mem::BoundedInMemLogStoreFactory; use crate::executor::test_utils::*; use crate::executor::ActorContext; @@ -396,14 +413,14 @@ mod test { .collect(), downstream_pk: pk.clone(), sink_type: SinkType::ForceAppendOnly, + format_desc: None, db_name: "test".into(), sink_from_name: "test".into(), }; let sink_executor = SinkExecutor::new( Box::new(mock), - Arc::new(StreamingMetrics::unused()), - SinkWriterParam::default(), + SinkWriterParam::for_test(), sink_param, columns.clone(), ActorContext::create(0), @@ -447,6 +464,147 @@ mod test { executor.next().await.unwrap().unwrap(); } + #[tokio::test] + async fn stream_key_sink_pk_mismatch() { + use risingwave_common::array::stream_chunk::StreamChunk; + use risingwave_common::array::StreamChunkTestExt; + use risingwave_common::types::DataType; + + use crate::executor::Barrier; + + let properties = maplit::hashmap! { + "connector".into() => "blackhole".into(), + }; + + // We have two visible columns and one hidden column. The hidden column will be pruned out + // within the sink executor. 
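+ // The stream key is columns 0 and 1 while the sink pk is only column 0, so this test exercises the mismatch (compaction) path.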
+ let columns = vec![ + ColumnCatalog { + column_desc: ColumnDesc::unnamed(ColumnId::new(0), DataType::Int64), + is_hidden: false, + }, + ColumnCatalog { + column_desc: ColumnDesc::unnamed(ColumnId::new(1), DataType::Int64), + is_hidden: false, + }, + ColumnCatalog { + column_desc: ColumnDesc::unnamed(ColumnId::new(2), DataType::Int64), + is_hidden: true, + }, + ]; + let schema: Schema = columns + .iter() + .map(|column| Field::from(column.column_desc.clone())) + .collect(); + + let mock = MockSource::with_messages( + schema, + vec![0, 1], + vec![ + Message::Barrier(Barrier::new_test_barrier(1)), + Message::Chunk(std::mem::take(&mut StreamChunk::from_pretty( + " I I I + + 1 1 10", + ))), + Message::Barrier(Barrier::new_test_barrier(2)), + Message::Chunk(std::mem::take(&mut StreamChunk::from_pretty( + " I I I + + 1 3 30", + ))), + Message::Chunk(std::mem::take(&mut StreamChunk::from_pretty( + " I I I + + 1 2 20 + - 1 2 20", + ))), + Message::Chunk(std::mem::take(&mut StreamChunk::from_pretty( + " I I I + - 1 1 10", + ))), + Message::Barrier(Barrier::new_test_barrier(3)), + ], + ); + + let sink_param = SinkParam { + sink_id: 0.into(), + properties, + columns: columns + .iter() + .filter(|col| !col.is_hidden) + .map(|col| col.column_desc.clone()) + .collect(), + downstream_pk: vec![0], + sink_type: SinkType::Upsert, + format_desc: None, + db_name: "test".into(), + sink_from_name: "test".into(), + }; + + let sink_executor = SinkExecutor::new( + Box::new(mock), + SinkWriterParam::for_test(), + sink_param, + columns.clone(), + ActorContext::create(0), + BoundedInMemLogStoreFactory::new(1), + vec![0, 1], + ) + .await + .unwrap(); + + let mut executor = SinkExecutor::execute(Box::new(sink_executor)); + + // Barrier message. + executor.next().await.unwrap().unwrap(); + + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!(chunk_msg.into_chunk().unwrap().cardinality(), 0); + + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!( + chunk_msg.into_chunk().unwrap().compact(), + StreamChunk::from_pretty( + " I I I + + 1 1 10", + ) + ); + + // Barrier message. + executor.next().await.unwrap().unwrap(); + + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!(chunk_msg.into_chunk().unwrap().cardinality(), 0); + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!(chunk_msg.into_chunk().unwrap().cardinality(), 0); + + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!( + chunk_msg.into_chunk().unwrap().compact(), + StreamChunk::from_pretty( + " I I I + - 1 1 10", + ) + ); + + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!( + chunk_msg.into_chunk().unwrap().compact(), + StreamChunk::from_pretty( + " I I I + + 1 3 30", + ) + ); + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!(chunk_msg.into_chunk().unwrap().cardinality(), 0); + let chunk_msg = executor.next().await.unwrap().unwrap(); + assert_eq!(chunk_msg.into_chunk().unwrap().cardinality(), 0); + + // Should not receive the third stream chunk message because the force-append-only sink + // executor will drop all DELETE messages. + + // The last barrier message. 
+ executor.next().await.unwrap().unwrap(); + } + #[tokio::test] async fn test_empty_barrier_sink() { use risingwave_common::types::DataType; @@ -494,14 +652,14 @@ mod test { .collect(), downstream_pk: pk.clone(), sink_type: SinkType::ForceAppendOnly, + format_desc: None, db_name: "test".into(), sink_from_name: "test".into(), }; let sink_executor = SinkExecutor::new( Box::new(mock), - Arc::new(StreamingMetrics::unused()), - SinkWriterParam::default(), + SinkWriterParam::for_test(), sink_param, columns, ActorContext::create(0), diff --git a/src/stream/src/executor/sort_buffer.rs b/src/stream/src/executor/sort_buffer.rs index 709597109af14..a1d6e3286ed5f 100644 --- a/src/stream/src/executor/sort_buffer.rs +++ b/src/stream/src/executor/sort_buffer.rs @@ -213,9 +213,9 @@ impl SortBuffer { let streams: Vec<_> = futures::future::try_join_all(buffer_table.vnode_bitmap().iter_vnodes().map(|vnode| { - buffer_table.iter_row_with_pk_range( - &pk_range, + buffer_table.iter_with_vnode( vnode, + &pk_range, PrefetchOptions::new_with_exhaust_iter(filler.capacity().is_none()), ) })) diff --git a/src/stream/src/executor/source/fetch_executor.rs b/src/stream/src/executor/source/fetch_executor.rs new file mode 100644 index 0000000000000..3e7ea84bcfcce --- /dev/null +++ b/src/stream/src/executor/source/fetch_executor.rs @@ -0,0 +1,350 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Debug, Formatter}; +use std::ops::Bound; +use std::sync::Arc; + +use either::Either; +use futures::pin_mut; +use futures::stream::{self, StreamExt}; +use futures_async_stream::try_stream; +use risingwave_common::catalog::{ColumnId, Schema, TableId}; +use risingwave_common::hash::VnodeBitmapExt; +use risingwave_common::row::{OwnedRow, Row}; +use risingwave_common::types::{ScalarRef, ScalarRefImpl}; +use risingwave_connector::source::filesystem::FsSplit; +use risingwave_connector::source::{ + BoxSourceWithStateStream, SourceContext, SourceCtrlOpts, SplitImpl, SplitMetaData, + StreamChunkWithState, +}; +use risingwave_connector::ConnectorParams; +use risingwave_source::source_desc::SourceDesc; +use risingwave_storage::store::PrefetchOptions; +use risingwave_storage::StateStore; + +use crate::executor::stream_reader::StreamReaderWithPause; +use crate::executor::{ + expect_first_barrier, ActorContextRef, BoxedExecutor, BoxedMessageStream, Executor, Message, + Mutation, PkIndices, PkIndicesRef, SourceStateTableHandler, StreamExecutorError, + StreamExecutorResult, StreamSourceCore, +}; + +const SPLIT_BATCH_SIZE: usize = 1000; + +type SplitBatch = Option>; + +pub struct FsFetchExecutor { + actor_ctx: ActorContextRef, + + identity: String, + + schema: Schema, + + pk_indices: PkIndices, + + /// Streaming source for external + stream_source_core: Option>, + + /// Upstream list executor. 
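+ /// Taken out (`Option::take`) once when the stream is built.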
+ upstream: Option, + + // control options for connector level + source_ctrl_opts: SourceCtrlOpts, + + // config for the connector node + connector_params: ConnectorParams, +} + +impl FsFetchExecutor { + #[allow(clippy::too_many_arguments)] + pub fn new( + actor_ctx: ActorContextRef, + schema: Schema, + pk_indices: PkIndices, + stream_source_core: StreamSourceCore, + executor_id: u64, + upstream: BoxedExecutor, + source_ctrl_opts: SourceCtrlOpts, + connector_params: ConnectorParams, + ) -> Self { + Self { + actor_ctx, + identity: format!("FsFetchExecutor {:X}", executor_id), + schema, + pk_indices, + stream_source_core: Some(stream_source_core), + upstream: Some(upstream), + source_ctrl_opts, + connector_params, + } + } + + async fn replace_with_new_batch_reader( + splits_on_fetch: &mut usize, + state_store_handler: &SourceStateTableHandler, + column_ids: Vec, + source_ctx: SourceContext, + source_desc: &SourceDesc, + stream: &mut StreamReaderWithPause, + ) -> StreamExecutorResult<()> { + let mut batch = Vec::with_capacity(SPLIT_BATCH_SIZE); + 'vnodes: for vnode in state_store_handler.state_store.vnodes().iter_vnodes() { + let table_iter = state_store_handler + .state_store + .iter_with_vnode( + vnode, + &(Bound::::Unbounded, Bound::::Unbounded), + PrefetchOptions::new_for_exhaust_iter(), + ) + .await?; + pin_mut!(table_iter); + + while let Some(item) = table_iter.next().await { + let row = item?; + let split = match row.datum_at(1) { + Some(ScalarRefImpl::Jsonb(jsonb_ref)) => { + SplitImpl::from(FsSplit::restore_from_json(jsonb_ref.to_owned_scalar())?) + } + _ => unreachable!(), + }; + batch.push(split); + + if batch.len() >= SPLIT_BATCH_SIZE { + break 'vnodes; + } + } + } + + if batch.is_empty() { + stream.replace_data_stream(stream::pending().boxed()); + } else { + *splits_on_fetch += batch.len(); + let batch_reader = + Self::build_batched_stream_reader(column_ids, source_ctx, source_desc, Some(batch)) + .await?; + stream.replace_data_stream(batch_reader); + } + + Ok(()) + } + + async fn build_batched_stream_reader( + column_ids: Vec, + source_ctx: SourceContext, + source_desc: &SourceDesc, + batch: SplitBatch, + ) -> StreamExecutorResult { + source_desc + .source + .stream_reader(batch, column_ids, Arc::new(source_ctx)) + .await + .map_err(StreamExecutorError::connector_error) + } + + fn build_source_ctx(&self, source_desc: &SourceDesc, source_id: TableId) -> SourceContext { + SourceContext::new_with_suppressor( + self.actor_ctx.id, + source_id, + self.actor_ctx.fragment_id, + source_desc.metrics.clone(), + self.source_ctrl_opts.clone(), + self.connector_params.connector_client.clone(), + self.actor_ctx.error_suppressor.clone(), + ) + } + + #[try_stream(ok = Message, error = StreamExecutorError)] + async fn into_stream(mut self) { + let mut upstream = self.upstream.take().unwrap().execute(); + let barrier = expect_first_barrier(&mut upstream).await?; + + let mut core = self.stream_source_core.take().unwrap(); + let mut state_store_handler = core.split_state_store; + + // Build source description from the builder. + let source_desc_builder = core.source_desc_builder.take().unwrap(); + + let source_desc = source_desc_builder + .build() + .map_err(StreamExecutorError::connector_error)?; + + // Initialize state table. 
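+ // The state table persists the file splits that have been assigned to this actor but not yet fully consumed.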
+ state_store_handler.init_epoch(barrier.epoch); + + let mut splits_on_fetch: usize = 0; + let mut stream = StreamReaderWithPause::::new( + upstream, + stream::pending().boxed(), + ); + + if barrier.is_pause_on_startup() { + stream.pause_stream(); + } + + // If it is a recovery startup, + // there can be file assignments in the state table. + // Hence we try building a reader first. + Self::replace_with_new_batch_reader( + &mut splits_on_fetch, + &state_store_handler, + core.column_ids.clone(), + self.build_source_ctx(&source_desc, core.source_id), + &source_desc, + &mut stream, + ) + .await?; + + yield Message::Barrier(barrier); + + while let Some(msg) = stream.next().await { + match msg { + Err(e) => { + tracing::error!("Fetch Error: {:?}", e); + splits_on_fetch = 0; + } + Ok(msg) => { + match msg { + // This branch will be preferred. + Either::Left(msg) => { + match &msg { + Message::Barrier(barrier) => { + if let Some(mutation) = barrier.mutation.as_deref() { + match mutation { + Mutation::Pause => stream.pause_stream(), + Mutation::Resume => stream.resume_stream(), + _ => (), + } + } + + state_store_handler + .state_store + .commit(barrier.epoch) + .await?; + + if let Some(vnode_bitmap) = + barrier.as_update_vnode_bitmap(self.actor_ctx.id) + { + // if _cache_may_stale, we must rebuild the stream to adjust vnode mappings + let (_prev_vnode_bitmap, cache_may_stale) = + state_store_handler + .state_store + .update_vnode_bitmap(vnode_bitmap); + + if cache_may_stale { + splits_on_fetch = 0; + } + } + + if splits_on_fetch == 0 { + Self::replace_with_new_batch_reader( + &mut splits_on_fetch, + &state_store_handler, + core.column_ids.clone(), + self.build_source_ctx(&source_desc, core.source_id), + &source_desc, + &mut stream, + ) + .await?; + } + + // Propagate the barrier. + yield msg; + } + // Receiving file assignments from upstream list executor, + // store into state table and try building a new reader. + Message::Chunk(chunk) => { + let file_assignment = chunk + .data_chunk() + .rows() + .map(|row| { + let filename = row.datum_at(0).unwrap().into_utf8(); + let size = row.datum_at(2).unwrap().into_int64(); + FsSplit::new(filename.to_owned(), 0, size as usize) + }) + .collect(); + state_store_handler.take_snapshot(file_assignment).await?; + } + _ => unreachable!(), + } + } + // StreamChunk from FsSourceReader, and the reader reads only one file. + // If the file read out, replace with a new file reader. + Either::Right(StreamChunkWithState { + chunk, + split_offset_mapping, + }) => { + let mapping = split_offset_mapping.unwrap(); + for (split_id, offset) in mapping { + let row = state_store_handler + .get(split_id.clone()) + .await? + .expect("The fs_split should be in the state table."); + let fs_split = match row.datum_at(1) { + Some(ScalarRefImpl::Jsonb(jsonb_ref)) => { + FsSplit::restore_from_json(jsonb_ref.to_owned_scalar())? 
+ } + _ => unreachable!(), + }; + + if offset.parse::().unwrap() >= fs_split.size { + splits_on_fetch -= 1; + state_store_handler.delete(split_id).await?; + } else { + state_store_handler + .set(split_id, fs_split.encode_to_json()) + .await?; + } + } + + yield Message::Chunk(chunk); + } + } + } + } + } + } +} + +impl Executor for FsFetchExecutor { + fn execute(self: Box) -> BoxedMessageStream { + self.into_stream().boxed() + } + + fn schema(&self) -> &Schema { + &self.schema + } + + fn pk_indices(&self) -> PkIndicesRef<'_> { + &self.pk_indices + } + + fn identity(&self) -> &str { + self.identity.as_str() + } +} + +impl Debug for FsFetchExecutor { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(core) = &self.stream_source_core { + f.debug_struct("FsFetchExecutor") + .field("source_id", &core.source_id) + .field("column_ids", &core.column_ids) + .field("pk_indices", &self.pk_indices) + .finish() + } else { + f.debug_struct("FsFetchExecutor").finish() + } + } +} diff --git a/src/stream/src/executor/source/fs_source_executor.rs b/src/stream/src/executor/source/fs_source_executor.rs index ae77adb427e23..bba0e30eb5712 100644 --- a/src/stream/src/executor/source/fs_source_executor.rs +++ b/src/stream/src/executor/source/fs_source_executor.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// *** NOTICE: TO BE DEPRECATED *** // + use std::fmt::{Debug, Formatter}; use std::sync::Arc; diff --git a/src/stream/src/executor/source/list_executor.rs b/src/stream/src/executor/source/list_executor.rs new file mode 100644 index 0000000000000..53e8854594ce4 --- /dev/null +++ b/src/stream/src/executor/source/list_executor.rs @@ -0,0 +1,231 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use anyhow::anyhow; +use either::Either; +use futures::StreamExt; +use futures_async_stream::try_stream; +use risingwave_common::array::Op; +use risingwave_common::catalog::Schema; +use risingwave_common::system_param::local_manager::SystemParamsReaderRef; +use risingwave_connector::source::filesystem::FsPage; +use risingwave_connector::source::{BoxTryStream, SourceCtrlOpts}; +use risingwave_connector::ConnectorParams; +use risingwave_source::source_desc::{SourceDesc, SourceDescBuilder}; +use risingwave_storage::StateStore; +use tokio::sync::mpsc::UnboundedReceiver; + +use crate::executor::error::StreamExecutorError; +use crate::executor::monitor::StreamingMetrics; +use crate::executor::stream_reader::StreamReaderWithPause; +use crate::executor::*; + +#[allow(dead_code)] +pub struct FsListExecutor { + actor_ctx: ActorContextRef, + + identity: String, + + schema: Schema, + + pk_indices: PkIndices, + + /// Streaming source for external + stream_source_core: Option>, + + /// Metrics for monitor. + metrics: Arc, + + /// Receiver of barrier channel. 
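+ /// The first barrier is received here; the receiver is then converted into the barrier message stream.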
+ barrier_receiver: Option>, + + /// System parameter reader to read barrier interval + system_params: SystemParamsReaderRef, + + // control options for connector level + source_ctrl_opts: SourceCtrlOpts, + + // config for the connector node + connector_params: ConnectorParams, +} + +impl FsListExecutor { + #[allow(clippy::too_many_arguments)] + pub fn new( + actor_ctx: ActorContextRef, + schema: Schema, + pk_indices: PkIndices, + stream_source_core: Option>, + metrics: Arc, + barrier_receiver: UnboundedReceiver, + system_params: SystemParamsReaderRef, + executor_id: u64, + source_ctrl_opts: SourceCtrlOpts, + connector_params: ConnectorParams, + ) -> Self { + Self { + actor_ctx, + identity: format!("FsListExecutor {:X}", executor_id), + schema, + pk_indices, + stream_source_core, + metrics, + barrier_receiver: Some(barrier_receiver), + system_params, + source_ctrl_opts, + connector_params, + } + } + + async fn build_chunked_paginate_stream( + &self, + source_desc: &SourceDesc, + ) -> StreamExecutorResult> { + let stream = source_desc + .source + .get_source_list() + .await + .map_err(StreamExecutorError::connector_error)?; + + Ok(stream + .map(|item| item.map(Self::map_fs_page_to_chunk)) + .boxed()) + } + + fn map_fs_page_to_chunk(page: FsPage) -> StreamChunk { + let rows = page + .into_iter() + .map(|split| { + ( + Op::Insert, + OwnedRow::new(vec![ + Some(ScalarImpl::Utf8(split.name.into_boxed_str())), + Some(ScalarImpl::Timestamp(split.timestamp)), + Some(ScalarImpl::Int64(split.size)), + ]), + ) + }) + .collect::>(); + StreamChunk::from_rows( + &rows, + &[DataType::Varchar, DataType::Timestamp, DataType::Int64], + ) + } + + #[try_stream(ok = Message, error = StreamExecutorError)] + async fn into_stream(mut self) { + let mut barrier_receiver = self.barrier_receiver.take().unwrap(); + let barrier = barrier_receiver + .recv() + .instrument_await("source_recv_first_barrier") + .await + .ok_or_else(|| { + anyhow!( + "failed to receive the first barrier, actor_id: {:?}, source_id: {:?}", + self.actor_ctx.id, + self.stream_source_core.as_ref().unwrap().source_id + ) + })?; + + let mut core = self.stream_source_core.unwrap(); + + // Build source description from the builder. + let source_desc_builder: SourceDescBuilder = core.source_desc_builder.take().unwrap(); + let source_desc = source_desc_builder + .build() + .map_err(StreamExecutorError::connector_error)?; + + // Return the ownership of `stream_source_core` to the source executor. + self.stream_source_core = Some(core); + + let chunked_paginate_stream = self.build_chunked_paginate_stream(&source_desc).await?; + + let barrier_stream = barrier_to_message_stream(barrier_receiver).boxed(); + let mut stream = + StreamReaderWithPause::::new(barrier_stream, chunked_paginate_stream); + + if barrier.is_pause_on_startup() { + stream.pause_stream(); + } + + yield Message::Barrier(barrier); + + while let Some(msg) = stream.next().await { + match msg { + Err(e) => { + tracing::warn!("encountered an error, recovering. {:?}", e); + // todo: rebuild stream here + } + Ok(msg) => match msg { + // Barrier arrives. + Either::Left(msg) => match &msg { + Message::Barrier(barrier) => { + if let Some(mutation) = barrier.mutation.as_deref() { + match mutation { + Mutation::Pause => stream.pause_stream(), + Mutation::Resume => stream.resume_stream(), + _ => (), + } + } + + // Propagate the barrier. + yield msg; + } + // Only barrier can be received. + _ => unreachable!(), + }, + // Chunked FsPage arrives. 
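+ // Each chunk carries (name, timestamp, size) rows built by `map_fs_page_to_chunk`.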
+ Either::Right(chunk) => { + yield Message::Chunk(chunk); + } + }, + } + } + } +} + +impl Executor for FsListExecutor { + fn execute(self: Box) -> BoxedMessageStream { + self.into_stream().boxed() + } + + fn schema(&self) -> &Schema { + &self.schema + } + + fn pk_indices(&self) -> PkIndicesRef<'_> { + &self.pk_indices + } + + fn identity(&self) -> &str { + self.identity.as_str() + } +} + +impl Debug for FsListExecutor { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(core) = &self.stream_source_core { + f.debug_struct("FsListExecutor") + .field("source_id", &core.source_id) + .field("column_ids", &core.column_ids) + .field("pk_indices", &self.pk_indices) + .finish() + } else { + f.debug_struct("FsListExecutor").finish() + } + } +} diff --git a/src/stream/src/executor/source/mod.rs b/src/stream/src/executor/source/mod.rs index 1b06120561f51..18f7346777d6b 100644 --- a/src/stream/src/executor/source/mod.rs +++ b/src/stream/src/executor/source/mod.rs @@ -19,12 +19,15 @@ mod fs_source_executor; pub use fs_source_executor::*; use risingwave_common::bail; pub use state_table_handler::*; +pub mod fetch_executor; +pub use fetch_executor::*; pub mod source_executor; +pub mod list_executor; pub mod state_table_handler; - use futures_async_stream::try_stream; +pub use list_executor::*; use tokio::sync::mpsc::UnboundedReceiver; use crate::executor::error::StreamExecutorError; diff --git a/src/stream/src/executor/source/state_table_handler.rs b/src/stream/src/executor/source/state_table_handler.rs index f1ee9f0c90d4b..d742e72a4c7a9 100644 --- a/src/stream/src/executor/source/state_table_handler.rs +++ b/src/stream/src/executor/source/state_table_handler.rs @@ -14,8 +14,10 @@ use std::collections::HashSet; use std::ops::{Bound, Deref}; +use std::sync::Arc; use futures::{pin_mut, StreamExt}; +use risingwave_common::buffer::Bitmap; use risingwave_common::catalog::{DatabaseId, SchemaId}; use risingwave_common::constants::hummock::PROPERTIES_RETENTION_SECOND_KEY; use risingwave_common::hash::VirtualNode; @@ -36,7 +38,7 @@ use risingwave_storage::StateStore; use crate::common::table::state_table::StateTable; use crate::executor::error::StreamExecutorError; -use crate::executor::StreamExecutorResult; +use crate::executor::{StreamExecutorResult, BACKFILL_STATE_KEY_SUFFIX}; const COMPLETE_SPLIT_PREFIX: &str = "SsGLdzRDqBuKzMf9bDap"; @@ -56,6 +58,21 @@ impl SourceStateTableHandler { } } + pub async fn from_table_catalog_with_vnodes( + table_catalog: &PbTable, + store: S, + vnodes: Option>, + ) -> Self { + // The state of source should not be cleaned up by retention_seconds + assert!(!table_catalog + .properties + .contains_key(&String::from(PROPERTIES_RETENTION_SECOND_KEY))); + + Self { + state_store: StateTable::from_table_catalog(table_catalog, store, vnodes).await, + } + } + pub fn init_epoch(&mut self, epoch: EpochPair) { self.state_store.init_epoch(epoch); } @@ -84,9 +101,9 @@ impl SourceStateTableHandler { // all source executor has vnode id zero let iter = self .state_store - .iter_row_with_pk_range( - &(start, end), + .iter_with_vnode( VirtualNode::ZERO, + &(start, end), PrefetchOptions::new_for_exhaust_iter(), ) .await?; @@ -159,7 +176,7 @@ impl SourceStateTableHandler { Ok(()) } - async fn delete(&mut self, key: SplitId) -> StreamExecutorResult<()> { + pub async fn delete(&mut self, key: SplitId) -> StreamExecutorResult<()> { if let Some(prev_row) = self.get(key).await? 
{ self.state_store.delete(prev_row); } @@ -203,16 +220,41 @@ impl SourceStateTableHandler { &mut self, stream_source_split: &SplitImpl, ) -> StreamExecutorResult> { - Ok(match self.get(stream_source_split.id()).await? { + let split_id = stream_source_split.id(); + Ok(match self.get(split_id.clone()).await? { None => None, Some(row) => match row.datum_at(1) { Some(ScalarRefImpl::Jsonb(jsonb_ref)) => { - Some(SplitImpl::restore_from_json(jsonb_ref.to_owned_scalar())?) + let mut split_impl = SplitImpl::restore_from_json(jsonb_ref.to_owned_scalar())?; + if let SplitImpl::MysqlCdc(ref mut split) = split_impl && let Some(mysql_split) = split.mysql_split.as_mut() { + // if the snapshot_done is not set, we should check whether the backfill is finished + if !mysql_split.inner.snapshot_done { + mysql_split.inner.snapshot_done = self.recover_cdc_snapshot_state(split_id).await?; + } + } + Some(split_impl) } _ => unreachable!(), }, }) } + + async fn recover_cdc_snapshot_state( + &mut self, + split_id: SplitId, + ) -> StreamExecutorResult { + let mut key = split_id.to_string(); + key.push_str(BACKFILL_STATE_KEY_SUFFIX); + + let flag = match self.get(key.into()).await? { + Some(row) => match row.datum_at(1) { + Some(ScalarRefImpl::Jsonb(jsonb_ref)) => jsonb_ref.as_bool()?, + _ => unreachable!("invalid cdc backfill persistent state"), + }, + None => false, + }; + Ok(flag) + } } // align with schema defined in `LogicalSource::infer_internal_table_catalog`. The function is used diff --git a/src/stream/src/executor/stateless_simple_agg.rs b/src/stream/src/executor/stateless_simple_agg.rs index 0f0411b5e277b..cc7e876f1d2de 100644 --- a/src/stream/src/executor/stateless_simple_agg.rs +++ b/src/stream/src/executor/stateless_simple_agg.rs @@ -18,7 +18,9 @@ use itertools::Itertools; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::catalog::Schema; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_expr::agg::{build_retractable, AggCall, AggregateState, BoxedAggregateFunction}; +use risingwave_expr::aggregate::{ + build_retractable, AggCall, AggregateState, BoxedAggregateFunction, +}; use super::aggregation::{agg_call_filter_res, generate_agg_schema}; use super::error::StreamExecutorError; @@ -26,7 +28,7 @@ use super::*; use crate::error::StreamResult; pub struct StatelessSimpleAggExecutor { - ctx: ActorContextRef, + _ctx: ActorContextRef, pub(super) input: Box, pub(super) info: ExecutorInfo, pub(super) aggs: Vec, @@ -53,15 +55,13 @@ impl Executor for StatelessSimpleAggExecutor { impl StatelessSimpleAggExecutor { async fn apply_chunk( - ctx: &ActorContextRef, - identity: &str, agg_calls: &[AggCall], aggs: &[BoxedAggregateFunction], states: &mut [AggregateState], chunk: &StreamChunk, ) -> StreamExecutorResult<()> { for ((agg, call), state) in aggs.iter().zip_eq_fast(agg_calls).zip_eq_fast(states) { - let vis = agg_call_filter_res(ctx, identity, call, chunk).await?; + let vis = agg_call_filter_res(call, chunk).await?; let chunk = chunk.project_with_vis(call.args.val_indices(), vis); agg.update(state, &chunk).await?; } @@ -71,7 +71,7 @@ impl StatelessSimpleAggExecutor { #[try_stream(ok = Message, error = StreamExecutorError)] async fn execute_inner(self) { let StatelessSimpleAggExecutor { - ctx, + _ctx, input, info, aggs, @@ -87,8 +87,7 @@ impl StatelessSimpleAggExecutor { match msg { Message::Watermark(_) => {} Message::Chunk(chunk) => { - Self::apply_chunk(&ctx, &info.identity, &agg_calls, &aggs, &mut states, &chunk) - .await?; + Self::apply_chunk(&agg_calls, &aggs, &mut 
states, &chunk).await?; is_dirty = true; } m @ Message::Barrier(_) => { @@ -139,7 +138,7 @@ impl StatelessSimpleAggExecutor { let aggs = agg_calls.iter().map(build_retractable).try_collect()?; Ok(StatelessSimpleAggExecutor { - ctx, + _ctx: ctx, input, info, aggs, diff --git a/src/stream/src/executor/temporal_join.rs b/src/stream/src/executor/temporal_join.rs index 7a523dc89540f..3c8cde63c4ca9 100644 --- a/src/stream/src/executor/temporal_join.rs +++ b/src/stream/src/executor/temporal_join.rs @@ -154,10 +154,11 @@ impl TemporalSide { async fn lookup(&mut self, key: &K, epoch: HummockEpoch) -> StreamExecutorResult { let table_id_str = self.source.table_id().to_string(); let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.id.to_string(); self.ctx .streaming_metrics .temporal_join_total_query_cache_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); let res = if self.cache.contains(key) { @@ -168,7 +169,7 @@ impl TemporalSide { self.ctx .streaming_metrics .temporal_join_cache_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); let pk_prefix = key.deserialize(&self.join_key_data_types)?; @@ -414,13 +415,14 @@ impl TemporalJoinExecutor let table_id_str = self.right_table.source.table_id().to_string(); let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.fragment_id.to_string(); #[for_await] for msg in align_input(self.left, self.right) { self.right_table.cache.evict(); self.ctx .streaming_metrics .temporal_join_cached_entry_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.right_table.cache.len() as i64); match msg? 
{ InternalMessage::WaterMark(watermark) => { @@ -447,9 +449,7 @@ impl TemporalJoinExecutor // check join condition let ok = if let Some(ref mut cond) = self.condition { let concat_row = left_row.chain(&right_row).into_owned_row(); - cond.eval_row_infallible(&concat_row, |err| { - self.ctx.on_compute_error(err, self.identity.as_str()) - }) + cond.eval_row_infallible(&concat_row) .await .map(|s| *s.as_bool()) .unwrap_or(false) diff --git a/src/stream/src/executor/test_utils.rs b/src/stream/src/executor/test_utils.rs index 3c06c36e41b47..bb4864ac04ef8 100644 --- a/src/stream/src/executor/test_utils.rs +++ b/src/stream/src/executor/test_utils.rs @@ -271,7 +271,7 @@ pub mod agg_executor { use risingwave_common::hash::SerializedKey; use risingwave_common::types::DataType; use risingwave_common::util::sort_util::OrderType; - use risingwave_expr::agg::{AggCall, AggKind}; + use risingwave_expr::aggregate::{AggCall, AggKind}; use risingwave_storage::StateStore; use crate::common::table::state_table::StateTable; @@ -454,6 +454,7 @@ pub mod agg_executor { extra: HashAggExecutorExtraArgs { group_key_indices, chunk_size: 1024, + max_dirty_groups_heap_size: 64 << 20, emit_on_window_close, }, }) diff --git a/src/stream/src/executor/top_n/group_top_n.rs b/src/stream/src/executor/top_n/group_top_n.rs index 78c12ee82f3cd..92681e3c31426 100644 --- a/src/stream/src/executor/top_n/group_top_n.rs +++ b/src/stream/src/executor/top_n/group_top_n.rs @@ -44,16 +44,14 @@ impl GroupTopNExecutor, ctx: ActorContextRef, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, group_by: Vec, state_table: StateTable, watermark_epoch: AtomicU64Ref, - pk_indices: PkIndices, ) -> StreamResult { - let info = input.info(); Ok(TopNExecutorWrapper { input, ctx: ctx.clone(), @@ -62,12 +60,10 @@ impl GroupTopNExecutor InnerGroupTopNExecutor { #[allow(clippy::too_many_arguments)] pub fn new( - input_info: ExecutorInfo, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, group_by: Vec, state_table: StateTable, watermark_epoch: AtomicU64Ref, ctx: ActorContextRef, - pk_indices: PkIndices, ) -> StreamResult { - let ExecutorInfo { - schema: input_schema, - .. 
- } = input_info; - let metrics_info = MetricsInfo::new( ctx.streaming_metrics.clone(), state_table.table_id(), @@ -126,15 +115,11 @@ impl InnerGroupTopNExecutor::new(state_table, cache_key_serde.clone()); Ok(Self { - info: ExecutorInfo { - schema: input_schema, - pk_indices, - identity: format!("GroupTopNExecutor {:X}", executor_id), - }, + info, offset: offset_and_limit.0, limit: offset_and_limit.1, managed_state, @@ -184,6 +169,7 @@ where let keys = K::build(&self.group_by, chunk.data_chunk())?; let table_id_str = self.managed_state.state_table.table_id().to_string(); let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.fragment_id.to_string(); for (r, group_cache_key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { let Some((op, row_ref)) = r else { continue; @@ -196,7 +182,7 @@ where self.ctx .streaming_metrics .group_top_n_total_query_cache_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); // If 'self.caches' does not already have a cache for the current group, create a new // cache for it and insert it into `self.caches` @@ -204,7 +190,7 @@ where self.ctx .streaming_metrics .group_top_n_cache_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); let mut topn_cache = TopNCache::new(self.offset, self.limit, self.schema().data_types()); @@ -241,7 +227,7 @@ where self.ctx .streaming_metrics .group_top_n_cached_entry_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.caches.len() as i64); generate_output(res_rows, res_ops, self.schema()) } @@ -405,20 +391,25 @@ mod tests { &pk_indices(), ) .await; - let a = GroupTopNExecutor::::new( - source as Box, - ActorContext::create(0), - storage_key(), - (0, 2), - order_by_1(), - 1, - vec![1], - state_table, - Arc::new(AtomicU64::new(0)), - pk_indices(), - ) - .unwrap(); - let top_n_executor = Box::new(a); + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), // this includes group key as prefix + identity: "GroupTopNExecutor 1".to_string(), + }; + let top_n_executor = Box::new( + GroupTopNExecutor::::new( + source as Box, + ActorContext::create(0), + info, + storage_key(), + (0, 2), + order_by_1(), + vec![1], + state_table, + Arc::new(AtomicU64::new(0)), + ) + .unwrap(), + ); let mut top_n_executor = top_n_executor.execute(); // consume the init barrier @@ -502,18 +493,22 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), // this includes group key as prefix + identity: "GroupTopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( GroupTopNExecutor::::new( source as Box, ActorContext::create(0), + info, storage_key(), (1, 2), order_by_1(), - 1, vec![1], state_table, Arc::new(AtomicU64::new(0)), - pk_indices(), ) .unwrap(), ); @@ -593,18 +588,22 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), // this includes group key as prefix + identity: "GroupTopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( GroupTopNExecutor::::new( source as Box, ActorContext::create(0), + info, storage_key(), (0, 2), order_by_2(), - 1, vec![1, 2], state_table, Arc::new(AtomicU64::new(0)), - pk_indices(), ) 
.unwrap(), ); diff --git a/src/stream/src/executor/top_n/group_top_n_appendonly.rs b/src/stream/src/executor/top_n/group_top_n_appendonly.rs index 140a06984e586..f8f3d4887b970 100644 --- a/src/stream/src/executor/top_n/group_top_n_appendonly.rs +++ b/src/stream/src/executor/top_n/group_top_n_appendonly.rs @@ -62,16 +62,14 @@ impl pub fn new( input: Box, ctx: ActorContextRef, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, group_by: Vec, state_table: StateTable, watermark_epoch: AtomicU64Ref, - pk_indices: PkIndices, ) -> StreamResult { - let info = input.info(); Ok(TopNExecutorWrapper { input, ctx: ctx.clone(), @@ -80,12 +78,10 @@ impl storage_key, offset_and_limit, order_by, - executor_id, group_by, state_table, watermark_epoch, ctx, - pk_indices, )?, }) } @@ -122,22 +118,15 @@ impl { #[allow(clippy::too_many_arguments)] pub fn new( - input_info: ExecutorInfo, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, group_by: Vec, state_table: StateTable, watermark_epoch: AtomicU64Ref, ctx: ActorContextRef, - pk_indices: PkIndices, ) -> StreamResult { - let ExecutorInfo { - schema: input_schema, - .. - } = input_info; - let metrics_info = MetricsInfo::new( ctx.streaming_metrics.clone(), state_table.table_id(), @@ -146,15 +135,11 @@ impl ); let cache_key_serde = - create_cache_key_serde(&storage_key, &input_schema, &order_by, &group_by); + create_cache_key_serde(&storage_key, &info.schema, &order_by, &group_by); let managed_state = ManagedTopNState::::new(state_table, cache_key_serde.clone()); Ok(Self { - info: ExecutorInfo { - schema: input_schema, - pk_indices, - identity: format!("AppendOnlyGroupTopNExecutor {:X}", executor_id), - }, + info, offset: offset_and_limit.0, limit: offset_and_limit.1, managed_state, @@ -181,6 +166,7 @@ where let row_deserializer = RowDeserializer::new(data_types.clone()); let table_id_str = self.managed_state.state_table.table_id().to_string(); let actor_id_str = self.ctx.id.to_string(); + let fragment_id_str = self.ctx.fragment_id.to_string(); for (r, group_cache_key) in chunk.rows_with_holes().zip_eq_debug(keys.iter()) { let Some((op, row_ref)) = r else { continue; @@ -193,7 +179,7 @@ where self.ctx .streaming_metrics .group_top_n_appendonly_total_query_cache_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); // If 'self.caches' does not already have a cache for the current group, create a new // cache for it and insert it into `self.caches` @@ -201,7 +187,7 @@ where self.ctx .streaming_metrics .group_top_n_appendonly_cache_miss_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); let mut topn_cache = TopNCache::new(self.offset, self.limit, data_types.clone()); self.managed_state @@ -224,7 +210,7 @@ where self.ctx .streaming_metrics .group_top_n_appendonly_cached_entry_count - .with_label_values(&[&table_id_str, &actor_id_str]) + .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.caches.len() as i64); generate_output(res_rows, res_ops, self.schema()) } diff --git a/src/stream/src/executor/top_n/top_n_appendonly.rs b/src/stream/src/executor/top_n/top_n_appendonly.rs index aa93e2f2f519e..a56b43e4c5903 100644 --- a/src/stream/src/executor/top_n/top_n_appendonly.rs +++ b/src/stream/src/executor/top_n/top_n_appendonly.rs @@ -41,14 +41,12 @@ impl 
AppendOnlyTopNExecutor pub fn new( input: Box, ctx: ActorContextRef, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, state_table: StateTable, ) -> StreamResult { - let info = input.info(); - Ok(TopNExecutorWrapper { input, ctx, @@ -57,7 +55,6 @@ impl AppendOnlyTopNExecutor storage_key, offset_and_limit, order_by, - executor_id, state_table, )?, }) @@ -84,30 +81,21 @@ pub struct InnerAppendOnlyTopNExecutor { impl InnerAppendOnlyTopNExecutor { #[allow(clippy::too_many_arguments)] pub fn new( - input_info: ExecutorInfo, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, state_table: StateTable, ) -> StreamResult { - let ExecutorInfo { - pk_indices, schema, .. - } = input_info; - let num_offset = offset_and_limit.0; let num_limit = offset_and_limit.1; - let cache_key_serde = create_cache_key_serde(&storage_key, &schema, &order_by, &[]); + let cache_key_serde = create_cache_key_serde(&storage_key, &info.schema, &order_by, &[]); let managed_state = ManagedTopNState::::new(state_table, cache_key_serde.clone()); - let data_types = schema.data_types(); + let data_types = info.schema.data_types(); Ok(Self { - info: ExecutorInfo { - schema, - pk_indices, - identity: format!("AppendOnlyTopNExecutor {:X}", executor_id), - }, + info, managed_state, storage_key_indices: storage_key.into_iter().map(|op| op.column_index).collect(), cache: TopNCache::new(num_offset, num_limit, data_types), @@ -180,7 +168,7 @@ mod tests { use super::AppendOnlyTopNExecutor; use crate::executor::test_utils::top_n_executor::create_in_memory_state_table; use crate::executor::test_utils::MockSource; - use crate::executor::{ActorContext, Barrier, Executor, Message, PkIndices}; + use crate::executor::{ActorContext, Barrier, Executor, ExecutorInfo, Message, PkIndices}; fn create_stream_chunks() -> Vec { let chunk1 = StreamChunk::from_pretty( @@ -261,14 +249,19 @@ mod tests { ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "AppendOnlyTopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( AppendOnlyTopNExecutor::<_, false>::new( source as Box, ActorContext::create(0), + info, storage_key, (0, 5), order_by(), - 1, state_table, ) .unwrap(), @@ -343,14 +336,19 @@ mod tests { ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "AppendOnlyTopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( AppendOnlyTopNExecutor::<_, false>::new( source as Box, ActorContext::create(0), + info, storage_key(), (3, 4), order_by(), - 1, state_table, ) .unwrap(), diff --git a/src/stream/src/executor/top_n/top_n_plain.rs b/src/stream/src/executor/top_n/top_n_plain.rs index cfb71053b18e4..e3cc70b9fc0b7 100644 --- a/src/stream/src/executor/top_n/top_n_plain.rs +++ b/src/stream/src/executor/top_n/top_n_plain.rs @@ -36,14 +36,12 @@ impl TopNExecutor { pub fn new( input: Box, ctx: ActorContextRef, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, state_table: StateTable, ) -> StreamResult { - let info = input.info(); - Ok(TopNExecutorWrapper { input, ctx, @@ -52,7 +50,6 @@ impl TopNExecutor { storage_key, offset_and_limit, order_by, - executor_id, state_table, )?, }) @@ -67,22 +64,14 @@ impl TopNExecutor { pub fn new_with_ties_for_test( input: Box, ctx: ActorContextRef, + info: ExecutorInfo, storage_key: Vec, 
offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, state_table: StateTable, ) -> StreamResult { - let info = input.info(); - - let mut inner = InnerTopNExecutor::new( - info, - storage_key, - offset_and_limit, - order_by, - executor_id, - state_table, - )?; + let mut inner = + InnerTopNExecutor::new(info, storage_key, offset_and_limit, order_by, state_table)?; inner.cache.high_capacity = 2; @@ -115,29 +104,21 @@ impl InnerTopNExecutor { /// into `CacheKey`. #[allow(clippy::too_many_arguments)] pub fn new( - input_info: ExecutorInfo, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, state_table: StateTable, ) -> StreamResult { - let ExecutorInfo { - pk_indices, schema, .. - } = input_info; let num_offset = offset_and_limit.0; let num_limit = offset_and_limit.1; - let cache_key_serde = create_cache_key_serde(&storage_key, &schema, &order_by, &[]); + let cache_key_serde = create_cache_key_serde(&storage_key, &info.schema, &order_by, &[]); let managed_state = ManagedTopNState::::new(state_table, cache_key_serde.clone()); - let data_types = schema.data_types(); + let data_types = info.schema.data_types(); Ok(Self { - info: ExecutorInfo { - schema, - pk_indices, - identity: format!("TopNExecutor {:X}", executor_id), - }, + info, managed_state, storage_key_indices: storage_key.into_iter().map(|op| op.column_index).collect(), cache: TopNCache::new(num_offset, num_limit, data_types), @@ -311,14 +292,20 @@ mod tests { &pk_indices(), ) .await; + + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::<_, false>::new( source as Box, ActorContext::create(0), + info, storage_key(), (3, 1000), order_by(), - 1, state_table, ) .unwrap(), @@ -407,14 +394,19 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::<_, false>::new( source as Box, ActorContext::create(0), + info, storage_key(), (0, 4), order_by(), - 1, state_table, ) .unwrap(), @@ -515,14 +507,19 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::<_, true>::new( source as Box, ActorContext::create(0), + info, storage_key(), (0, 4), order_by(), - 1, state_table, ) .unwrap(), @@ -622,14 +619,19 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::<_, false>::new( source as Box, ActorContext::create(0), + info, storage_key(), (3, 4), order_by(), - 1, state_table, ) .unwrap(), @@ -849,14 +851,19 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::<_, false>::new( source as Box, ActorContext::create(0), + info, storage_key(), (1, 3), order_by(), - 1, state_table, ) .unwrap(), @@ -927,14 +934,20 @@ mod tests { state_store.clone(), ) .await; + let source = create_source_new_before_recovery(); + let info = 
ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::<_, false>::new( - create_source_new_before_recovery() as Box, + source as Box, ActorContext::create(0), + info, storage_key(), (1, 3), order_by(), - 1, state_table, ) .unwrap(), @@ -981,14 +994,20 @@ mod tests { .await; // recovery + let source = create_source_new_after_recovery(); + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor_after_recovery = Box::new( TopNExecutor::<_, false>::new( - create_source_new_after_recovery() as Box, + source as Box, ActorContext::create(0), + info, storage_key(), (1, 3), order_by(), - 1, state_table, ) .unwrap(), @@ -1110,14 +1129,19 @@ mod tests { &pk_indices(), ) .await; + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::new_with_ties_for_test( source as Box, ActorContext::create(0), + info, storage_key(), (0, 3), order_by(), - 1, state_table, ) .unwrap(), @@ -1260,14 +1284,20 @@ mod tests { state_store.clone(), ) .await; + let source = create_source_before_recovery(); + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor = Box::new( TopNExecutor::new_with_ties_for_test( - create_source_before_recovery() as Box, + source as Box, ActorContext::create(0), + info, storage_key(), (0, 3), order_by(), - 1, state_table, ) .unwrap(), @@ -1317,14 +1347,20 @@ mod tests { .await; // recovery + let source = create_source_after_recovery(); + let info = ExecutorInfo { + schema: source.schema().clone(), + pk_indices: source.pk_indices().to_vec(), + identity: "TopNExecutor 1".to_string(), + }; let top_n_executor_after_recovery = Box::new( TopNExecutor::new_with_ties_for_test( - create_source_after_recovery() as Box, + source as Box, ActorContext::create(0), + info, storage_key(), (0, 3), order_by(), - 1, state_table, ) .unwrap(), diff --git a/src/stream/src/executor/top_n/top_n_state.rs b/src/stream/src/executor/top_n/top_n_state.rs index 87d19e8550861..841e7f5bb50d7 100644 --- a/src/stream/src/executor/top_n/top_n_state.rs +++ b/src/stream/src/executor/top_n/top_n_state.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::ops::Bound; + use futures::{pin_mut, StreamExt}; use risingwave_common::row::{OwnedRow, Row, RowExt}; use risingwave_common::util::epoch::EpochPair; @@ -81,9 +83,10 @@ impl ManagedTopNState { offset: usize, limit: Option, ) -> StreamExecutorResult> { + let sub_range: &(Bound, Bound) = &(Bound::Unbounded, Bound::Unbounded); let state_table_iter = self .state_table - .iter_row_with_pk_prefix(&group_key, Default::default()) + .iter_with_prefix(&group_key, sub_range, Default::default()) .await?; pin_mut!(state_table_iter); @@ -118,10 +121,12 @@ impl ManagedTopNState { cache_size_limit: usize, ) -> StreamExecutorResult<()> { let cache = &mut topn_cache.high; + let sub_range: &(Bound, Bound) = &(Bound::Unbounded, Bound::Unbounded); let state_table_iter = self .state_table - .iter_row_with_pk_prefix( + .iter_with_prefix( &group_key, + sub_range, PrefetchOptions { exhaust_iter: cache_size_limit == usize::MAX, }, @@ -165,11 +170,12 @@ impl ManagedTopNState { assert!(topn_cache.low.is_empty()); assert!(topn_cache.middle.is_empty()); assert!(topn_cache.high.is_empty()); - + let sub_range: &(Bound, Bound) = &(Bound::Unbounded, Bound::Unbounded); let state_table_iter = self .state_table - .iter_row_with_pk_prefix( + .iter_with_prefix( &group_key, + sub_range, PrefetchOptions { exhaust_iter: topn_cache.limit == usize::MAX, }, diff --git a/src/stream/src/executor/values.rs b/src/stream/src/executor/values.rs index 512e9f6c28da3..624b2531bf7bd 100644 --- a/src/stream/src/executor/values.rs +++ b/src/stream/src/executor/values.rs @@ -83,10 +83,23 @@ impl ValuesExecutor { .unwrap(); let emit = barrier.is_newly_added(self.ctx.id); + let paused_on_startup = barrier.is_pause_on_startup(); yield Message::Barrier(barrier); + // If it's failover, do not evaluate rows (assume they have been yielded) if emit { + if paused_on_startup { + // Wait for the data stream to be resumed before yielding the chunks. 
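+ // Keep forwarding barriers until one signals resume.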
+ while let Some(barrier) = barrier_receiver.recv().await { + let is_resume = barrier.is_resume(); + yield Message::Barrier(barrier); + if is_resume { + break; + } + } + } + let cardinality = schema.len(); ensure!(cardinality > 0); while !rows.is_empty() { @@ -99,11 +112,7 @@ impl ValuesExecutor { let mut array_builders = schema.create_array_builders(chunk_size); for row in rows.by_ref().take(chunk_size) { for (expr, builder) in row.into_iter().zip_eq_fast(&mut array_builders) { - let out = expr - .eval_infallible(&one_row_chunk, |err| { - self.ctx.on_compute_error(err, self.identity.as_str()) - }) - .await; + let out = expr.eval_infallible(&one_row_chunk).await; builder.append_array(&out); } } @@ -123,7 +132,7 @@ impl ValuesExecutor { while let Some(barrier) = barrier_receiver.recv().await { if emit { - progress.finish(barrier.epoch.curr); + progress.finish(barrier.epoch.curr, 0); } yield Message::Barrier(barrier); } diff --git a/src/stream/src/executor/watermark_filter.rs b/src/stream/src/executor/watermark_filter.rs index 5f2a92e4e7f7b..ad332112ef269 100644 --- a/src/stream/src/executor/watermark_filter.rs +++ b/src/stream/src/executor/watermark_filter.rs @@ -23,7 +23,7 @@ use risingwave_common::row::{OwnedRow, Row}; use risingwave_common::types::{DataType, DefaultOrd, ScalarImpl}; use risingwave_common::{bail, row}; use risingwave_expr::expr::{ - build_func, BoxedExpression, Expression, InputRefExpression, LiteralExpression, + build_func_non_strict, BoxedExpression, Expression, InputRefExpression, LiteralExpression, }; use risingwave_expr::Result as ExprResult; use risingwave_pb::expr::expr_node::Type; @@ -36,6 +36,7 @@ use super::{ }; use crate::common::table::state_table::StateTable; use crate::executor::{expect_first_barrier, Watermark}; +use crate::task::ActorEvalErrorReport; /// The executor will generate a `Watermark` after each chunk. /// This will also guarantee all later rows with event time **less than** the watermark will be @@ -58,8 +59,13 @@ impl WatermarkFilterExecutor { event_time_col_idx: usize, ctx: ActorContextRef, table: StateTable, + executor_id: u64, ) -> Self { - let info = input.info(); + let info = ExecutorInfo { + schema: input.info().schema, + pk_indices: input.info().pk_indices, + identity: format!("WatermarkFilterExecutor {:X}", executor_id), + }; Self { input, @@ -106,6 +112,11 @@ impl WatermarkFilterExecutor { mut table, } = *self; + let eval_error_report = ActorEvalErrorReport { + actor_context: ctx.clone(), + identity: info.identity.into(), + }; + let watermark_type = watermark_expr.return_type(); assert_eq!( watermark_type, @@ -119,16 +130,17 @@ impl WatermarkFilterExecutor { yield Message::Barrier(first_barrier); // Initiate and yield the first watermark. 
- let mut current_watermark = - Self::get_global_max_watermark(&table, watermark_type.clone()).await?; + let mut current_watermark = Self::get_global_max_watermark(&table).await?; - let mut last_checkpoint_watermark = watermark_type.min_value(); + let mut last_checkpoint_watermark = None; - yield Message::Watermark(Watermark::new( - event_time_col_idx, - watermark_type.clone(), - current_watermark.clone(), - )); + if let Some(watermark) = current_watermark.clone() { + yield Message::Watermark(Watermark::new( + event_time_col_idx, + watermark_type.clone(), + watermark.clone(), + )); + } // If the input is idle let mut idle_input = true; @@ -145,18 +157,20 @@ impl WatermarkFilterExecutor { continue; } - let watermark_array = watermark_expr - .eval_infallible(chunk.data_chunk(), |err| { - ctx.on_compute_error(err, &info.identity) - }) - .await; + let watermark_array = watermark_expr.eval_infallible(chunk.data_chunk()).await; // Build the expression to calculate watermark filter. - let watermark_filter_expr = Self::build_watermark_filter_expr( - watermark_type.clone(), - event_time_col_idx, - current_watermark.clone(), - )?; + let watermark_filter_expr = current_watermark + .clone() + .map(|watermark| { + Self::build_watermark_filter_expr( + watermark_type.clone(), + event_time_col_idx, + watermark, + eval_error_report.clone(), + ) + }) + .transpose()?; // NULL watermark should not be considered. let max_watermark = watermark_array @@ -166,41 +180,49 @@ impl WatermarkFilterExecutor { if let Some(max_watermark) = max_watermark { // Assign a new watermark. - current_watermark = cmp::max_by( - current_watermark, + current_watermark = Some(current_watermark.map_or( max_watermark.into_scalar_impl(), - DefaultOrd::default_cmp, - ); + |watermark| { + cmp::max_by( + watermark, + max_watermark.into_scalar_impl(), + DefaultOrd::default_cmp, + ) + }, + )); } - let pred_output = watermark_filter_expr - .eval_infallible(chunk.data_chunk(), |err| { - ctx.on_compute_error(err, &info.identity) - }) - .await; - - if let Some(output_chunk) = FilterExecutor::filter(chunk, pred_output)? { - yield Message::Chunk(output_chunk); - }; - - idle_input = false; - yield Message::Watermark(Watermark::new( - event_time_col_idx, - watermark_type.clone(), - current_watermark.clone(), - )); + if let Some(expr) = watermark_filter_expr { + let pred_output = expr.eval_infallible(chunk.data_chunk()).await; + + if let Some(output_chunk) = FilterExecutor::filter(chunk, pred_output)? { + yield Message::Chunk(output_chunk); + }; + } else { + // No watermark + yield Message::Chunk(chunk); + } + + if let Some(watermark) = current_watermark.clone() { + idle_input = false; + yield Message::Watermark(Watermark::new( + event_time_col_idx, + watermark_type.clone(), + watermark, + )); + } } Message::Watermark(watermark) => { if watermark.col_idx == event_time_col_idx { tracing::warn!("WatermarkFilterExecutor received a watermark on the event it is filtering."); let watermark = watermark.val; - if current_watermark.default_cmp(&watermark).is_lt() { - current_watermark = watermark; + if let Some(cur_watermark) = current_watermark.clone() && cur_watermark.default_cmp(&watermark).is_lt() { + current_watermark = Some(watermark.clone()); idle_input = false; yield Message::Watermark(Watermark::new( event_time_col_idx, watermark_type.clone(), - current_watermark.clone(), + watermark, )); } } else { @@ -215,9 +237,7 @@ impl WatermarkFilterExecutor { // Take the global max watermark when scaling happens. 
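With this hunk `current_watermark` becomes an `Option`, so the executor no longer emits an artificial `min_value()` watermark before any input is observed. The advance step reduces to the following sketch, with `i64` standing in for `ScalarImpl`:

use std::cmp;

fn advance(current: Option<i64>, observed_max: Option<i64>) -> Option<i64> {
    match observed_max {
        // First watermark ever seen, or the larger of the old and new values.
        Some(new) => Some(current.map_or(new, |cur| cmp::max_by(cur, new, i64::cmp))),
        // Nothing observed in this chunk: keep whatever we had, possibly nothing.
        None => current,
    }
}

fn main() {
    assert_eq!(advance(None, Some(10)), Some(10));
    assert_eq!(advance(Some(20), Some(10)), Some(20));
    assert_eq!(advance(Some(20), None), Some(20));
    assert_eq!(advance(None, None), None);
}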
if previous_vnode_bitmap != vnode_bitmap { - current_watermark = - Self::get_global_max_watermark(&table, watermark_type.clone()) - .await?; + current_watermark = Self::get_global_max_watermark(&table).await?; } } @@ -226,12 +246,14 @@ impl WatermarkFilterExecutor { { last_checkpoint_watermark = current_watermark.clone(); // Persist the watermark when checkpoint arrives. - let vnodes = table.get_vnodes(); - for vnode in vnodes.iter_vnodes() { - let pk = Some(ScalarImpl::Int16(vnode.to_scalar())); - let row = [pk, Some(current_watermark.clone())]; - // FIXME(yuhao): use upsert. - table.insert(row); + if let Some(watermark) = current_watermark.clone() { + let vnodes = table.get_vnodes(); + for vnode in vnodes.iter_vnodes() { + let pk = Some(ScalarImpl::Int16(vnode.to_scalar())); + let row = [pk, Some(watermark.clone())]; + // This is an upsert. + table.insert(row); + } } table.commit(barrier.epoch).await?; } else { @@ -242,18 +264,24 @@ impl WatermarkFilterExecutor { if idle_input { // Align watermark let global_max_watermark = - Self::get_global_max_watermark(&table, watermark_type.clone()) - .await?; - current_watermark = cmp::max_by( - current_watermark, - global_max_watermark, - DefaultOrd::default_cmp, - ); - yield Message::Watermark(Watermark::new( - event_time_col_idx, - watermark_type.clone(), - current_watermark.clone(), - )); + Self::get_global_max_watermark(&table).await?; + + current_watermark = if let Some(global_max_watermark) = global_max_watermark.clone() && let Some(watermark) = current_watermark.clone(){ + Some(cmp::max_by( + watermark, + global_max_watermark, + DefaultOrd::default_cmp, + )) + } else { + current_watermark.or(global_max_watermark) + }; + if let Some(watermark) = current_watermark.clone() { + yield Message::Watermark(Watermark::new( + event_time_col_idx, + watermark_type.clone(), + watermark, + )); + } } else { idle_input = true; } @@ -269,21 +297,23 @@ impl WatermarkFilterExecutor { watermark_type: DataType, event_time_col_idx: usize, watermark: ScalarImpl, + eval_error_report: ActorEvalErrorReport, ) -> ExprResult { - build_func( + build_func_non_strict( Type::GreaterThanOrEqual, DataType::Boolean, vec![ InputRefExpression::new(watermark_type.clone(), event_time_col_idx).boxed(), LiteralExpression::new(watermark_type, Some(watermark)).boxed(), ], + eval_error_report, ) } + /// If the returned if `Ok(None)`, it means there is no global max watermark. 
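`build_watermark_filter_expr` above now goes through `build_func_non_strict`, so an evaluation error in the filter is reported via the actor's error reporter and the offending row evaluates to NULL instead of failing the whole actor. A self-contained sketch of that non-strict contract (trait and function names here are simplified stand-ins, not the real expression framework):

trait EvalErrorReport {
    fn report(&self, err: &str);
}

struct LogReport;
impl EvalErrorReport for LogReport {
    fn report(&self, err: &str) {
        eprintln!("expression error suppressed: {err}");
    }
}

fn hundred_div(x: i64) -> Result<i64, String> {
    if x == 0 { Err("division by zero".into()) } else { Ok(100 / x) }
}

// Non-strict evaluation: report the error, return NULL (None), keep the stream running.
fn eval_non_strict(input: i64, report: &impl EvalErrorReport) -> Option<i64> {
    match hundred_div(input) {
        Ok(v) => Some(v),
        Err(e) => {
            report.report(&e);
            None
        }
    }
}

fn main() {
    assert_eq!(eval_non_strict(4, &LogReport), Some(25));
    assert_eq!(eval_non_strict(0, &LogReport), None);
}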
async fn get_global_max_watermark( table: &StateTable, - watermark_type: DataType, - ) -> StreamExecutorResult { + ) -> StreamExecutorResult> { let watermark_iter_futures = (0..VirtualNode::COUNT).map(|vnode| async move { let pk = row::once(Some(ScalarImpl::Int16(vnode as _))); let watermark_row: Option = table.get_row(pk).await?; @@ -307,8 +337,7 @@ impl WatermarkFilterExecutor { let watermark = watermarks .into_iter() .flatten() - .max_by(DefaultOrd::default_cmp) - .unwrap_or_else(|| watermark_type.min_value()); + .max_by(DefaultOrd::default_cmp); Ok(watermark) } @@ -389,6 +418,7 @@ mod tests { 1, ActorContext::create(123), table, + 0, ) .boxed(), tx, @@ -431,13 +461,6 @@ mod tests { }; } - // Init watermark - let watermark = executor.next().await.unwrap().unwrap(); - assert_eq!( - watermark.into_watermark().unwrap(), - watermark!(WATERMARK_TYPE.min_value()), - ); - // push the 1st chunk tx.push_chunk(chunk1); let chunk = executor.next().await.unwrap().unwrap(); diff --git a/src/stream/src/executor/wrapper.rs b/src/stream/src/executor/wrapper.rs index 3109ba6ffe676..1b52911b2b509 100644 --- a/src/stream/src/executor/wrapper.rs +++ b/src/stream/src/executor/wrapper.rs @@ -17,11 +17,7 @@ use std::sync::Arc; use futures::StreamExt; use risingwave_common::catalog::Schema; -use super::monitor::StreamingMetrics; -use super::{ - BoxedExecutor, BoxedMessageStream, Executor, ExecutorInfo, MessageStream, PkIndicesRef, -}; -use crate::task::ActorId; +use super::*; mod epoch_check; mod epoch_provide; @@ -29,21 +25,11 @@ mod schema_check; mod trace; mod update_check; -struct ExtraInfo { - /// Index of input to this operator. - input_pos: usize, - - actor_id: ActorId, - executor_id: u64, - - metrics: Arc, -} - /// [`WrapperExecutor`] will do some sanity checks and logging for the wrapped executor. 
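`get_global_max_watermark` now returns `Ok(None)` when no vnode has persisted a watermark yet, rather than defaulting to the type's minimum value. The reduction over the per-vnode rows is a plain flatten-and-max, roughly as below (again with `i64` standing in for `ScalarImpl`):

fn global_max(per_vnode: Vec<Option<i64>>) -> Option<i64> {
    // Vnodes without a persisted watermark are skipped; the result is None
    // only if no vnode has written one so far.
    per_vnode.into_iter().flatten().max()
}

fn main() {
    assert_eq!(global_max(vec![None, Some(7), Some(3)]), Some(7));
    assert_eq!(global_max(vec![None, None]), None);
}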
pub struct WrapperExecutor { input: BoxedExecutor, - extra: ExtraInfo, + actor_ctx: ActorContextRef, enable_executor_row_count: bool, } @@ -51,29 +37,19 @@ pub struct WrapperExecutor { impl WrapperExecutor { pub fn new( input: BoxedExecutor, - input_pos: usize, - actor_id: ActorId, - executor_id: u64, - metrics: Arc, + actor_ctx: ActorContextRef, enable_executor_row_count: bool, ) -> Self { Self { input, - extra: ExtraInfo { - input_pos, - actor_id, - executor_id, - metrics, - }, + actor_ctx, enable_executor_row_count, } } #[allow(clippy::let_and_return)] fn wrap_debug( - _enable_executor_row_count: bool, info: Arc, - _extra: ExtraInfo, stream: impl MessageStream + 'static, ) -> impl MessageStream + 'static { // Update check @@ -85,14 +61,13 @@ impl WrapperExecutor { fn wrap( enable_executor_row_count: bool, info: Arc, - extra: ExtraInfo, + actor_ctx: ActorContextRef, stream: impl MessageStream + 'static, ) -> BoxedMessageStream { // -- Shared wrappers -- // Await tree - let stream = - trace::instrument_await_tree(info.clone(), extra.actor_id, extra.executor_id, stream); + let stream = trace::instrument_await_tree(info.clone(), actor_ctx.id, stream); // Schema check let stream = schema_check::schema_check(info.clone(), stream); @@ -103,18 +78,10 @@ impl WrapperExecutor { let stream = epoch_provide::epoch_provide(stream); // Trace - let stream = trace::trace( - enable_executor_row_count, - info.clone(), - extra.input_pos, - extra.actor_id, - extra.executor_id, - extra.metrics.clone(), - stream, - ); + let stream = trace::trace(enable_executor_row_count, info.clone(), actor_ctx, stream); if cfg!(debug_assertions) { - Self::wrap_debug(enable_executor_row_count, info, extra, stream).boxed() + Self::wrap_debug(info, stream).boxed() } else { stream.boxed() } @@ -127,7 +94,7 @@ impl Executor for WrapperExecutor { Self::wrap( self.enable_executor_row_count, info, - self.extra, + self.actor_ctx, self.input.execute(), ) .boxed() @@ -138,7 +105,7 @@ impl Executor for WrapperExecutor { Self::wrap( self.enable_executor_row_count, info, - self.extra, + self.actor_ctx, self.input.execute_with_epoch(epoch), ) .boxed() diff --git a/src/stream/src/executor/wrapper/schema_check.rs b/src/stream/src/executor/wrapper/schema_check.rs index d23eca2b455c6..3e8738db8327a 100644 --- a/src/stream/src/executor/wrapper/schema_check.rs +++ b/src/stream/src/executor/wrapper/schema_check.rs @@ -45,7 +45,7 @@ pub async fn schema_check(info: Arc, input: impl MessageStream) { } Message::Barrier(_) => Ok(()), } - .unwrap_or_else(|e| panic!("schema check failed on {}: {}", info.identity, e)); + .unwrap_or_else(|e| panic!("schema check failed on {:?}: {}", info, e)); yield message; } diff --git a/src/stream/src/executor/wrapper/trace.rs b/src/stream/src/executor/wrapper/trace.rs index 0b18d54a0bb58..fbf22c5d6d34b 100644 --- a/src/stream/src/executor/wrapper/trace.rs +++ b/src/stream/src/executor/wrapper/trace.rs @@ -20,8 +20,7 @@ use futures_async_stream::try_stream; use tracing::{Instrument, Span}; use crate::executor::error::StreamExecutorError; -use crate::executor::monitor::StreamingMetrics; -use crate::executor::{ExecutorInfo, Message, MessageStream}; +use crate::executor::{ActorContextRef, ExecutorInfo, Message, MessageStream}; use crate::task::ActorId; /// Streams wrapped by `trace` will be traced with `tracing` spans and reported to `opentelemetry`. 
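`WrapperExecutor::wrap` above layers the cross-cutting concerns (await-tree span, schema check, epoch check and provide, tracing) around the inner message stream, with extra update checks stacked on top only in debug builds. The layering pattern, sketched over a plain iterator with placeholder checks:

fn wrap(inner: impl Iterator<Item = i64> + 'static) -> Box<dyn Iterator<Item = i64>> {
    // Shared wrappers, applied in order around the inner stream.
    let stream = inner.inspect(|m| assert!(*m >= 0, "schema check failed"));
    let stream = stream.inspect(|m| println!("trace: {m}"));

    if cfg!(debug_assertions) {
        // Debug-only wrapper, mirroring `wrap_debug`.
        Box::new(stream.inspect(|m| assert!(*m < 1_000_000, "update check failed")))
    } else {
        Box::new(stream)
    }
}

fn main() {
    assert_eq!(wrap(vec![1, 2, 3].into_iter()).sum::<i64>(), 6);
}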
@@ -29,19 +28,21 @@ use crate::task::ActorId; pub async fn trace( enable_executor_row_count: bool, info: Arc, - _input_pos: usize, - actor_id: ActorId, - executor_id: u64, - metrics: Arc, + actor_ctx: ActorContextRef, input: impl MessageStream, ) { - let actor_id_string = actor_id.to_string(); - - let span_name = pretty_identity(&info.identity, actor_id, executor_id); - - let is_sink_or_mv = info.identity.contains("Materialize") || info.identity.contains("Sink"); - - let new_span = || tracing::info_span!("executor", "otel.name" = span_name, actor_id); + let actor_id_str = actor_ctx.id.to_string(); + let fragment_id_str = actor_ctx.fragment_id.to_string(); + + let span_name = pretty_identity(&info.identity, actor_ctx.id); + + let new_span = || { + tracing::info_span!( + "executor", + "otel.name" = span_name, + "actor_id" = actor_ctx.id + ) + }; let mut span = new_span(); pin_mut!(input); @@ -51,13 +52,14 @@ pub async fn trace( span.in_scope(|| match &message { Message::Chunk(chunk) => { if chunk.cardinality() > 0 { - if enable_executor_row_count || is_sink_or_mv { - metrics + if enable_executor_row_count { + actor_ctx + .streaming_metrics .executor_row_count - .with_label_values(&[&actor_id_string, &span_name]) + .with_label_values(&[&actor_id_str, &fragment_id_str, &info.identity]) .inc_by(chunk.cardinality() as u64); } - tracing::trace!( + tracing::debug!( target: "events::stream::message::chunk", cardinality = chunk.cardinality(), capacity = chunk.capacity(), @@ -66,14 +68,14 @@ pub async fn trace( } } Message::Watermark(watermark) => { - tracing::trace!( + tracing::debug!( target: "events::stream::message::watermark", value = ?watermark.val, col_idx = watermark.col_idx, ); } Message::Barrier(barrier) => { - tracing::trace!( + tracing::debug!( target: "events::stream::message::barrier", prev_epoch = barrier.epoch.prev, curr_epoch = barrier.epoch.curr, @@ -100,13 +102,8 @@ pub async fn trace( } } -fn pretty_identity(identity: &str, actor_id: ActorId, executor_id: u64) -> String { - format!( - "{} (actor {}, operator {})", - identity, - actor_id, - executor_id as u32 // The lower 32 bit is for the operator id. - ) +fn pretty_identity(identity: &str, actor_id: ActorId) -> String { + format!("{} (actor {})", identity, actor_id) } /// Streams wrapped by `instrument_await_tree` will be able to print the spans of the @@ -115,12 +112,11 @@ fn pretty_identity(identity: &str, actor_id: ActorId, executor_id: u64) -> Strin pub async fn instrument_await_tree( info: Arc, actor_id: ActorId, - executor_id: u64, input: impl MessageStream, ) { pin_mut!(input); - let span: await_tree::Span = pretty_identity(&info.identity, actor_id, executor_id).into(); + let span: await_tree::Span = pretty_identity(&info.identity, actor_id).into(); while let Some(message) = input .next() diff --git a/src/stream/src/from_proto/chain.rs b/src/stream/src/from_proto/chain.rs index 667772fcfdd60..81030526b82f3 100644 --- a/src/stream/src/from_proto/chain.rs +++ b/src/stream/src/from_proto/chain.rs @@ -172,6 +172,7 @@ impl ExecutorBuilder for ChainExecutorBuilder { params.pk_indices, stream.streaming_metrics.clone(), params.env.config().developer.chunk_size, + params.executor_id, ) .boxed() } diff --git a/src/stream/src/from_proto/filter.rs b/src/stream/src/from_proto/filter.rs index 32341f1c5ebb1..47661e105c506 100644 --- a/src/stream/src/from_proto/filter.rs +++ b/src/stream/src/from_proto/filter.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
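In the `trace` wrapper above, the executor row-count metric is now labelled with `(actor_id, fragment_id, executor identity)` and is collected only when `enable_executor_row_count` is set (the sink/materialize special case is gone). A standalone sketch of such a labelled counter with the `prometheus` crate; the metric name and label values are made up for illustration:

use prometheus::{opts, IntCounterVec, Registry};

fn main() {
    let registry = Registry::new();
    let executor_row_count = IntCounterVec::new(
        opts!("stream_executor_row_count", "rows emitted per executor"),
        &["actor_id", "fragment_id", "executor_identity"],
    )
    .unwrap();
    registry.register(Box::new(executor_row_count.clone())).unwrap();

    executor_row_count
        .with_label_values(&["1001", "7", "HashJoin 1234ABCD"])
        .inc_by(128);
}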
-use risingwave_expr::expr::build_from_prost; +use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::FilterNode; use super::*; @@ -31,7 +31,8 @@ impl ExecutorBuilder for FilterExecutorBuilder { _stream: &mut LocalStreamManagerCore, ) -> StreamResult { let [input]: [_; 1] = params.input.try_into().unwrap(); - let search_condition = build_from_prost(node.get_search_condition()?)?; + let search_condition = + build_non_strict_from_prost(node.get_search_condition()?, params.eval_error_report)?; Ok(FilterExecutor::new( params.actor_context, diff --git a/src/stream/src/from_proto/group_top_n.rs b/src/stream/src/from_proto/group_top_n.rs index a91d4a91a6ef0..a7fc0d741206e 100644 --- a/src/stream/src/from_proto/group_top_n.rs +++ b/src/stream/src/from_proto/group_top_n.rs @@ -21,7 +21,7 @@ use risingwave_pb::stream_plan::GroupTopNNode; use super::*; use crate::common::table::state_table::StateTable; -use crate::executor::{ActorContextRef, AppendOnlyGroupTopNExecutor, GroupTopNExecutor, PkIndices}; +use crate::executor::{ActorContextRef, AppendOnlyGroupTopNExecutor, GroupTopNExecutor}; use crate::task::AtomicU64Ref; pub struct GroupTopNExecutorBuilder; @@ -60,27 +60,23 @@ impl ExecutorBuilder for GroupTopNExecutorBuilder ExecutorBuilder for GroupTopNExecutorBuilder { input: BoxedExecutor, ctx: ActorContextRef, + info: ExecutorInfo, storage_key: Vec, offset_and_limit: (usize, usize), order_by: Vec, - executor_id: u64, group_by: Vec, state_table: StateTable, watermark_epoch: AtomicU64Ref, group_key_types: Vec, - pk_indices: PkIndices, with_ties: bool, append_only: bool, @@ -115,14 +110,13 @@ impl HashKeyDispatcher for GroupTopNExecutorDispatcherArgs { Ok($excutor::::new( self.input, self.ctx, + self.info, self.storage_key, self.offset_and_limit, self.order_by, - self.executor_id, self.group_by, self.state_table, self.watermark_epoch, - self.pk_indices, )? 
.boxed()) }; diff --git a/src/stream/src/from_proto/hash_agg.rs b/src/stream/src/from_proto/hash_agg.rs index 3d110624c2784..a369f8124ebfb 100644 --- a/src/stream/src/from_proto/hash_agg.rs +++ b/src/stream/src/from_proto/hash_agg.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use risingwave_common::hash::{HashKey, HashKeyDispatcher}; use risingwave_common::types::DataType; -use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_pb::stream_plan::HashAggNode; use super::agg_common::{ @@ -114,6 +114,11 @@ impl ExecutorBuilder for HashAggExecutorBuilder { extra: HashAggExecutorExtraArgs { group_key_indices, chunk_size: params.env.config().developer.chunk_size, + max_dirty_groups_heap_size: params + .env + .config() + .developer + .hash_agg_max_dirty_groups_heap_size, emit_on_window_close: node.get_emit_on_window_close(), }, }, diff --git a/src/stream/src/from_proto/hash_join.rs b/src/stream/src/from_proto/hash_join.rs index 7f63fba21221f..44799af9405c2 100644 --- a/src/stream/src/from_proto/hash_join.rs +++ b/src/stream/src/from_proto/hash_join.rs @@ -17,7 +17,9 @@ use std::sync::Arc; use risingwave_common::hash::{HashKey, HashKeyDispatcher}; use risingwave_common::types::DataType; -use risingwave_expr::expr::{build_from_prost, build_func, BoxedExpression, InputRefExpression}; +use risingwave_expr::expr::{ + build_func_non_strict, build_non_strict_from_prost, BoxedExpression, InputRefExpression, +}; pub use risingwave_pb::expr::expr_node::Type as ExprType; use risingwave_pb::plan_common::JoinType as JoinTypeProto; use risingwave_pb::stream_plan::HashJoinNode; @@ -80,7 +82,10 @@ impl ExecutorBuilder for HashJoinExecutorBuilder { .collect_vec(); let condition = match node.get_condition() { - Ok(cond_prost) => Some(build_from_prost(cond_prost)?), + Ok(cond_prost) => Some(build_non_strict_from_prost( + cond_prost, + params.eval_error_report.clone(), + )?), Err(_) => None, }; trace!("Join non-equi condition: {:?}", condition); @@ -96,13 +101,17 @@ impl ExecutorBuilder for HashJoinExecutorBuilder { let data_type = source_l.schema().fields [min(key_required_larger, key_required_smaller)] .data_type(); - Some(build_func( + Some(build_func_non_strict( delta_expression.delta_type(), data_type.clone(), vec![ Box::new(InputRefExpression::new(data_type, 0)), - build_from_prost(delta_expression.delta.as_ref().unwrap())?, + build_non_strict_from_prost( + delta_expression.delta.as_ref().unwrap(), + params.eval_error_report.clone(), + )?, ], + params.eval_error_report.clone(), )?) 
} else { None diff --git a/src/stream/src/from_proto/hop_window.rs b/src/stream/src/from_proto/hop_window.rs index 7a4ea721a58ab..5bf0240155fc2 100644 --- a/src/stream/src/from_proto/hop_window.rs +++ b/src/stream/src/from_proto/hop_window.rs @@ -14,7 +14,7 @@ use risingwave_common::catalog::{Field, Schema}; use risingwave_common::types::DataType; -use risingwave_expr::expr::build_from_prost; +use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::HopWindowNode; use super::*; @@ -52,12 +52,12 @@ impl ExecutorBuilder for HopWindowExecutorBuilder { let window_start_exprs: Vec<_> = node .get_window_start_exprs() .iter() - .map(build_from_prost) + .map(|e| build_non_strict_from_prost(e, params.eval_error_report.clone())) .try_collect()?; let window_end_exprs: Vec<_> = node .get_window_end_exprs() .iter() - .map(build_from_prost) + .map(|e| build_non_strict_from_prost(e, params.eval_error_report.clone())) .try_collect()?; let time_col = node.get_time_col() as usize; diff --git a/src/stream/src/from_proto/mod.rs b/src/stream/src/from_proto/mod.rs index bdb19f022ec37..2ec9476d0e904 100644 --- a/src/stream/src/from_proto/mod.rs +++ b/src/stream/src/from_proto/mod.rs @@ -166,5 +166,6 @@ pub async fn create_executor( NodeBody::NoOp => NoOpExecutorBuilder, NodeBody::EowcOverWindow => EowcOverWindowExecutorBuilder, NodeBody::OverWindow => OverWindowExecutorBuilder, + NodeBody::StreamFsFetch => FsFetchExecutorBuilder, } } diff --git a/src/stream/src/from_proto/mview.rs b/src/stream/src/from_proto/mview.rs index 9c4d084def8ba..d64490b29b84a 100644 --- a/src/stream/src/from_proto/mview.rs +++ b/src/stream/src/from_proto/mview.rs @@ -48,14 +48,19 @@ impl ExecutorBuilder for MaterializeExecutorBuilder { let conflict_behavior = ConflictBehavior::from_protobuf(&table.handle_pk_conflict_behavior()); + let info = ExecutorInfo { + schema: params.schema, + pk_indices: params.pk_indices, + identity: params.identity, + }; macro_rules! 
new_executor { ($SD:ident) => { MaterializeExecutor::<_, $SD>::new( input, + info, store, order_key, - params.executor_id, params.actor_context, params.vnode_bitmap.map(Arc::new), table, @@ -106,11 +111,16 @@ impl ExecutorBuilder for ArrangeExecutorBuilder { let vnodes = params.vnode_bitmap.map(Arc::new); let conflict_behavior = ConflictBehavior::from_protobuf(&table.handle_pk_conflict_behavior()); + let info = ExecutorInfo { + schema: params.schema, + pk_indices: params.pk_indices, + identity: params.identity, + }; let executor = MaterializeExecutor::<_, BasicSerde>::new( input, + info, store, keys, - params.executor_id, params.actor_context, vnodes, table, diff --git a/src/stream/src/from_proto/project.rs b/src/stream/src/from_proto/project.rs index 111cc46ace641..ea01fd5c129c8 100644 --- a/src/stream/src/from_proto/project.rs +++ b/src/stream/src/from_proto/project.rs @@ -14,7 +14,7 @@ use multimap::MultiMap; use risingwave_common::util::iter_util::ZipEqFast; -use risingwave_expr::expr::build_from_prost; +use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::expr::expr_node::RexNode; use risingwave_pb::stream_plan::ProjectNode; @@ -37,7 +37,7 @@ impl ExecutorBuilder for ProjectExecutorBuilder { let project_exprs: Vec<_> = node .get_select_list() .iter() - .map(build_from_prost) + .map(|e| build_non_strict_from_prost(e, params.eval_error_report.clone())) .try_collect()?; let watermark_derivations = MultiMap::from_iter( diff --git a/src/stream/src/from_proto/simple_agg.rs b/src/stream/src/from_proto/simple_agg.rs index 403d82dc02e9a..5423e4fd2043f 100644 --- a/src/stream/src/from_proto/simple_agg.rs +++ b/src/stream/src/from_proto/simple_agg.rs @@ -14,7 +14,7 @@ //! Streaming Simple Aggregator -use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_pb::stream_plan::SimpleAggNode; use super::agg_common::{ diff --git a/src/stream/src/from_proto/sink.rs b/src/stream/src/from_proto/sink.rs index a95b7fce22738..47f21c0a223cf 100644 --- a/src/stream/src/from_proto/sink.rs +++ b/src/stream/src/from_proto/sink.rs @@ -14,15 +14,19 @@ use std::sync::Arc; +use anyhow::anyhow; use risingwave_common::catalog::ColumnCatalog; -use risingwave_connector::sink::catalog::SinkType; -use risingwave_connector::sink::{SinkParam, SinkWriterParam}; +use risingwave_connector::match_sink_name_str; +use risingwave_connector::sink::catalog::{SinkFormatDesc, SinkType}; +use risingwave_connector::sink::{ + SinkError, SinkParam, SinkWriterParam, CONNECTOR_TYPE_KEY, SINK_TYPE_OPTION, +}; use risingwave_pb::stream_plan::{SinkLogStoreType, SinkNode}; use risingwave_storage::dispatch_state_store; use super::*; -use crate::common::log_store::in_mem::BoundedInMemLogStoreFactory; -use crate::common::log_store::kv_log_store::KvLogStoreFactory; +use crate::common::log_store_impl::in_mem::BoundedInMemLogStoreFactory; +use crate::common::log_store_impl::kv_log_store::{KvLogStoreFactory, KvLogStoreMetrics}; use crate::executor::SinkExecutor; pub struct SinkExecutorBuilder; @@ -56,6 +60,35 @@ impl ExecutorBuilder for SinkExecutorBuilder { .into_iter() .map(ColumnCatalog::from) .collect_vec(); + + let connector = { + let sink_type = properties.get(CONNECTOR_TYPE_KEY).ok_or_else(|| { + SinkError::Config(anyhow!("missing config: {}", CONNECTOR_TYPE_KEY)) + })?; + + match_sink_name_str!( + sink_type.to_lowercase().as_str(), + SinkType, + Ok(SinkType::SINK_NAME), + |other| { + Err(SinkError::Config(anyhow!( + "unsupported sink connector {}", + other + ))) + } + ) + }?; + let 
format_desc = match &sink_desc.format_desc { + // Case A: new syntax `format ... encode ...` + Some(f) => Some(f.clone().try_into()?), + None => match sink_desc.properties.get(SINK_TYPE_OPTION) { + // Case B: old syntax `type = '...'` + Some(t) => SinkFormatDesc::from_legacy_type(connector, t)?, + // Case C: no format + encode required + None => None, + }, + }; + let sink_param = SinkParam { sink_id, properties, @@ -66,10 +99,28 @@ impl ExecutorBuilder for SinkExecutorBuilder { .collect(), downstream_pk, sink_type, + format_desc, db_name, sink_from_name, }; + let identity = format!("SinkExecutor {:X?}", params.executor_id); + let sink_id_str = format!("{}", sink_id.sink_id); + + let sink_metrics = stream.streaming_metrics.new_sink_metrics( + identity.as_str(), + sink_id_str.as_str(), + connector, + ); + + let sink_write_param = SinkWriterParam { + connector_params: params.env.connector_params(), + executor_id: params.executor_id, + vnode_bitmap: params.vnode_bitmap.clone(), + meta_client: params.env.meta_client(), + sink_metrics, + }; + match node.log_store_type() { // Default value is the normal in memory log store to be backward compatible with the // previously unset value @@ -78,13 +129,7 @@ impl ExecutorBuilder for SinkExecutorBuilder { Ok(Box::new( SinkExecutor::new( input_executor, - stream.streaming_metrics.clone(), - SinkWriterParam { - connector_params: params.env.connector_params(), - executor_id: params.executor_id, - vnode_bitmap: params.vnode_bitmap, - meta_client: params.env.meta_client(), - }, + sink_write_param, sink_param, columns, params.actor_context, @@ -95,24 +140,26 @@ impl ExecutorBuilder for SinkExecutorBuilder { )) } SinkLogStoreType::KvLogStore => { + let metrics = KvLogStoreMetrics::new( + ¶ms.executor_stats, + &sink_write_param, + &sink_param, + connector, + ); + // TODO: support setting max row count in config dispatch_state_store!(params.env.state_store(), state_store, { let factory = KvLogStoreFactory::new( state_store, node.table.as_ref().unwrap().clone(), params.vnode_bitmap.clone().map(Arc::new), - 0, + 65536, + metrics, ); Ok(Box::new( SinkExecutor::new( input_executor, - stream.streaming_metrics.clone(), - SinkWriterParam { - connector_params: params.env.connector_params(), - executor_id: params.executor_id, - vnode_bitmap: params.vnode_bitmap, - meta_client: params.env.meta_client(), - }, + sink_write_param, sink_param, columns, params.actor_context, diff --git a/src/stream/src/from_proto/source/fs_fetch.rs b/src/stream/src/from_proto/source/fs_fetch.rs new file mode 100644 index 0000000000000..b6df84c8560e4 --- /dev/null +++ b/src/stream/src/from_proto/source/fs_fetch.rs @@ -0,0 +1,120 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
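The sink builder above resolves the connector name first (via `match_sink_name_str!`) and then the format descriptor with a three-way precedence: the new `FORMAT ... ENCODE ...` clause wins, the legacy `type = '...'` property is the fallback, and some sinks need neither. A simplified model of that precedence (the enum and strings are illustrative, not the real `SinkFormatDesc` API):

#[derive(Debug, PartialEq, Clone, Copy)]
enum SinkFormat { AppendOnly, Upsert, Debezium }

fn resolve_format(
    new_syntax: Option<SinkFormat>, // Case A: `FORMAT ... ENCODE ...`
    legacy_type: Option<&str>,      // Case B: `type = '...'`
) -> Result<Option<SinkFormat>, String> {
    match new_syntax {
        Some(f) => Ok(Some(f)),
        None => match legacy_type {
            Some("append-only") => Ok(Some(SinkFormat::AppendOnly)),
            Some("upsert") => Ok(Some(SinkFormat::Upsert)),
            Some("debezium") => Ok(Some(SinkFormat::Debezium)),
            Some(other) => Err(format!("unsupported legacy sink type: {other}")),
            None => Ok(None),       // Case C: no format + encode required
        },
    }
}

fn main() {
    assert_eq!(resolve_format(Some(SinkFormat::Upsert), Some("append-only")), Ok(Some(SinkFormat::Upsert)));
    assert_eq!(resolve_format(None, Some("debezium")), Ok(Some(SinkFormat::Debezium)));
    assert_eq!(resolve_format(None, None), Ok(None));
}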
+ +use std::sync::Arc; + +use risingwave_common::catalog::{ColumnId, Field, Schema, TableId}; +use risingwave_common::types::DataType; +use risingwave_connector::source::SourceCtrlOpts; +use risingwave_pb::stream_plan::StreamFsFetchNode; +use risingwave_source::source_desc::SourceDescBuilder; +use risingwave_storage::StateStore; + +use crate::error::StreamResult; +use crate::executor::{ + BoxedExecutor, Executor, FlowControlExecutor, FsFetchExecutor, SourceStateTableHandler, + StreamSourceCore, +}; +use crate::from_proto::ExecutorBuilder; +use crate::task::{ExecutorParams, LocalStreamManagerCore}; + +pub struct FsFetchExecutorBuilder; + +#[async_trait::async_trait] +impl ExecutorBuilder for FsFetchExecutorBuilder { + type Node = StreamFsFetchNode; + + async fn new_boxed_executor( + params: ExecutorParams, + node: &Self::Node, + store: impl StateStore, + _stream: &mut LocalStreamManagerCore, + ) -> StreamResult { + let [upstream]: [_; 1] = params.input.try_into().unwrap(); + + let source = node.node_inner.as_ref().unwrap(); + + let source_id = TableId::new(source.source_id); + let source_name = source.source_name.clone(); + let source_info = source.get_info()?; + + let source_desc_builder = SourceDescBuilder::new( + source.columns.clone(), + params.env.source_metrics(), + source.row_id_index.map(|x| x as _), + source.properties.clone(), + source_info.clone(), + params.env.connector_params(), + params.env.config().developer.connector_message_buffer_size, + params.pk_indices.clone(), + ); + + let source_ctrl_opts = SourceCtrlOpts { + chunk_size: params.env.config().developer.chunk_size, + }; + + let column_ids: Vec<_> = source + .columns + .iter() + .map(|column| ColumnId::from(column.get_column_desc().unwrap().column_id)) + .collect(); + let fields = source + .columns + .iter() + .map(|prost| { + let column_desc = prost.column_desc.as_ref().unwrap(); + let data_type = DataType::from(column_desc.column_type.as_ref().unwrap()); + let name = column_desc.name.clone(); + Field::with_name(data_type, name) + }) + .collect(); + let schema = Schema::new(fields); + + let vnodes = Some(Arc::new( + params + .vnode_bitmap + .expect("vnodes not set for fetch executor"), + )); + let state_table_handler = SourceStateTableHandler::from_table_catalog_with_vnodes( + source.state_table.as_ref().unwrap(), + store.clone(), + vnodes, + ) + .await; + let stream_source_core = StreamSourceCore::new( + source_id, + source_name, + column_ids, + source_desc_builder, + state_table_handler, + ); + + let executor = FsFetchExecutor::new( + params.actor_context, + schema, + params.pk_indices, + stream_source_core, + params.executor_id, + upstream, + source_ctrl_opts, + params.env.connector_params(), + ) + .boxed(); + + if let Ok(rate_limit) = source.get_rate_limit() { + return Ok(FlowControlExecutor::new(executor, *rate_limit).boxed()); + } + Ok(executor) + } +} diff --git a/src/stream/src/from_proto/source/mod.rs b/src/stream/src/from_proto/source/mod.rs new file mode 100644 index 0000000000000..cb83889465a73 --- /dev/null +++ b/src/stream/src/from_proto/source/mod.rs @@ -0,0 +1,20 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
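At the end of `FsFetchExecutorBuilder::new_boxed_executor` above, the fetch executor is wrapped in a `FlowControlExecutor` only when the source carries a rate limit. The optional-decorator shape, sketched with placeholder traits and types:

trait Exec {
    fn identity(&self) -> String;
}

struct FsFetch;
impl Exec for FsFetch {
    fn identity(&self) -> String {
        "FsFetchExecutor".to_string()
    }
}

struct FlowControl<E: Exec> {
    inner: E,
    rate_limit: u32,
}
impl<E: Exec> Exec for FlowControl<E> {
    fn identity(&self) -> String {
        format!("{} (rate limit: {} rows/s)", self.inner.identity(), self.rate_limit)
    }
}

fn build(rate_limit: Option<u32>) -> Box<dyn Exec> {
    let executor = FsFetch;
    match rate_limit {
        Some(limit) => Box::new(FlowControl { inner: executor, rate_limit: limit }),
        None => Box::new(executor),
    }
}

fn main() {
    assert!(build(Some(1000)).identity().contains("rate limit"));
    assert_eq!(build(None).identity(), "FsFetchExecutor");
}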
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod trad_source; +pub use trad_source::SourceExecutorBuilder; +mod fs_fetch; +pub use fs_fetch::FsFetchExecutorBuilder; + +use super::*; diff --git a/src/stream/src/from_proto/source.rs b/src/stream/src/from_proto/source/trad_source.rs similarity index 89% rename from src/stream/src/from_proto/source.rs rename to src/stream/src/from_proto/source/trad_source.rs index 77bbcc53e69c5..3f0793595c7c5 100644 --- a/src/stream/src/from_proto/source.rs +++ b/src/stream/src/from_proto/source/trad_source.rs @@ -16,7 +16,7 @@ use risingwave_common::catalog::{ColumnId, Field, Schema, TableId}; use risingwave_common::types::DataType; use risingwave_common::util::sort_util::OrderType; use risingwave_connector::source::external::{ExternalTableType, SchemaTableName}; -use risingwave_connector::source::SourceCtrlOpts; +use risingwave_connector::source::{ConnectorProperties, SourceCtrlOpts}; use risingwave_pb::stream_plan::SourceNode; use risingwave_source::source_desc::SourceDescBuilder; use risingwave_storage::panic_store::PanicStateStore; @@ -24,7 +24,7 @@ use tokio::sync::mpsc::unbounded_channel; use super::*; use crate::executor::external::ExternalStorageTable; -use crate::executor::source::StreamSourceCore; +use crate::executor::source::{FsListExecutor, StreamSourceCore}; use crate::executor::source_executor::SourceExecutor; use crate::executor::state_table_handler::SourceStateTableHandler; use crate::executor::{CdcBackfillExecutor, FlowControlExecutor, FsSourceExecutor}; @@ -115,6 +115,8 @@ impl ExecutorBuilder for SourceExecutorBuilder { .map(|c| c.to_ascii_lowercase()) .unwrap_or_default(); let is_fs_connector = FS_CONNECTORS.contains(&connector.as_str()); + let is_fs_v2_connector = + ConnectorProperties::is_new_fs_connector_hash_map(&source.properties); if is_fs_connector { FsSourceExecutor::new( @@ -129,6 +131,20 @@ impl ExecutorBuilder for SourceExecutorBuilder { source_ctrl_opts, )? 
.boxed() + } else if is_fs_v2_connector { + FsListExecutor::new( + params.actor_context.clone(), + schema.clone(), + params.pk_indices.clone(), + Some(stream_source_core), + params.executor_stats.clone(), + barrier_receiver, + system_params, + params.executor_id, + source_ctrl_opts.clone(), + params.env.connector_params(), + ) + .boxed() } else { let source_exec = SourceExecutor::new( params.actor_context.clone(), @@ -146,13 +162,12 @@ impl ExecutorBuilder for SourceExecutorBuilder { let table_type = ExternalTableType::from_properties(&source.properties); if table_type.can_backfill() && let Some(table_desc) = source_info.upstream_table.clone() { let upstream_table_name = SchemaTableName::from_properties(&source.properties); - let pk_indices = table_desc + let table_pk_indices = table_desc .pk .iter() .map(|k| k.column_index as usize) .collect_vec(); - - let order_types = table_desc + let table_pk_order_types = table_desc .pk .iter() .map(|desc| OrderType::from_protobuf(desc.get_order_type().unwrap())) @@ -164,8 +179,8 @@ impl ExecutorBuilder for SourceExecutorBuilder { upstream_table_name, table_reader, schema.clone(), - order_types, - pk_indices.clone(), + table_pk_order_types, + table_pk_indices, (0..table_desc.columns.len()).collect_vec(), ); @@ -181,7 +196,7 @@ impl ExecutorBuilder for SourceExecutorBuilder { (0..source.columns.len()).collect_vec(), // eliminate the last column (_rw_offset) None, schema.clone(), - pk_indices, + params.pk_indices, params.executor_stats, source_state_handler, source_ctrl_opts.chunk_size diff --git a/src/stream/src/from_proto/stateless_simple_agg.rs b/src/stream/src/from_proto/stateless_simple_agg.rs index 37e4be0d7109e..f26316c86e6d0 100644 --- a/src/stream/src/from_proto/stateless_simple_agg.rs +++ b/src/stream/src/from_proto/stateless_simple_agg.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
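The source builder above now dispatches three ways: legacy fs connectors keep `FsSourceExecutor`, connectors recognized by `is_new_fs_connector_hash_map` get the new `FsListExecutor`, and everything else stays on the generic `SourceExecutor`. A compact sketch of that dispatch; the contents of the legacy connector set are an assumption here:

#[derive(Debug, PartialEq)]
enum SourceKind { FsLegacy, FsV2, Generic }

fn classify(connector: &str, is_new_fs_connector: bool) -> SourceKind {
    // Assumption: the legacy fs connector list; the real one lives in the connector crate.
    const FS_CONNECTORS: &[&str] = &["s3"];
    if FS_CONNECTORS.contains(&connector) {
        SourceKind::FsLegacy
    } else if is_new_fs_connector {
        SourceKind::FsV2
    } else {
        SourceKind::Generic
    }
}

fn main() {
    assert_eq!(classify("s3", false), SourceKind::FsLegacy);
    assert_eq!(classify("s3_v2", true), SourceKind::FsV2);
    assert_eq!(classify("kafka", false), SourceKind::Generic);
}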
-use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_pb::stream_plan::SimpleAggNode; use super::*; diff --git a/src/stream/src/from_proto/temporal_join.rs b/src/stream/src/from_proto/temporal_join.rs index 1883281f35e8f..8b7b3b6af1335 100644 --- a/src/stream/src/from_proto/temporal_join.rs +++ b/src/stream/src/from_proto/temporal_join.rs @@ -18,7 +18,7 @@ use risingwave_common::catalog::{ColumnDesc, TableId, TableOption}; use risingwave_common::hash::{HashKey, HashKeyDispatcher}; use risingwave_common::types::DataType; use risingwave_common::util::sort_util::OrderType; -use risingwave_expr::expr::{build_from_prost, BoxedExpression}; +use risingwave_expr::expr::{build_non_strict_from_prost, BoxedExpression}; use risingwave_pb::plan_common::{JoinType as JoinTypeProto, StorageTableDesc}; use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_storage::table::Distribution; @@ -133,7 +133,10 @@ impl ExecutorBuilder for TemporalJoinExecutorBuilder { let null_safe = node.get_null_safe().to_vec(); let condition = match node.get_condition() { - Ok(cond_prost) => Some(build_from_prost(cond_prost)?), + Ok(cond_prost) => Some(build_non_strict_from_prost( + cond_prost, + params.eval_error_report, + )?), Err(_) => None, }; diff --git a/src/stream/src/from_proto/top_n.rs b/src/stream/src/from_proto/top_n.rs index cf8a0280522c2..f0aa967aae143 100644 --- a/src/stream/src/from_proto/top_n.rs +++ b/src/stream/src/from_proto/top_n.rs @@ -49,15 +49,21 @@ impl ExecutorBuilder for TopNExecutorBuilder { Ok($excutor::<_, $with_ties>::new( input, params.actor_context, + info, storage_key, (node.offset as usize, node.limit as usize), order_by, - params.executor_id, state_table, )? .boxed()) diff --git a/src/stream/src/from_proto/values.rs b/src/stream/src/from_proto/values.rs index 6857e1fc56ef9..077eea3511108 100644 --- a/src/stream/src/from_proto/values.rs +++ b/src/stream/src/from_proto/values.rs @@ -14,7 +14,7 @@ use itertools::Itertools; use risingwave_common::catalog::{Field, Schema}; -use risingwave_expr::expr::build_from_prost; +use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::ValuesNode; use risingwave_storage::StateStore; use tokio::sync::mpsc::unbounded_channel; @@ -53,7 +53,9 @@ impl ExecutorBuilder for ValuesExecutorBuilder { tuple .get_cells() .iter() - .map(|node| build_from_prost(node).unwrap()) + .map(|node| { + build_non_strict_from_prost(node, params.eval_error_report.clone()).unwrap() + }) .collect_vec() }) .collect_vec(); diff --git a/src/stream/src/from_proto/watermark_filter.rs b/src/stream/src/from_proto/watermark_filter.rs index 84b39288c7048..52c452115a4ce 100644 --- a/src/stream/src/from_proto/watermark_filter.rs +++ b/src/stream/src/from_proto/watermark_filter.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use risingwave_expr::expr::build_from_prost; +use risingwave_expr::expr::build_non_strict_from_prost; use risingwave_pb::stream_plan::WatermarkFilterNode; use super::*; @@ -36,7 +36,8 @@ impl ExecutorBuilder for WatermarkFilterBuilder { let [input]: [_; 1] = params.input.try_into().unwrap(); let watermark_descs = node.get_watermark_descs().clone(); let [watermark_desc]: [_; 1] = watermark_descs.try_into().unwrap(); - let watermark_expr = build_from_prost(&watermark_desc.expr.unwrap())?; + let watermark_expr = + build_non_strict_from_prost(&watermark_desc.expr.unwrap(), params.eval_error_report)?; let event_time_col_idx = watermark_desc.watermark_idx as usize; let vnodes = Arc::new( 
params @@ -55,6 +56,7 @@ impl ExecutorBuilder for WatermarkFilterBuilder { event_time_col_idx, params.actor_context, table, + params.executor_id, ) .boxed()) } diff --git a/src/stream/src/lib.rs b/src/stream/src/lib.rs index db1a3fe7819b6..5a68b1b712b26 100644 --- a/src/stream/src/lib.rs +++ b/src/stream/src/lib.rs @@ -18,7 +18,6 @@ #![feature(type_alias_impl_trait)] #![feature(more_qualified_paths)] #![feature(lint_reasons)] -#![feature(binary_heap_drain_sorted)] #![feature(let_chains)] #![feature(hash_extract_if)] #![feature(extract_if)] @@ -54,3 +53,6 @@ pub mod error; pub mod executor; mod from_proto; pub mod task; + +#[cfg(test)] +risingwave_expr_impl::enable!(); diff --git a/src/stream/src/task/barrier_manager.rs b/src/stream/src/task/barrier_manager.rs index 5581a8529c067..996881d3ff4b0 100644 --- a/src/stream/src/task/barrier_manager.rs +++ b/src/stream/src/task/barrier_manager.rs @@ -101,7 +101,7 @@ impl LocalBarrierManager { /// Register sender for source actors, used to send barriers. pub fn register_sender(&mut self, actor_id: ActorId, sender: UnboundedSender) { - tracing::trace!( + tracing::debug!( target: "events::stream::barrier::manager", actor_id = actor_id, "register sender" @@ -132,7 +132,7 @@ impl LocalBarrierManager { } }; let to_collect: HashSet = actor_ids_to_collect.into_iter().collect(); - trace!( + debug!( target: "events::stream::barrier::manager::send", "send barrier {:?}, senders = {:?}, actor_ids_to_collect = {:?}", barrier, @@ -172,7 +172,7 @@ impl LocalBarrierManager { // Actors to stop should still accept this barrier, but won't get sent to in next times. if let Some(actors) = barrier.all_stop_actors() { - trace!( + debug!( target: "events::stream::barrier::manager", "remove actors {:?} from senders", actors diff --git a/src/stream/src/task/barrier_manager/managed_state.rs b/src/stream/src/task/barrier_manager/managed_state.rs index c438272033831..43aeb4afba46b 100644 --- a/src/stream/src/task/barrier_manager/managed_state.rs +++ b/src/stream/src/task/barrier_manager/managed_state.rs @@ -112,14 +112,14 @@ impl ManagedBarrierState { .into_iter() .map(|(actor, state)| CreateMviewProgress { chain_actor_id: actor, - done: matches!(state, ChainState::Done), + done: matches!(state, ChainState::Done(_)), consumed_epoch: match state { ChainState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch, - ChainState::Done => epoch, + ChainState::Done(_) => epoch, }, consumed_rows: match state { ChainState::ConsumingUpstream(_, consumed_rows) => consumed_rows, - ChainState::Done => 0, + ChainState::Done(consumed_rows) => consumed_rows, }, }) .collect(); @@ -193,12 +193,10 @@ impl ManagedBarrierState { /// Collect a `barrier` from the actor with `actor_id`. 
pub(super) fn collect(&mut self, actor_id: ActorId, barrier: &Barrier) { - tracing::trace!( + tracing::debug!( target: "events::stream::barrier::manager::collect", - "collect_barrier: epoch = {}, actor_id = {}, state = {:#?}", - barrier.epoch.curr, - actor_id, - self + epoch = barrier.epoch.curr, actor_id, state = ?self, + "collect_barrier", ); match self.epoch_barrier_state_map.get_mut(&barrier.epoch.curr) { diff --git a/src/stream/src/task/barrier_manager/progress.rs b/src/stream/src/task/barrier_manager/progress.rs index adea59cdf656a..5abeab216cd00 100644 --- a/src/stream/src/task/barrier_manager/progress.rs +++ b/src/stream/src/task/barrier_manager/progress.rs @@ -23,7 +23,7 @@ type ConsumedRows = u64; #[derive(Debug, Clone, Copy)] pub(super) enum ChainState { ConsumingUpstream(ConsumedEpoch, ConsumedRows), - Done, + Done(ConsumedRows), } impl LocalBarrierManager { @@ -129,10 +129,15 @@ impl CreateMviewProgress { ) { match self.state { Some(ChainState::ConsumingUpstream(last, last_consumed_rows)) => { - assert!(last < consumed_epoch); + assert!( + last < consumed_epoch, + "last_epoch: {:#?} must be greater than consumed epoch: {:#?}", + last, + consumed_epoch + ); assert!(last_consumed_rows <= current_consumed_rows); } - Some(ChainState::Done) => unreachable!(), + Some(ChainState::Done(_)) => unreachable!(), None => {} }; self.update_inner( @@ -143,11 +148,11 @@ impl CreateMviewProgress { /// Finish the progress. If the progress is already finished, then perform no-op. /// `current_epoch` should be provided to locate the barrier under concurrent checkpoint. - pub fn finish(&mut self, current_epoch: u64) { - if let Some(ChainState::Done) = self.state { + pub fn finish(&mut self, current_epoch: u64, current_consumed_rows: ConsumedRows) { + if let Some(ChainState::Done(_)) = self.state { return; } - self.update_inner(current_epoch, ChainState::Done); + self.update_inner(current_epoch, ChainState::Done(current_consumed_rows)); } } diff --git a/src/stream/src/task/stream_manager.rs b/src/stream/src/task/stream_manager.rs index dc07c6986c8e0..f54eb9921f77c 100644 --- a/src/stream/src/task/stream_manager.rs +++ b/src/stream/src/task/stream_manager.rs @@ -105,10 +105,26 @@ pub struct LocalStreamManager { total_mem_val: Arc>, } +/// Report expression evaluation errors to the actor context. +/// +/// The struct can be cheaply cloned. +#[derive(Clone)] +pub struct ActorEvalErrorReport { + pub actor_context: ActorContextRef, + pub identity: Arc, +} + +impl risingwave_expr::expr::EvalErrorReport for ActorEvalErrorReport { + fn report(&self, err: risingwave_expr::ExprError) { + self.actor_context.on_compute_error(err, &self.identity); + } +} + pub struct ExecutorParams { pub env: StreamEnvironment, /// Indices of primary keys + // TODO: directly use it for `ExecutorInfo` pub pk_indices: PkIndices, /// Executor id, unique across all actors. @@ -117,12 +133,18 @@ pub struct ExecutorParams { /// Operator id, unique for each operator in fragment. pub operator_id: u64, - /// Information of the operator from plan node. + /// Information of the operator from plan node, like `StreamHashJoin { .. }`. + // TODO: use it for `identity` pub op_info: String, /// The output schema of the executor. + // TODO: directly use it for `ExecutorInfo` pub schema: Schema, + /// The identity of the executor, like `HashJoin 1234ABCD`. + // TODO: directly use it for `ExecutorInfo` + pub identity: String, + /// The input executor. 
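In `progress.rs` above, `ChainState::Done` now carries the consumed row count and `finish` takes that count, so a finished backfill keeps reporting how many rows it consumed. A condensed sketch of the state transition with simplified field types:

type ConsumedEpoch = u64;
type ConsumedRows = u64;

#[derive(Debug, Clone, Copy)]
enum ChainState {
    ConsumingUpstream(ConsumedEpoch, ConsumedRows),
    Done(ConsumedRows),
}

struct Progress {
    state: Option<ChainState>,
}

impl Progress {
    // Finishing is idempotent: once Done, later calls keep the recorded row count.
    fn finish(&mut self, consumed_rows: ConsumedRows) {
        if matches!(self.state, Some(ChainState::Done(_))) {
            return;
        }
        self.state = Some(ChainState::Done(consumed_rows));
    }
}

fn main() {
    let mut progress = Progress { state: Some(ChainState::ConsumingUpstream(5, 42)) };
    progress.finish(100);
    progress.finish(0); // no-op, the count from the first finish is kept
    assert!(matches!(progress.state, Some(ChainState::Done(100))));
}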
pub input: Vec, @@ -137,6 +159,9 @@ pub struct ExecutorParams { /// Vnodes owned by this executor. Represented in bitmap. pub vnode_bitmap: Option, + + /// Used for reporting expression evaluation errors. + pub eval_error_report: ActorEvalErrorReport, } impl Debug for ExecutorParams { @@ -479,7 +504,6 @@ impl LocalStreamManagerCore { &mut self, fragment_id: FragmentId, node: &stream_plan::StreamNode, - input_pos: usize, env: StreamEnvironment, store: impl StateStore, actor_context: &ActorContextRef, @@ -506,12 +530,11 @@ impl LocalStreamManagerCore { // Create the input executor before creating itself let mut input = Vec::with_capacity(node.input.iter().len()); - for (input_pos, input_stream_node) in node.input.iter().enumerate() { + for input_stream_node in &node.input { input.push( self.create_nodes_inner( fragment_id, input_stream_node, - input_pos, env.clone(), store.clone(), actor_context, @@ -536,12 +559,19 @@ impl LocalStreamManagerCore { let operator_id = unique_operator_id(fragment_id, node.operator_id); let schema = node.fields.iter().map(Field::from).collect(); + let identity = format!("{} {:X}", node.get_node_body().unwrap(), executor_id); + let eval_error_report = ActorEvalErrorReport { + actor_context: actor_context.clone(), + identity: identity.clone().into(), + }; + // Build the executor with params. let executor_params = ExecutorParams { env: env.clone(), - pk_indices, + pk_indices: pk_indices.clone(), executor_id, operator_id, + identity: identity.clone(), op_info, schema, input, @@ -549,17 +579,21 @@ impl LocalStreamManagerCore { executor_stats: self.streaming_metrics.clone(), actor_context: actor_context.clone(), vnode_bitmap, + eval_error_report, }; let executor = create_executor(executor_params, self, node, store).await?; + assert_eq!( + executor.pk_indices(), + &pk_indices, + "`pk_indices` of {} not consistent with what derived by optimizer", + executor.identity() + ); // Wrap the executor for debug purpose. let executor = WrapperExecutor::new( executor, - input_pos, - actor_context.id, - executor_id, - self.streaming_metrics.clone(), + actor_context.clone(), self.config.developer.enable_executor_row_count, ) .boxed(); @@ -595,7 +629,6 @@ impl LocalStreamManagerCore { self.create_nodes_inner( fragment_id, node, - 0, env, store, actor_context, @@ -682,13 +715,14 @@ impl LocalStreamManagerCore { { let metrics = self.streaming_metrics.clone(); let actor_id_str = actor_id.to_string(); + let fragment_id_str = actor_context.fragment_id.to_string(); let allocation_stated = task_stats_alloc::allocation_stat( instrumented, Duration::from_millis(1000), move |bytes| { metrics .actor_memory_usage - .with_label_values(&[&actor_id_str]) + .with_label_values(&[&actor_id_str, &fragment_id_str]) .set(bytes as i64); actor_context.store_mem_usage(bytes); diff --git a/src/stream/tests/integration_tests/eowc_over_window.rs b/src/stream/tests/integration_tests/eowc_over_window.rs index 35cc4954aff45..9407b6013dc03 100644 --- a/src/stream/tests/integration_tests/eowc_over_window.rs +++ b/src/stream/tests/integration_tests/eowc_over_window.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
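`create_nodes_inner` above now derives one identity string per executor, of the form `<NodeBody variant> <executor id in hex>`, and shares it between the executor info and the eval-error reporter. The formatting itself is just:

fn make_identity(node_body_variant: &str, executor_id: u64) -> String {
    // e.g. "HashJoin 1234ABCD", matching the doc comment on `ExecutorParams::identity`.
    format!("{} {:X}", node_body_variant, executor_id)
}

fn main() {
    assert_eq!(make_identity("HashJoin", 0x1234_ABCD), "HashJoin 1234ABCD");
}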
-use risingwave_expr::agg::{AggArgs, AggKind}; +use risingwave_expr::aggregate::{AggArgs, AggKind}; use risingwave_expr::window_function::{Frame, FrameBound, WindowFuncCall, WindowFuncKind}; use risingwave_stream::executor::{EowcOverWindowExecutor, EowcOverWindowExecutorArgs}; diff --git a/src/stream/tests/integration_tests/hash_agg.rs b/src/stream/tests/integration_tests/hash_agg.rs index 07ab974e24b46..1b61bc5cd1d7f 100644 --- a/src/stream/tests/integration_tests/hash_agg.rs +++ b/src/stream/tests/integration_tests/hash_agg.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use risingwave_expr::agg::AggCall; +use risingwave_expr::aggregate::AggCall; use risingwave_stream::executor::test_utils::agg_executor::new_boxed_hash_agg_executor; use crate::prelude::*; diff --git a/src/stream/tests/integration_tests/hop_window.rs b/src/stream/tests/integration_tests/hop_window.rs index fff1908fbcf69..167857cc7d9fc 100644 --- a/src/stream/tests/integration_tests/hop_window.rs +++ b/src/stream/tests/integration_tests/hop_window.rs @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use risingwave_common::cast::str_to_timestamp; use risingwave_common::types::test_utils::IntervalTestExt; -use risingwave_common::types::Interval; +use risingwave_common::types::{Interval, Timestamp}; use risingwave_expr::expr::test_utils::make_hop_window_expression; use risingwave_stream::executor::{ExecutorInfo, HopWindowExecutor}; @@ -70,28 +69,28 @@ fn push_watermarks(tx: &mut MessageSender) { tx.push_watermark( TIME_COL_IDX, DataType::Timestamp, - str_to_timestamp("2023-07-06 18:27:03").unwrap().into(), + "2023-07-06 18:27:03".parse::().unwrap().into(), ); tx.push_watermark( TIME_COL_IDX, DataType::Timestamp, - str_to_timestamp("2023-07-06 18:29:59").unwrap().into(), + "2023-07-06 18:29:59".parse::().unwrap().into(), ); tx.push_watermark( TIME_COL_IDX, DataType::Timestamp, - str_to_timestamp("2023-07-06 18:30:00").unwrap().into(), + "2023-07-06 18:30:00".parse::().unwrap().into(), ); tx.push_watermark(0, DataType::Int64, 100.into()); tx.push_watermark( TIME_COL_IDX, DataType::Timestamp, - str_to_timestamp("2023-07-06 18:43:40").unwrap().into(), + "2023-07-06 18:43:40".parse::().unwrap().into(), ); tx.push_watermark( TIME_COL_IDX, DataType::Timestamp, - str_to_timestamp("2023-07-06 18:50:00").unwrap().into(), + "2023-07-06 18:50:00".parse::().unwrap().into(), ); } diff --git a/src/stream/tests/integration_tests/main.rs b/src/stream/tests/integration_tests/main.rs index 01b4f4c4d899c..f57aea9bcec14 100644 --- a/src/stream/tests/integration_tests/main.rs +++ b/src/stream/tests/integration_tests/main.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +risingwave_expr_impl::enable!(); + // tests mod eowc_over_window; mod hash_agg; diff --git a/src/stream/tests/integration_tests/over_window.rs b/src/stream/tests/integration_tests/over_window.rs index 2377ce12e9147..f59a92df492f1 100644 --- a/src/stream/tests/integration_tests/over_window.rs +++ b/src/stream/tests/integration_tests/over_window.rs @@ -13,7 +13,7 @@ // limitations under the License. 
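The hop-window test above replaces the `str_to_timestamp` helper with `"...".parse::<Timestamp>()`. Outside the RisingWave crates the same idea can be sketched with chrono's explicit-format parser (chrono is an assumption here; the real `Timestamp` implements `FromStr` directly):

use chrono::NaiveDateTime;

fn main() {
    let ts = NaiveDateTime::parse_from_str("2023-07-06 18:27:03", "%Y-%m-%d %H:%M:%S").unwrap();
    assert_eq!(ts.to_string(), "2023-07-06 18:27:03");
}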
use risingwave_common::session_config::OverWindowCachePolicy; -use risingwave_expr::agg::{AggArgs, AggKind}; +use risingwave_expr::aggregate::{AggArgs, AggKind}; use risingwave_expr::window_function::{ Frame, FrameBound, FrameExclusion, WindowFuncCall, WindowFuncKind, }; diff --git a/src/tests/compaction_test/Cargo.toml b/src/tests/compaction_test/Cargo.toml index dd3e5d0a53699..87ad5946b26d5 100644 --- a/src/tests/compaction_test/Cargo.toml +++ b/src/tests/compaction_test/Cargo.toml @@ -27,6 +27,7 @@ risingwave_compactor = { workspace = true } risingwave_hummock_sdk = { workspace = true } risingwave_hummock_test = { workspace = true } risingwave_meta = { workspace = true } +risingwave_meta_node = { workspace = true } risingwave_object_store = { workspace = true } risingwave_pb = { workspace = true } risingwave_rpc_client = { workspace = true } diff --git a/src/tests/compaction_test/src/compaction_test_runner.rs b/src/tests/compaction_test/src/compaction_test_runner.rs index 3e2f993cf9613..cf3e35b48c692 100644 --- a/src/tests/compaction_test/src/compaction_test_runner.rs +++ b/src/tests/compaction_test/src/compaction_test_runner.rs @@ -129,7 +129,7 @@ pub async fn compaction_test_main( } pub async fn start_meta_node(listen_addr: String, state_store: String, config_path: String) { - let meta_opts = risingwave_meta::MetaNodeOpts::parse_from([ + let meta_opts = risingwave_meta_node::MetaNodeOpts::parse_from([ "meta-node", "--listen-addr", &listen_addr, @@ -154,7 +154,7 @@ pub async fn start_meta_node(listen_addr: String, state_store: String, config_pa "enable_compaction_deterministic should be set" ); - risingwave_meta::start(meta_opts).await + risingwave_meta_node::start(meta_opts).await } async fn start_compactor_node( diff --git a/src/tests/compaction_test/src/delete_range_runner.rs b/src/tests/compaction_test/src/delete_range_runner.rs index 5b53ba70b86bd..346cf2fe6acf8 100644 --- a/src/tests/compaction_test/src/delete_range_runner.rs +++ b/src/tests/compaction_test/src/delete_range_runner.rs @@ -37,7 +37,7 @@ use risingwave_meta::hummock::test_utils::setup_compute_env_with_config; use risingwave_meta::hummock::MockHummockMetaClient; use risingwave_object_store::object::object_metrics::ObjectStoreMetrics; use risingwave_object_store::object::parse_remote_object_store; -use risingwave_pb::catalog::{PbStreamJobStatus, PbTable}; +use risingwave_pb::catalog::{PbCreateType, PbStreamJobStatus, PbTable}; use risingwave_pb::hummock::{CompactionConfig, CompactionGroupInfo}; use risingwave_pb::meta::SystemParams; use risingwave_rpc_client::HummockMetaClient; @@ -91,7 +91,8 @@ pub fn start_delete_range(opts: CompactionTestOpts) -> Pin anyhow::Result<()> { let config = load_config(&opts.config_path, NoOverride); - let compaction_config = CompactionConfigBuilder::new().build(); + let compaction_config = + CompactionConfigBuilder::with_opt(&config.meta.compaction_config).build(); compaction_test( compaction_config, config, @@ -152,6 +153,7 @@ async fn compaction_test( created_at_epoch: None, cleaned_by_watermark: false, stream_job_status: PbStreamJobStatus::Created.into(), + create_type: PbCreateType::Foreground.into(), }; let mut delete_range_table = delete_key_table.clone(); delete_range_table.id = 2; @@ -209,6 +211,7 @@ async fn compaction_test( 0, FileCache::none(), FileCache::none(), + None, )); let store = HummockStorage::new( @@ -581,21 +584,27 @@ fn run_compactor_thread( tokio::task::JoinHandle<()>, tokio::sync::oneshot::Sender<()>, ) { + let filter_key_extractor_manager = + 
FilterKeyExtractorManager::RpcFilterKeyExtractorManager(filter_key_extractor_manager); let compactor_context = CompactorContext { storage_opts, sstable_store, compactor_metrics, is_share_buffer_compact: false, compaction_executor: Arc::new(CompactionExecutor::new(None)), - filter_key_extractor_manager: FilterKeyExtractorManager::RpcFilterKeyExtractorManager( - filter_key_extractor_manager, - ), + memory_limiter: MemoryLimiter::unlimit(), task_progress_manager: Default::default(), await_tree_reg: None, running_task_count: Arc::new(AtomicU32::new(0)), }; - start_compactor(compactor_context, meta_client, sstable_object_id_manager) + + start_compactor( + compactor_context, + meta_client, + sstable_object_id_manager, + filter_key_extractor_manager, + ) } #[cfg(test)] diff --git a/src/tests/libpq_test/Cargo.lock b/src/tests/libpq_test/Cargo.lock index 38710822ca506..74beef901a9ea 100644 --- a/src/tests/libpq_test/Cargo.lock +++ b/src/tests/libpq_test/Cargo.lock @@ -96,9 +96,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.3.3" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "cc" @@ -249,9 +249,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libloading" @@ -288,9 +288,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.3" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "log" @@ -402,11 +402,11 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.4" +version = "0.38.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" +checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.4.1", "errno", "libc", "linux-raw-sys", diff --git a/src/tests/regress/data/schedule b/src/tests/regress/data/schedule index b735b1b6f332e..90fd97b9ffec0 100644 --- a/src/tests/regress/data/schedule +++ b/src/tests/regress/data/schedule @@ -11,3 +11,4 @@ test: boolean varchar text int2 int4 int8 float4 float8 comments test: strings date time timestamp interval test: case arrays test: jsonb +test: regex diff --git a/src/tests/regress/data/sql/regex.sql b/src/tests/regress/data/sql/regex.sql index b03a8d9ac220b..280748746f407 100644 --- a/src/tests/regress/data/sql/regex.sql +++ b/src/tests/regress/data/sql/regex.sql @@ -21,16 +21,16 @@ select 'abc abd abc' ~ '^(.+)( \1)+$' as f; select 'abc abc abd' ~ '^(.+)( \1)+$' as f; -- Test some cases that crashed in 9.2beta1 due to pmatch[] array overrun -select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)'); -select substring('a' from '((a))+'); -select substring('a' from 
'((a)+)'); +--@ select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)'); +--@ select substring('a' from '((a))+'); +--@ select substring('a' from '((a)+)'); -- Test regexp_match() select regexp_match('abc', ''); select regexp_match('abc', 'bc'); select regexp_match('abc', 'd') is null; select regexp_match('abc', '(B)(c)', 'i'); -select regexp_match('abc', 'Bd', 'ig'); -- error +--@ select regexp_match('abc', 'Bd', 'ig'); -- error -- Test lookahead constraints select regexp_matches('ab', 'a(?=b)b*'); @@ -47,7 +47,7 @@ select regexp_matches('abb', '(?<=a)b*'); select regexp_matches('a', 'a(?<=a)b*'); select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*'); select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*'); -select regexp_matches('ab', 'a*(? Result<()> { + self.run("FLUSH").await?; + Ok(()) + } } /// Options for killing nodes. diff --git a/src/tests/simulation/src/lib.rs b/src/tests/simulation/src/lib.rs index 68c1d0446944d..6cf880d7d66fb 100644 --- a/src/tests/simulation/src/lib.rs +++ b/src/tests/simulation/src/lib.rs @@ -23,3 +23,5 @@ pub mod kafka; pub mod nexmark; pub mod slt; pub mod utils; + +risingwave_expr_impl::enable!(); diff --git a/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs b/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs new file mode 100644 index 0000000000000..1fd5c90e59e4b --- /dev/null +++ b/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs @@ -0,0 +1,96 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use anyhow::Result; +use risingwave_simulation::cluster::{Cluster, Configuration, KillOpts}; +use risingwave_simulation::utils::AssertResult; +use tokio::time::sleep; + +async fn kill_cn_and_wait_recover(cluster: &Cluster) { + // Kill it again + for _ in 0..5 { + cluster + .kill_node(&KillOpts { + kill_rate: 1.0, + kill_meta: false, + kill_frontend: false, + kill_compute: true, + kill_compactor: false, + restart_delay_secs: 1, + }) + .await; + sleep(Duration::from_secs(2)).await; + } + sleep(Duration::from_secs(10)).await; +} + +async fn kill_and_wait_recover(cluster: &Cluster) { + // Kill it again + for _ in 0..5 { + sleep(Duration::from_secs(2)).await; + cluster.kill_node(&KillOpts::ALL).await; + } + sleep(Duration::from_secs(20)).await; +} + +#[tokio::test] +async fn test_background_mv_barrier_recovery() -> Result<()> { + let mut cluster = Cluster::start(Configuration::for_backfill()).await?; + let mut session = cluster.start_session(); + + session.run("CREATE TABLE t1 (v1 int);").await?; + session + .run("INSERT INTO t1 select * from generate_series(1, 400000)") + .await?; + session.run("flush").await?; + session.run("SET BACKGROUND_DDL=true;").await?; + session + .run("create materialized view m1 as select * from t1;") + .await?; + + kill_cn_and_wait_recover(&cluster).await; + + // Send some upstream updates. 
+ cluster + .run("INSERT INTO t1 select * from generate_series(1, 100000);") + .await?; + cluster.run("flush;").await?; + + kill_cn_and_wait_recover(&cluster).await; + + kill_and_wait_recover(&cluster).await; + + // Send some upstream updates. + cluster + .run("INSERT INTO t1 select * from generate_series(1, 100000);") + .await?; + cluster.run("flush;").await?; + + // Now just wait for it to complete. + + sleep(Duration::from_secs(10)).await; + + // Make sure that after it finishes, we have 600_000 rows in total. + session + .run("SELECT COUNT(v1) FROM m1") + .await? + .assert_result_eq("600000"); + + session.run("DROP MATERIALIZED VIEW m1").await?; + session.run("DROP TABLE t1").await?; + + Ok(()) +} diff --git a/src/tests/simulation/tests/integration_tests/recovery/mod.rs b/src/tests/simulation/tests/integration_tests/recovery/mod.rs index 565487e8d7dbd..2430daad760a1 100644 --- a/src/tests/simulation/tests/integration_tests/recovery/mod.rs +++ b/src/tests/simulation/tests/integration_tests/recovery/mod.rs @@ -13,5 +13,6 @@ // limitations under the License. mod backfill; +mod background_ddl; mod nexmark_recovery; mod pause_on_bootstrap; diff --git a/src/tests/simulation/tests/integration_tests/recovery/pause_on_bootstrap.rs b/src/tests/simulation/tests/integration_tests/recovery/pause_on_bootstrap.rs index d0288e6931e88..0eea61da67dfb 100644 --- a/src/tests/simulation/tests/integration_tests/recovery/pause_on_bootstrap.rs +++ b/src/tests/simulation/tests/integration_tests/recovery/pause_on_bootstrap.rs @@ -15,29 +15,43 @@ use std::time::Duration; use anyhow::Result; -use risingwave_simulation::cluster::Configuration; +use risingwave_simulation::cluster::{Cluster, Configuration}; use risingwave_simulation::nexmark::NexmarkCluster; use risingwave_simulation::utils::AssertResult; use tokio::time::{sleep, timeout}; -const CREATE_TABLE: &str = "CREATE TABLE t (v int)"; -const INSERT_INTO_TABLE: &str = "INSERT INTO t VALUES (1)"; -const SELECT_COUNT_TABLE: &str = "SELECT COUNT(*) FROM t"; - -const CREATE: &str = "CREATE MATERIALIZED VIEW count_bid as SELECT COUNT(*) FROM bid"; -const SELECT: &str = "SELECT * FROM count_bid"; - -const CREATE_2: &str = "CREATE MATERIALIZED VIEW count_auction as SELECT COUNT(*) FROM auction"; -const SELECT_2: &str = "SELECT * FROM count_auction"; - const SET_PARAMETER: &str = "ALTER SYSTEM SET pause_on_next_bootstrap TO true"; +#[derive(Clone, Copy)] enum ResumeBy { Risectl, Restart, } +impl ResumeBy { + async fn resume(self, cluster: &mut Cluster) -> Result<()> { + match self { + ResumeBy::Risectl => cluster.resume().await?, + ResumeBy::Restart => cluster.kill_nodes(["meta-1"], 0).await, + }; + Ok(()) + } +} + async fn test_impl(resume_by: ResumeBy) -> Result<()> { + const CREATE_TABLE: &str = "CREATE TABLE t (v int)"; + const INSERT_INTO_TABLE: &str = "INSERT INTO t VALUES (1)"; + const SELECT_COUNT_TABLE: &str = "SELECT COUNT(*) FROM t"; + + const CREATE: &str = "CREATE MATERIALIZED VIEW count_bid as SELECT COUNT(*) FROM bid"; + const SELECT: &str = "SELECT * FROM count_bid"; + + const CREATE_2: &str = "CREATE MATERIALIZED VIEW count_auction as SELECT COUNT(*) FROM auction"; + const SELECT_2: &str = "SELECT * FROM count_auction"; + + const CREATE_VALUES: &str = "CREATE MATERIALIZED VIEW values as VALUES (1), (2), (3)"; + const SELECT_VALUES: &str = "SELECT count(*) FROM values"; + let mut cluster = NexmarkCluster::new( Configuration { meta_nodes: 1, @@ -77,18 +91,21 @@ async fn test_impl(resume_by: ResumeBy) -> Result<()> { // New streaming jobs should also start from
paused. cluster.run(CREATE_2).await?; sleep(Duration::from_secs(10)).await; - cluster.run(SELECT_2).await?.assert_result_eq("0"); // even there's no data from source, the + cluster.run(SELECT_2).await?.assert_result_eq("0"); // even there's no data from source, the aggregation // result will be 0 instead of empty or NULL + // `VALUES` should also be paused. + tokio::time::timeout(Duration::from_secs(10), cluster.run(CREATE_VALUES)) + .await + .expect_err("`VALUES` should be paused so creation should never complete"); + // DML on tables should be blocked. let result = timeout(Duration::from_secs(10), cluster.run(INSERT_INTO_TABLE)).await; assert!(result.is_err()); cluster.run(SELECT_COUNT_TABLE).await?.assert_result_eq("0"); - match resume_by { - ResumeBy::Risectl => cluster.resume().await?, - ResumeBy::Restart => cluster.kill_nodes(["meta-1"], 0).await, - } + // Resume the cluster. + resume_by.resume(&mut cluster).await?; sleep(Duration::from_secs(10)).await; // The source should be resumed. @@ -100,17 +117,22 @@ async fn test_impl(resume_by: ResumeBy) -> Result<()> { { let mut session = cluster.start_session(); - session.run("FLUSH").await?; + session.flush().await?; let count: i64 = session.run(SELECT_COUNT_TABLE).await?.parse().unwrap(); session.run(INSERT_INTO_TABLE).await?; - session.run("FLUSH").await?; + session.flush().await?; session .run(SELECT_COUNT_TABLE) .await? .assert_result_eq(format!("{}", count + 1)); } + if let ResumeBy::Risectl = resume_by { + // `VALUES` should be successfully created + cluster.run(SELECT_VALUES).await?.assert_result_eq("3"); + } + Ok(()) } @@ -123,3 +145,64 @@ async fn test_pause_on_bootstrap_resume_by_risectl() -> Result<()> { async fn test_pause_on_bootstrap_resume_by_restart() -> Result<()> { test_impl(ResumeBy::Restart).await } + +// The idea is similar to `e2e_test/batch/transaction/now.slt`. +async fn test_temporal_filter(resume_by: ResumeBy) -> Result<()> { + const CREATE_TABLE: &str = "create table t (ts timestamp)"; + const CREATE_TEMPORAL_FILTER: &str = + "create materialized view mv as select count(*) from t where ts at time zone 'utc' >= now()"; + const INSERT_TIMESTAMPS: &str = " + insert into t select * from generate_series( + now() at time zone 'utc' - interval '10' second, + now() at time zone 'utc' + interval '20' second, + interval '1' second / 20 + ); + "; + const SELECT: &str = "select * from mv"; + + let mut cluster = Cluster::start(Configuration { + meta_nodes: 1, + ..Configuration::for_scale() + }) + .await?; + + cluster.run(SET_PARAMETER).await?; + + { + let mut session = cluster.start_session(); + session.run(CREATE_TABLE).await?; + session.run(CREATE_TEMPORAL_FILTER).await?; + session.run(INSERT_TIMESTAMPS).await?; + session.flush().await?; + }; + + // Kill the meta node and wait for the service to recover. + cluster.kill_nodes(["meta-1"], 0).await; + sleep(Duration::from_secs(10)).await; + + let count: i32 = cluster.run(SELECT).await?.parse()?; + assert_ne!(count, 0, "the following tests are meaningless"); + + sleep(Duration::from_secs(10)).await; + let new_count: i32 = cluster.run(SELECT).await?.parse()?; + assert_eq!(count, new_count, "temporal filter should have been paused"); + + // Resume the cluster. 
+ resume_by.resume(&mut cluster).await?; + sleep(Duration::from_secs(40)).await; // 40 seconds is enough for all timestamps to be expired + + let count: i32 = cluster.run(SELECT).await?.parse()?; + assert_eq!(count, 0, "temporal filter should have been resumed"); + + Ok(()) +} + +#[tokio::test] +async fn test_pause_on_bootstrap_temporal_filter_resume_by_risectl() -> Result<()> { + test_temporal_filter(ResumeBy::Risectl).await +} + +#[tokio::test] +async fn test_pause_on_bootstrap_temporal_filter_resume_by_restart() -> Result<()> { + test_temporal_filter(ResumeBy::Restart).await +} diff --git a/src/tests/simulation/tests/integration_tests/sink/basic.rs b/src/tests/simulation/tests/integration_tests/sink/basic.rs index c0f9f7253f373..bceb45a8a2389 100644 --- a/src/tests/simulation/tests/integration_tests/sink/basic.rs +++ b/src/tests/simulation/tests/integration_tests/sink/basic.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::io::Write; +use std::iter::once; use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering::Relaxed; use std::sync::Arc; @@ -20,15 +21,24 @@ use std::time::Duration; use anyhow::Result; use async_trait::async_trait; +use futures::stream::select_all; +use futures::StreamExt; use itertools::Itertools; use rand::prelude::SliceRandom; -use risingwave_common::array::StreamChunk; +use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::buffer::Bitmap; +use risingwave_common::types::{DataType, ScalarImpl}; +use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_connector::sink::boxed::{BoxCoordinator, BoxWriter}; use risingwave_connector::sink::test_sink::registry_build_sink; -use risingwave_connector::sink::{Sink, SinkWriter, SinkWriterParam}; +use risingwave_connector::sink::writer::SinkWriter; +use risingwave_connector::sink::{Sink, SinkWriterParam}; +use risingwave_connector::source::test_source::{registry_test_source, BoxSource, TestSourceSplit}; +use risingwave_connector::source::StreamChunkWithState; use risingwave_simulation::cluster::{Cluster, ConfigPath, Configuration}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver}; use tokio::time::sleep; +use tokio_stream::wrappers::UnboundedReceiverStream; struct TestWriter { row_counter: Arc, @@ -65,30 +75,19 @@ impl Drop for TestWriter { } } -struct TestSink { - row_counter: Arc, - parallelism_counter: Arc, -} - -#[async_trait] -impl Sink for TestSink { - type Coordinator = BoxCoordinator; - type Writer = BoxWriter<()>; - - async fn validate(&self) -> risingwave_connector::sink::Result<()> { - Ok(()) - } - - async fn new_writer( - &self, - _writer_param: SinkWriterParam, - ) -> risingwave_connector::sink::Result { - self.parallelism_counter.fetch_add(1, Relaxed); - Ok(Box::new(TestWriter { - parallelism_counter: self.parallelism_counter.clone(), - row_counter: self.row_counter.clone(), - })) +fn build_stream_chunk(row_iter: impl Iterator) -> StreamChunk { + let mut builder = DataChunkBuilder::new(vec![DataType::Int32, DataType::Varchar], 100000); + for (id, name) in row_iter { + assert!(builder + .append_one_row([ + Some(ScalarImpl::Int32(id)), + Some(ScalarImpl::Utf8(name.into())), + ]) + .is_none()); } + let chunk = builder.consume_all().unwrap(); + let ops = (0..chunk.cardinality()).map(|_| Op::Insert).collect_vec(); + StreamChunk::from_parts(ops, chunk) } #[tokio::test] @@ -118,40 +117,67 @@ async fn test_sink_basic() -> Result<()> { let _sink_guard = registry_build_sink({ let row_counter = row_counter.clone(); let parallelism_counter = 
parallelism_counter.clone(); - move |_param| { - Ok(Box::new(TestSink { + move |_, _| { + parallelism_counter.fetch_add(1, Relaxed); + Box::new(TestWriter { row_counter: row_counter.clone(), parallelism_counter: parallelism_counter.clone(), - })) + }) } }); + let source_parallelism = 12; + let mut txs = Vec::new(); + let mut rxs = Vec::new(); + for _ in 0..source_parallelism { + let (tx, rx): (_, UnboundedReceiver) = unbounded_channel(); + txs.push(tx); + rxs.push(Some(rx)); + } + + let _source_guard = registry_test_source(BoxSource::new( + move |_, _| { + Ok((0..source_parallelism) + .map(|i: usize| TestSourceSplit { + id: format!("{}", i).as_str().into(), + properties: Default::default(), + offset: "".to_string(), + }) + .collect_vec()) + }, + move |_, splits, _, _, _| { + select_all(splits.into_iter().map(|split| { + let id: usize = split.id.parse().unwrap(); + let rx = rxs[id].take().unwrap(); + UnboundedReceiverStream::new(rx).map(|chunk| Ok(StreamChunkWithState::from(chunk))) + })) + .boxed() + }, + )); + let mut session = cluster.start_session(); session.run("set streaming_parallelism = 6").await?; session.run("set sink_decouple = false").await?; session - .run("create table test_table (id int, name varchar)") + .run("create table test_table (id int primary key, name varchar) with (connector = 'test') FORMAT PLAIN ENCODE JSON") .await?; session .run("create sink test_sink from test_table with (connector = 'test')") .await?; let mut count = 0; - let mut id_list = (0..100000).collect_vec(); + let mut id_list: Vec = (0..100000).collect_vec(); id_list.shuffle(&mut rand::thread_rng()); let flush_freq = 50; - for id in &id_list[0..1000] { - session - .run(format!( - "insert into test_table values ({}, 'name-{}')", - id, id - )) - .await?; + for id in &id_list[0..10000] { + let chunk = build_stream_chunk(once((*id as i32, format!("name-{}", id)))); + txs[id % source_parallelism].send(chunk).unwrap(); count += 1; if count % flush_freq == 0 { - session.run("flush").await?; + sleep(Duration::from_millis(10)).await; } } + sleep(Duration::from_millis(10000)).await; assert_eq!(6, parallelism_counter.load(Relaxed)); assert_eq!(count, row_counter.load(Relaxed)); @@ -190,20 +216,50 @@ async fn test_sink_decouple_basic() -> Result<()> { let _sink_guard = registry_build_sink({ let row_counter = row_counter.clone(); let parallelism_counter = parallelism_counter.clone(); - move |_param| { - Ok(Box::new(TestSink { + move |_, _| { + parallelism_counter.fetch_add(1, Relaxed); + Box::new(TestWriter { row_counter: row_counter.clone(), parallelism_counter: parallelism_counter.clone(), - })) + }) } }); + let source_parallelism = 12; + let mut txs = Vec::new(); + let mut rxs = Vec::new(); + for _ in 0..source_parallelism { + let (tx, rx): (_, UnboundedReceiver) = unbounded_channel(); + txs.push(tx); + rxs.push(Some(rx)); + } + + let _source_guard = registry_test_source(BoxSource::new( + move |_, _| { + Ok((0..source_parallelism) + .map(|i: usize| TestSourceSplit { + id: format!("{}", i).as_str().into(), + properties: Default::default(), + offset: "".to_string(), + }) + .collect_vec()) + }, + move |_, splits, _, _, _| { + select_all(splits.into_iter().map(|split| { + let id: usize = split.id.parse().unwrap(); + let rx = rxs[id].take().unwrap(); + UnboundedReceiverStream::new(rx).map(|chunk| Ok(StreamChunkWithState::from(chunk))) + })) + .boxed() + }, + )); + let mut session = cluster.start_session(); session.run("set streaming_parallelism = 6").await?; session.run("set sink_decouple = true").await?; session - 
.run("create table test_table (id int, name varchar)") + .run("create table test_table (id int primary key, name varchar) with (connector = 'test') FORMAT PLAIN ENCODE JSON") .await?; session .run("create sink test_sink from test_table with (connector = 'test')") @@ -214,16 +270,12 @@ async fn test_sink_decouple_basic() -> Result<()> { let mut id_list = (0..100000).collect_vec(); id_list.shuffle(&mut rand::thread_rng()); let flush_freq = 50; - for id in &id_list[0..1000] { - session - .run(format!( - "insert into test_table values ({}, 'name-{}')", - id, id - )) - .await?; + for id in &id_list[0..10000] { + let chunk = build_stream_chunk(once((*id as i32, format!("name-{}", id)))); + txs[id % source_parallelism].send(chunk).unwrap(); count += 1; if count % flush_freq == 0 { - session.run("flush").await?; + sleep(Duration::from_millis(10)).await; } } @@ -239,3 +291,81 @@ async fn test_sink_decouple_basic() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_sink_decouple_blackhole() -> Result<()> { + let config_path = { + let mut file = tempfile::NamedTempFile::new().expect("failed to create temp config file"); + file.write_all(include_bytes!("../../../../../config/ci-sim.toml")) + .expect("failed to write config file"); + file.into_temp_path() + }; + + let mut cluster = Cluster::start(Configuration { + config_path: ConfigPath::Temp(config_path.into()), + frontend_nodes: 1, + compute_nodes: 3, + meta_nodes: 1, + compactor_nodes: 1, + compute_node_cores: 2, + etcd_timeout_rate: 0.0, + etcd_data_path: None, + }) + .await?; + + let source_parallelism = 12; + let mut txs = Vec::new(); + let mut rxs = Vec::new(); + for _ in 0..source_parallelism { + let (tx, rx): (_, UnboundedReceiver) = unbounded_channel(); + txs.push(tx); + rxs.push(Some(rx)); + } + + let _source_guard = registry_test_source(BoxSource::new( + move |_, _| { + Ok((0..source_parallelism) + .map(|i: usize| TestSourceSplit { + id: format!("{}", i).as_str().into(), + properties: Default::default(), + offset: "".to_string(), + }) + .collect_vec()) + }, + move |_, splits, _, _, _| { + select_all(splits.into_iter().map(|split| { + let id: usize = split.id.parse().unwrap(); + let rx = rxs[id].take().unwrap(); + UnboundedReceiverStream::new(rx).map(|chunk| Ok(StreamChunkWithState::from(chunk))) + })) + .boxed() + }, + )); + + let mut session = cluster.start_session(); + + session.run("set streaming_parallelism = 6").await?; + session.run("set sink_decouple = true").await?; + session + .run("create table test_table (id int primary key, name varchar) with (connector = 'test') FORMAT PLAIN ENCODE JSON") + .await?; + session + .run("create sink test_sink from test_table with (connector = 'blackhole')") + .await?; + + let mut count = 0; + let mut id_list = (0..100000).collect_vec(); + id_list.shuffle(&mut rand::thread_rng()); + let flush_freq = 50; + for id in &id_list[0..10000] { + let chunk = build_stream_chunk(once((*id as i32, format!("name-{}", id)))); + txs[id % source_parallelism].send(chunk).unwrap(); + count += 1; + if count % flush_freq == 0 { + sleep(Duration::from_millis(10)).await; + } + } + + session.run("drop sink test_sink").await?; + Ok(()) +} diff --git a/src/tests/sqlsmith/Cargo.toml b/src/tests/sqlsmith/Cargo.toml index 57acbc8d94cca..402c6119cd1cf 100644 --- a/src/tests/sqlsmith/Cargo.toml +++ b/src/tests/sqlsmith/Cargo.toml @@ -23,10 +23,11 @@ rand_chacha = { version = "0.3.1" } regex = "1" risingwave_common = { workspace = true } risingwave_expr = { workspace = true } +risingwave_expr_impl = { workspace = true } 
risingwave_frontend = { workspace = true } risingwave_pb = { workspace = true } risingwave_sqlparser = { workspace = true } -similar = "2.2.1" +similar = "2.3.0" tokio = { version = "0.2", package = "madsim-tokio" } tokio-postgres = "0.7" tracing = "0.1" diff --git a/src/tests/sqlsmith/src/lib.rs b/src/tests/sqlsmith/src/lib.rs index 2f7e1ce5eb14b..ebb0682a7aaaf 100644 --- a/src/tests/sqlsmith/src/lib.rs +++ b/src/tests/sqlsmith/src/lib.rs @@ -17,6 +17,8 @@ #![feature(lazy_cell)] #![feature(box_patterns)] +risingwave_expr_impl::enable!(); + use std::collections::{HashMap, HashSet}; use anyhow::{bail, Result}; diff --git a/src/tests/sqlsmith/src/sql_gen/agg.rs b/src/tests/sqlsmith/src/sql_gen/agg.rs index 6c1bd2bd26dcf..c42eb6c7b0ffc 100644 --- a/src/tests/sqlsmith/src/sql_gen/agg.rs +++ b/src/tests/sqlsmith/src/sql_gen/agg.rs @@ -15,7 +15,8 @@ use rand::seq::SliceRandom; use rand::Rng; use risingwave_common::types::DataType; -use risingwave_expr::agg::AggKind; +use risingwave_expr::aggregate::AggKind; +use risingwave_expr::sig::SigDataType; use risingwave_sqlparser::ast::{ Expr, Function, FunctionArg, FunctionArgExpr, Ident, ObjectName, OrderByExpr, }; @@ -30,13 +31,12 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { Some(funcs) => funcs, }; let func = funcs.choose(&mut self.rng).unwrap(); - if matches!( - (func.func, func.inputs_type.as_slice()), - ( - AggKind::Min | AggKind::Max, - [DataType::Boolean | DataType::Jsonb] + if matches!(func.name.as_aggregate(), AggKind::Min | AggKind::Max) + && matches!( + func.ret_type, + SigDataType::Exact(DataType::Boolean | DataType::Jsonb) ) - ) { + { return self.gen_simple_scalar(ret); } @@ -45,13 +45,13 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { let exprs: Vec = func .inputs_type .iter() - .map(|t| self.gen_expr(t, context)) + .map(|t| self.gen_expr(t.as_exact(), context)) .collect(); // DISTINCT now only works with agg kinds except `ApproxCountDistinct`, and with at least // one argument and only the first being non-constant. See `Binder::bind_normal_agg` // for more details. - let distinct_allowed = func.func != AggKind::ApproxCountDistinct + let distinct_allowed = func.name.as_aggregate() != AggKind::ApproxCountDistinct && !exprs.is_empty() && exprs.iter().skip(1).all(|e| matches!(e, Expr::Value(_))); let distinct = distinct_allowed && self.flip_coin(); @@ -79,7 +79,7 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { } else { vec![] }; - self.make_agg_expr(func.func, &exprs, distinct, filter, order_by) + self.make_agg_expr(func.name.as_aggregate(), &exprs, distinct, filter, order_by) .unwrap_or_else(|| self.gen_simple_scalar(ret)) } diff --git a/src/tests/sqlsmith/src/sql_gen/expr.rs b/src/tests/sqlsmith/src/sql_gen/expr.rs index f9772c97d4b5c..9999dcd9ea641 100644 --- a/src/tests/sqlsmith/src/sql_gen/expr.rs +++ b/src/tests/sqlsmith/src/sql_gen/expr.rs @@ -16,7 +16,8 @@ use itertools::Itertools; use rand::seq::SliceRandom; use rand::Rng; use risingwave_common::types::{DataType, DataTypeName, StructType}; -use risingwave_frontend::expr::{agg_func_sigs, cast_sigs, func_sigs}; +use risingwave_expr::sig::cast::cast_sigs; +use risingwave_expr::sig::FUNCTION_REGISTRY; use risingwave_sqlparser::ast::{Expr, Ident, OrderByExpr, Value}; use crate::sql_gen::types::data_type_to_ast_data_type; @@ -302,29 +303,25 @@ pub(crate) fn sql_null() -> Expr { // Add variadic function signatures. Can add these functions // to a FUNC_TABLE too. 
pub fn print_function_table() -> String { - let func_str = func_sigs() + let func_str = FUNCTION_REGISTRY + .iter_scalars() .map(|sign| { format!( - "{:?}({}) -> {:?}", - sign.func, - sign.inputs_type - .iter() - .map(|arg| format!("{:?}", arg)) - .join(", "), + "{}({}) -> {}", + sign.name, + sign.inputs_type.iter().format(", "), sign.ret_type, ) }) .join("\n"); - let agg_func_str = agg_func_sigs() + let agg_func_str = FUNCTION_REGISTRY + .iter_aggregates() .map(|sign| { format!( - "{:?}({}) -> {:?}", - sign.func, - sign.inputs_type - .iter() - .map(|arg| format!("{:?}", arg)) - .join(", "), + "{}({}) -> {}", + sign.name, + sign.inputs_type.iter().format(", "), sign.ret_type, ) }) diff --git a/src/tests/sqlsmith/src/sql_gen/functions.rs b/src/tests/sqlsmith/src/sql_gen/functions.rs index 6af491bd8a64d..01cbb0604d262 100644 --- a/src/tests/sqlsmith/src/sql_gen/functions.rs +++ b/src/tests/sqlsmith/src/sql_gen/functions.rs @@ -49,6 +49,7 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { 4 => self.gen_overlay(context), _ => unreachable!(), }, + T::Bytea => self.gen_decode(context), _ => match self.rng.gen_bool(0.5) { true => self.gen_case(ret, context), false => self.gen_coalesce(ret, context), @@ -121,36 +122,46 @@ impl<'a, R: Rng> SqlGenerator<'a, R> { .collect() } + fn gen_decode(&mut self, context: SqlGeneratorContext) -> Expr { + let input_string = self.gen_expr(&DataType::Varchar, context); + let encoding = &["base64", "hex", "escape"].choose(&mut self.rng).unwrap(); + let args = vec![ + input_string, + Expr::Value(Value::SingleQuotedString(encoding.to_string())), + ]; + Expr::Function(make_simple_func("decode", &args)) + } + fn gen_fixed_func(&mut self, ret: &DataType, context: SqlGeneratorContext) -> Expr { let funcs = match FUNC_TABLE.get(ret) { None => return self.gen_simple_scalar(ret), Some(funcs) => funcs, }; let func = funcs.choose(&mut self.rng).unwrap(); - let can_implicit_cast = INVARIANT_FUNC_SET.contains(&func.func); + let can_implicit_cast = INVARIANT_FUNC_SET.contains(&func.name.as_scalar()); let exprs: Vec = func .inputs_type .iter() .map(|t| { - if let Some(from_tys) = IMPLICIT_CAST_TABLE.get(t) + if let Some(from_tys) = IMPLICIT_CAST_TABLE.get(t.as_exact()) && can_implicit_cast && self.flip_coin() { let from_ty = &from_tys.choose(&mut self.rng).unwrap().from_type; self.gen_implicit_cast(from_ty, context) } else { - self.gen_expr(t, context) + self.gen_expr(t.as_exact(), context) } }) .collect(); let expr = if exprs.len() == 1 { - make_unary_op(func.func, &exprs[0]) + make_unary_op(func.name.as_scalar(), &exprs[0]) } else if exprs.len() == 2 { - make_bin_op(func.func, &exprs) + make_bin_op(func.name.as_scalar(), &exprs) } else { None }; - expr.or_else(|| make_general_expr(func.func, exprs)) + expr.or_else(|| make_general_expr(func.name.as_scalar(), exprs)) .unwrap_or_else(|| self.gen_simple_scalar(ret)) } } diff --git a/src/tests/sqlsmith/src/sql_gen/types.rs b/src/tests/sqlsmith/src/sql_gen/types.rs index 939d869744ea5..06d170e604ace 100644 --- a/src/tests/sqlsmith/src/sql_gen/types.rs +++ b/src/tests/sqlsmith/src/sql_gen/types.rs @@ -19,10 +19,9 @@ use std::sync::LazyLock; use itertools::Itertools; use risingwave_common::types::{DataType, DataTypeName}; -use risingwave_expr::agg::AggKind; -use risingwave_expr::sig::agg::{agg_func_sigs, AggFuncSig as RwAggFuncSig}; +use risingwave_expr::aggregate::AggKind; use risingwave_expr::sig::cast::{cast_sigs, CastContext, CastSig as RwCastSig}; -use risingwave_expr::sig::func::{func_sigs, FuncSign as RwFuncSig}; +use 
risingwave_expr::sig::{FuncSign, FUNCTION_REGISTRY}; use risingwave_frontend::expr::ExprType; use risingwave_sqlparser::ast::{BinaryOperator, DataType as AstDataType, StructField}; @@ -104,73 +103,16 @@ impl TryFrom for CastSig { } } -/// Provide internal `FuncSig` which can be used for `struct` and `list`. -#[derive(Clone)] -pub struct FuncSig { - pub func: ExprType, - pub inputs_type: Vec, - pub ret_type: DataType, -} - -impl TryFrom<&RwFuncSig> for FuncSig { - type Error = String; - - fn try_from(value: &RwFuncSig) -> Result { - if let Some(inputs_type) = value - .inputs_type - .iter() - .map(data_type_name_to_ast_data_type) - .collect() - && let Some(ret_type) = data_type_name_to_ast_data_type(&value.ret_type) - { - Ok(FuncSig { - inputs_type, - ret_type, - func: value.func, - }) - } else { - Err(format!("unsupported func sig: {:?}", value)) - } - } -} - -/// Provide internal `AggFuncSig` which can be used for `struct` and `list`. -#[derive(Clone)] -pub struct AggFuncSig { - pub func: AggKind, - pub inputs_type: Vec, - pub ret_type: DataType, -} - -impl TryFrom<&RwAggFuncSig> for AggFuncSig { - type Error = String; - - fn try_from(value: &RwAggFuncSig) -> Result { - if let Some(inputs_type) = value - .inputs_type - .iter() - .map(data_type_name_to_ast_data_type) - .collect() - && let Some(ret_type) = data_type_name_to_ast_data_type(&value.ret_type) - { - Ok(AggFuncSig { - inputs_type, - ret_type, - func: value.func, - }) - } else { - Err(format!("unsupported agg_func sig: {:?}", value)) - } - } -} - /// Function ban list. /// These functions should be generated eventually, by adding expression constraints. /// If we naively generate arguments for these functions, it will affect sqlsmith /// effectiveness, e.g. cause it to crash. static FUNC_BAN_LIST: LazyLock> = LazyLock::new(|| { [ - ExprType::Repeat, // FIXME: https://github.com/risingwavelabs/risingwave/issues/8003 + // FIXME: https://github.com/risingwavelabs/risingwave/issues/8003 + ExprType::Repeat, + // The format argument needs to be handled specially. It is still generated in `gen_special_func`. + ExprType::Decode, ] .into_iter() .collect() @@ -178,26 +120,38 @@ static FUNC_BAN_LIST: LazyLock> = LazyLock::new(|| { /// Table which maps functions' return types to possible function signatures. // ENABLE: https://github.com/risingwavelabs/risingwave/issues/5826 -pub(crate) static FUNC_TABLE: LazyLock>> = LazyLock::new(|| { - let mut funcs = HashMap::>::new(); - func_sigs() - .filter(|func| { - func.inputs_type - .iter() - .all(|t| *t != DataTypeName::Timestamptz) - && !FUNC_BAN_LIST.contains(&func.func) - && !func.deprecated // deprecated functions are not accepted by frontend - }) - .filter_map(|func| func.try_into().ok()) - .for_each(|func: FuncSig| funcs.entry(func.ret_type.clone()).or_default().push(func)); - funcs -}); +// TODO: Create a `SPECIAL_FUNC` table. +// Otherwise when we dump the function table, we won't include those functions in +// gen_special_func. 
+pub(crate) static FUNC_TABLE: LazyLock>> = + LazyLock::new(|| { + let mut funcs = HashMap::>::new(); + FUNCTION_REGISTRY + .iter_scalars() + .filter(|func| { + func.inputs_type.iter().all(|t| { + t.is_exact() + && t.as_exact() != &DataType::Timestamptz + && t.as_exact() != &DataType::Serial + }) && func.ret_type.is_exact() + && !FUNC_BAN_LIST.contains(&func.name.as_scalar()) + && !func.deprecated // deprecated functions are not accepted by frontend + }) + .for_each(|func| { + funcs + .entry(func.ret_type.as_exact().clone()) + .or_default() + .push(func) + }); + funcs + }); /// Set of invariant functions // ENABLE: https://github.com/risingwavelabs/risingwave/issues/5826 pub(crate) static INVARIANT_FUNC_SET: LazyLock> = LazyLock::new(|| { - func_sigs() - .map(|sig| sig.func) + FUNCTION_REGISTRY + .iter_scalars() + .map(|sig| sig.name.as_scalar()) .counts() .into_iter() .filter(|(_key, count)| *count == 1) @@ -207,14 +161,16 @@ pub(crate) static INVARIANT_FUNC_SET: LazyLock> = LazyLock::ne /// Table which maps aggregate functions' return types to possible function signatures. // ENABLE: https://github.com/risingwavelabs/risingwave/issues/5826 -pub(crate) static AGG_FUNC_TABLE: LazyLock>> = LazyLock::new( - || { - let mut funcs = HashMap::>::new(); - agg_func_sigs() +pub(crate) static AGG_FUNC_TABLE: LazyLock>> = + LazyLock::new(|| { + let mut funcs = HashMap::>::new(); + FUNCTION_REGISTRY + .iter_aggregates() .filter(|func| { func.inputs_type .iter() - .all(|t| *t != DataTypeName::Timestamptz) + .all(|t| t.is_exact() && t.as_exact() != &DataType::Timestamptz && t.as_exact() != &DataType::Serial) + && func.ret_type.is_exact() // Ignored functions && ![ AggKind::Sum0, // Used internally @@ -226,25 +182,23 @@ pub(crate) static AGG_FUNC_TABLE: LazyLock>> = AggKind::PercentileDisc, AggKind::Mode, ] - .contains(&func.func) + .contains(&func.name.as_aggregate()) // Exclude 2 phase agg global sum. // Sum(Int64) -> Int64. // Otherwise it conflicts with normal aggregation: // Sum(Int64) -> Decimal. // And sqlsmith will generate expressions with wrong types. - && if func.func == AggKind::Sum { - !(func.inputs_type[0] == DataTypeName::Int64 && func.ret_type == DataTypeName::Int64) + && if func.name.as_aggregate() == AggKind::Sum { + !(func.inputs_type[0].as_exact() == &DataType::Int64 && func.ret_type.as_exact() == &DataType::Int64) } else { true } }) - .filter_map(|func| func.try_into().ok()) - .for_each(|func: AggFuncSig| { - funcs.entry(func.ret_type.clone()).or_default().push(func) + .for_each(|func| { + funcs.entry(func.ret_type.as_exact().clone()).or_default().push(func) }); funcs - }, -); + }); /// Build a cast map from return types to viable cast-signatures. /// NOTE: We avoid cast from varchar to other datatypes apart from itself. 
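// --- Illustrative sketch (editor's addition, not part of the diff) ----------
// The hunks above drop sqlsmith's hand-rolled `FuncSig`/`AggFuncSig` wrappers
// and build the lookup tables directly from `FUNCTION_REGISTRY`. A condensed
// version of the new table construction is sketched below. The identifiers
// (`FUNCTION_REGISTRY`, `iter_scalars`, `SigDataType::is_exact`/`as_exact`,
// `deprecated`) are taken from the diff itself; the exact element type
// (`&'static FuncSign`) is an assumption for illustration.
use std::collections::HashMap;

use risingwave_common::types::DataType;
use risingwave_expr::sig::{FuncSign, FUNCTION_REGISTRY};

fn scalar_funcs_by_return_type() -> HashMap<DataType, Vec<&'static FuncSign>> {
    let mut table = HashMap::<DataType, Vec<&'static FuncSign>>::new();
    for sig in FUNCTION_REGISTRY.iter_scalars() {
        // Keep only signatures whose argument and return types are exact
        // (no type wildcards) and that are not deprecated, mirroring the
        // `is_exact()` filters in the hunks above.
        if sig.inputs_type.iter().all(|t| t.is_exact())
            && sig.ret_type.is_exact()
            && !sig.deprecated
        {
            table
                .entry(sig.ret_type.as_exact().clone())
                .or_default()
                .push(sig);
        }
    }
    table
}
// -----------------------------------------------------------------------------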
@@ -299,28 +253,24 @@ pub(crate) static BINARY_INEQUALITY_OP_TABLE: LazyLock< HashMap<(DataType, DataType), Vec>, > = LazyLock::new(|| { let mut funcs = HashMap::<(DataType, DataType), Vec>::new(); - func_sigs() + FUNCTION_REGISTRY + .iter_scalars() .filter(|func| { - !FUNC_BAN_LIST.contains(&func.func) - && func.ret_type == DataTypeName::Boolean + !FUNC_BAN_LIST.contains(&func.name.as_scalar()) + && func.ret_type == DataType::Boolean.into() && func.inputs_type.len() == 2 && func .inputs_type .iter() - .all(|t| *t != DataTypeName::Timestamptz) + .all(|t| t.is_exact() && t.as_exact() != &DataType::Timestamptz) }) .filter_map(|func| { - let Some(lhs) = data_type_name_to_ast_data_type(&func.inputs_type[0]) else { - return None; - }; - let Some(rhs) = data_type_name_to_ast_data_type(&func.inputs_type[1]) else { - return None; - }; - let args = (lhs, rhs); - let Some(op) = expr_type_to_inequality_op(func.func) else { + let lhs = func.inputs_type[0].as_exact().clone(); + let rhs = func.inputs_type[1].as_exact().clone(); + let Some(op) = expr_type_to_inequality_op(func.name.as_scalar()) else { return None; }; - Some((args, op)) + Some(((lhs, rhs), op)) }) .for_each(|(args, op)| funcs.entry(args).or_default().push(op)); funcs diff --git a/src/tests/state_cleaning_test/Cargo.toml b/src/tests/state_cleaning_test/Cargo.toml index 2116e1d58659a..d9154309f4a99 100644 --- a/src/tests/state_cleaning_test/Cargo.toml +++ b/src/tests/state_cleaning_test/Cargo.toml @@ -25,7 +25,7 @@ serde_with = "3" tokio = { version = "0.2", package = "madsim-tokio" } tokio-postgres = "0.7" tokio-stream = { version = "0.1", features = ["fs"] } -toml = "0.7" +toml = "0.8" tracing = "0.1" [target.'cfg(not(madsim))'.dependencies] diff --git a/src/udf/src/error.rs b/src/udf/src/error.rs index 98b9faa57171b..4a2ae18f61357 100644 --- a/src/udf/src/error.rs +++ b/src/udf/src/error.rs @@ -45,7 +45,7 @@ pub enum Error { ServiceError(String), } -static_assertions::const_assert_eq!(std::mem::size_of::(), 32); +static_assertions::const_assert_eq!(std::mem::size_of::(), 40); impl From for Error { fn from(status: tonic::Status) -> Self { diff --git a/src/utils/pgwire/Cargo.toml b/src/utils/pgwire/Cargo.toml index cfa82c1393de8..c6d46e356518a 100644 --- a/src/utils/pgwire/Cargo.toml +++ b/src/utils/pgwire/Cargo.toml @@ -16,7 +16,7 @@ normal = ["workspace-hack"] [dependencies] anyhow = { version = "1.0", default-features = false } -byteorder = "1.4" +byteorder = "1.5" bytes = "1" futures = { version = "0.3", default-features = false, features = ["alloc"] } itertools = "0.11" diff --git a/src/utils/runtime/Cargo.toml b/src/utils/runtime/Cargo.toml index f63f7d63d7e7b..8bd4e49d808a9 100644 --- a/src/utils/runtime/Cargo.toml +++ b/src/utils/runtime/Cargo.toml @@ -16,20 +16,15 @@ normal = ["workspace-hack"] [dependencies] await-tree = { workspace = true } -chrono = { version = "0.4", default-features = false, features = [ - "clock", - "std", -] } console = "0.15" -console-subscriber = "0.1.10" +console-subscriber = "0.2.0" either = "1" futures = { version = "0.3", default-features = false, features = ["alloc"] } hostname = "0.3" opentelemetry-otlp = { version = "0.13" } opentelemetry-semantic-conventions = "0.12" parking_lot = { version = "0.12", features = ["deadlock_detection"] } -pprof = { version = "0.12", features = ["flamegraph"] } -prometheus = { version = "0.13" } +pprof = { version = "0.13", features = ["flamegraph"] } risingwave_common = { workspace = true } risingwave_variables = { workspace = true } rlimit = "0.10" diff --git 
a/src/utils/runtime/src/logger.rs b/src/utils/runtime/src/logger.rs index 4a4b77936b800..916dd93d7a32b 100644 --- a/src/utils/runtime/src/logger.rs +++ b/src/utils/runtime/src/logger.rs @@ -28,29 +28,6 @@ use tracing_subscriber::{filter, EnvFilter}; const PGWIRE_QUERY_LOG: &str = "pgwire_query_log"; const SLOW_QUERY_LOG: &str = "risingwave_frontend_slow_query_log"; -/// Configure log targets for some `RisingWave` crates. -/// -/// Other RisingWave crates will follow the default level (`DEBUG` or `INFO` according to -/// the `debug_assertions` and `is_ci` flag). -fn configure_risingwave_targets_fmt(targets: filter::Targets) -> filter::Targets { - targets - // force a lower level for important logs - .with_target("risingwave_stream", Level::DEBUG) - .with_target("risingwave_storage", Level::DEBUG) - // force a higher level for noisy logs - .with_target("risingwave_sqlparser", Level::INFO) - .with_target("pgwire", Level::INFO) - .with_target(PGWIRE_QUERY_LOG, Level::OFF) - // force a higher level for foyer logs - .with_target("foyer", Level::WARN) - .with_target("foyer_common", Level::WARN) - .with_target("foyer_intrusive", Level::WARN) - .with_target("foyer_memory", Level::WARN) - .with_target("foyer_storage", Level::WARN) - // disable events that are too verbose - .with_target("events", Level::ERROR) -} - pub struct LoggerSettings { /// The name of the service. name: String, @@ -122,9 +99,12 @@ impl LoggerSettings { /// Overrides default level and tracing targets of the fmt layer (formatting and /// logging to `stdout` or `stderr`). /// +/// Note that only verbosity levels below or equal to `DEBUG` are effective in +/// release builds. +/// /// e.g., /// ```bash -/// RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" +/// RUST_LOG="info,risingwave_stream=debug,events=debug" /// ``` /// /// ### `RW_QUERY_LOG_PATH` @@ -159,7 +139,20 @@ pub fn init_risingwave_logger(settings: LoggerSettings) { // Default filter for logging to stdout and tracing. let default_filter = { - let mut filter = filter::Targets::new() + let mut filter = filter::Targets::new(); + + // Configure levels for some RisingWave crates. + // Other RisingWave crates like `stream` and `storage` will follow the default level. + filter = filter + .with_target("risingwave_sqlparser", Level::INFO) + .with_target("pgwire", Level::INFO) + .with_target(PGWIRE_QUERY_LOG, Level::OFF) + // debug-purposed events are disabled unless `RUST_LOG` overrides + .with_target("events", Level::OFF); + + // Configure levels for external crates. + filter = filter + .with_target("foyer", Level::WARN) .with_target("aws_sdk_ec2", Level::INFO) .with_target("aws_sdk_s3", Level::INFO) .with_target("aws_config", Level::WARN) @@ -175,10 +168,8 @@ pub fn init_risingwave_logger(settings: LoggerSettings) { .with_target("reqwest", Level::WARN) .with_target("sled", Level::INFO); - filter = configure_risingwave_targets_fmt(filter); - - // For all other crates - filter = filter.with_default(match Deployment::current() { + // For all other crates, apply default level depending on the deployment and `debug_assertions` flag. + let default_level = match deployment { Deployment::Ci => Level::INFO, _ => { if cfg!(debug_assertions) { @@ -187,22 +178,23 @@ pub fn init_risingwave_logger(settings: LoggerSettings) { Level::INFO } } - }); + }; + filter = filter.with_default(default_level); - // Overrides from settings + // Overrides from settings. 
filter = filter.with_targets(settings.targets); if let Some(default_level) = settings.default_level { filter = filter.with_default(default_level); } - // Overrides from env var + // Overrides from env var. if let Ok(rust_log) = std::env::var(EnvFilter::DEFAULT_ENV) && !rust_log.is_empty() { - let rust_log_targets: Targets = rust_log.parse().expect("failed to parse `RUST_LOG`"); - if let Some(default_level) = rust_log_targets.default_level() { - filter = filter.with_default(default_level); - } - filter = filter.with_targets(rust_log_targets) - }; + let rust_log_targets: Targets = rust_log.parse().expect("failed to parse `RUST_LOG`"); + if let Some(default_level) = rust_log_targets.default_level() { + filter = filter.with_default(default_level); + } + filter = filter.with_targets(rust_log_targets) + }; filter }; diff --git a/src/utils/workspace-config/Cargo.toml b/src/utils/workspace-config/Cargo.toml index d8b2dd800ab1b..df70a2c6d0054 100644 --- a/src/utils/workspace-config/Cargo.toml +++ b/src/utils/workspace-config/Cargo.toml @@ -25,5 +25,10 @@ zstd-sys = { version = "2", optional = true, default-features = false, features # workspace-hack = { path = "../../workspace-hack" } # Don't add workspace-hack into this crate! +# FIXME(xxchan): This is a temporary fix due to how cargo and hakari works. See related PR for more details. +# We will revisit how to handle workspace-hack and build-dependency issues later. +[build-dependencies] +openssl-sys = { version = "=0.9.92", optional = true, features = ["vendored"] } + [lints] workspace = true diff --git a/src/workspace-hack/Cargo.toml b/src/workspace-hack/Cargo.toml index 41d9a50d5b022..44b7ced021501 100644 --- a/src/workspace-hack/Cargo.toml +++ b/src/workspace-hack/Cargo.toml @@ -21,6 +21,7 @@ publish = false ahash = { version = "0.8" } allocator-api2 = { version = "0.2", default-features = false, features = ["alloc", "nightly"] } anyhow = { version = "1", features = ["backtrace"] } +async-std = { version = "1", features = ["attributes", "tokio1"] } aws-credential-types = { version = "0.55", default-features = false, features = ["hardcoded-credentials"] } aws-sdk-s3 = { version = "0.28", features = ["native-tls"] } aws-smithy-client = { version = "0.55", default-features = false, features = ["native-tls", "rustls"] } @@ -29,10 +30,10 @@ bit-vec = { version = "0.6" } bitflags = { version = "2", default-features = false, features = ["serde", "std"] } byteorder = { version = "1" } bytes = { version = "1", features = ["serde"] } -chrono = { version = "0.4", features = ["alloc", "serde"] } +chrono = { version = "0.4", features = ["serde"] } clap = { version = "4", features = ["cargo", "derive", "env"] } clap_builder = { version = "4", default-features = false, features = ["cargo", "color", "env", "help", "std", "suggestions", "usage"] } -combine = { version = "4" } +combine = { version = "4", features = ["tokio"] } crossbeam-epoch = { version = "0.9" } crossbeam-queue = { version = "0.3" } crossbeam-utils = { version = "0.8" } @@ -52,10 +53,10 @@ futures-task = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["nightly", "raw"] } hashbrown-5ef9efb8ec2df382 = { package = "hashbrown", version = "0.12", features = ["nightly", "raw"] } -heck = { version = "0.4", features = ["unicode"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["serde", "std"] } -itertools 
= { version = "0.10" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools-a6292c17cd707f01 = { package = "itertools", version = "0.11" } jni = { version = "0.21", features = ["invocation"] } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } lexical-core = { version = "0.8", features = ["format"] } @@ -66,12 +67,11 @@ lexical-write-float = { version = "0.8", default-features = false, features = [" lexical-write-integer = { version = "0.8", default-features = false, features = ["format", "std"] } libc = { version = "0.2", features = ["extra_traits"] } lock_api = { version = "0.4", features = ["arc_lock"] } -log = { version = "0.4", default-features = false, features = ["std"] } -madsim-rdkafka = { git = "https://github.com/madsim-rs/madsim.git", rev = "fedb1e3", features = ["cmake-build", "gssapi", "ssl-vendored", "zstd"] } +log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } +madsim-rdkafka = { version = "0.3", features = ["cmake-build", "gssapi", "ssl-vendored", "zstd"] } madsim-tokio = { version = "0.2", default-features = false, features = ["fs", "io-util", "macros", "net", "process", "rt", "rt-multi-thread", "signal", "sync", "time", "tracing"] } md-5 = { version = "0.10" } mio = { version = "0.8", features = ["net", "os-ext"] } -multimap = { version = "0.8" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } @@ -79,6 +79,7 @@ num-iter = { version = "0.1", default-features = false, features = ["i128", "std num-traits = { version = "0.2", features = ["i128", "libm"] } opentelemetry_api = { version = "0.20", features = ["logs", "metrics"] } opentelemetry_sdk = { version = "0.20", features = ["logs", "metrics"] } +ordered-float = { version = "3" } parking_lot = { version = "0.12", features = ["arc_lock", "deadlock_detection"] } parking_lot_core = { version = "0.9", default-features = false, features = ["deadlock_detection"] } petgraph = { version = "0.6" } @@ -86,17 +87,24 @@ phf = { version = "0.11", features = ["uncased"] } phf_shared = { version = "0.11", features = ["uncased"] } postgres-types = { version = "0.2", default-features = false, features = ["derive", "with-chrono-0_4", "with-serde_json-1"] } prometheus = { version = "0.13", features = ["process"] } -prost = { version = "0.11", features = ["no-recursion-limit"] } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12", features = ["no-recursion-limit"] } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } +prost-types = { version = "0.12" } rand = { version = "0.8", features = ["small_rng"] } rand_chacha = { version = "0.3" } rand_core = { version = "0.6", default-features = false, features = ["std"] } +redis = { version = "0.23", features = ["async-std-comp", "tokio-comp"] } regex = { version = "1" } -regex-automata = { version = "0.3", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax = { version = "0.7" } +regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } +regex-syntax = { version = "0.8" } reqwest = { version = "0.11", features = ["blocking", "json", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rust_decimal = { version = "1", features = ["db-postgres", "maths"] } +rustls = { version = "0.21" } scopeguard = { 
version = "1" } +sea-orm = { version = "0.12", features = ["runtime-tokio-native-tls", "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite"] } +sea-query = { version = "0.30", default-features = false, features = ["backend-mysql", "backend-postgres", "backend-sqlite", "derive", "hashable-value", "postgres-array", "thread-safe", "with-bigdecimal", "with-chrono", "with-json", "with-rust_decimal", "with-time", "with-uuid"] } +sea-query-binder = { version = "0.5", default-features = false, features = ["postgres-array", "runtime-tokio-native-tls", "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", "with-bigdecimal", "with-chrono", "with-json", "with-rust_decimal", "with-time", "with-uuid"] } serde = { version = "1", features = ["alloc", "derive", "rc"] } serde_json = { version = "1", features = ["alloc", "preserve_order", "raw_value"] } serde_with = { version = "3", features = ["json"] } @@ -104,10 +112,12 @@ sha1 = { version = "0.10" } sha2 = { version = "0.10" } signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } smallvec = { version = "1", default-features = false, features = ["serde", "union", "write"] } -sqlx-core = { version = "0.7", features = ["_rt-tokio", "any", "chrono", "json", "migrate", "offline"] } -sqlx-mysql = { version = "0.7", default-features = false, features = ["any", "chrono", "json", "migrate", "offline"] } -sqlx-postgres = { version = "0.7", default-features = false, features = ["any", "chrono", "json", "migrate", "offline"] } -sqlx-sqlite = { version = "0.7", default-features = false, features = ["any", "chrono", "json", "migrate", "offline"] } +sqlx = { version = "0.7", default-features = false, features = ["bigdecimal", "chrono", "json", "mysql", "postgres", "runtime-tokio-native-tls", "rust_decimal", "sqlite", "time", "uuid"] } +sqlx-core = { version = "0.7", features = ["_rt-tokio", "_tls-native-tls", "bigdecimal", "chrono", "json", "migrate", "offline", "rust_decimal", "time", "uuid"] } +sqlx-mysql = { version = "0.7", default-features = false, features = ["bigdecimal", "chrono", "json", "rust_decimal", "time", "uuid"] } +sqlx-postgres = { version = "0.7", default-features = false, features = ["bigdecimal", "chrono", "json", "rust_decimal", "time", "uuid"] } +sqlx-sqlite = { version = "0.7", default-features = false, features = ["chrono", "json", "time", "uuid"] } +strum = { version = "0.25", features = ["derive"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tinyvec = { version = "1", features = ["alloc", "grab_spare_slice", "rustc_1_55"] } @@ -117,7 +127,8 @@ tokio-stream = { git = "https://github.com/madsim-rs/tokio.git", rev = "fe39bb8e tokio-util = { version = "0.7", features = ["codec", "io"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } toml_edit = { version = "0.19", features = ["serde"] } -tonic = { version = "0.9", features = ["gzip", "tls-webpki-roots"] } +tonic-274715c4dabd11b0 = { package = "tonic", version = "0.9", features = ["gzip", "tls-webpki-roots"] } +tonic-93f6ce9d446188ac = { package = "tonic", version = "0.10" } tower = { version = "0.4", features = ["balance", "buffer", "filter", "limit", "load-shed", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } @@ -133,55 +144,36 @@ ahash = { version = "0.8" } allocator-api2 = { version = "0.2", default-features = false, features = ["alloc", "nightly"] } anyhow = { version = "1", features = 
["backtrace"] } auto_enums = { version = "0.8", features = ["futures03"] } -base64 = { version = "0.21", features = ["alloc"] } bitflags = { version = "2", default-features = false, features = ["serde", "std"] } -byteorder = { version = "1" } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } -chrono = { version = "0.4", features = ["alloc", "serde"] } -crossbeam-queue = { version = "0.3" } -crossbeam-utils = { version = "0.8" } deranged = { version = "0.3", default-features = false, features = ["serde", "std"] } -digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1", features = ["serde"] } fixedbitset = { version = "0.4" } frunk_core = { version = "0.4", default-features = false, features = ["std"] } -futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } -futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } -futures-task = { version = "0.3" } -futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["nightly", "raw"] } -heck = { version = "0.4", features = ["unicode"] } -itertools = { version = "0.10" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools-a6292c17cd707f01 = { package = "itertools", version = "0.11" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits"] } -lock_api = { version = "0.4", features = ["arc_lock"] } -log = { version = "0.4", default-features = false, features = ["std"] } -madsim-tokio = { version = "0.2", default-features = false, features = ["fs", "io-util", "macros", "net", "process", "rt", "rt-multi-thread", "signal", "sync", "time", "tracing"] } -md-5 = { version = "0.10" } -mio = { version = "0.8", features = ["net", "os-ext"] } -multimap = { version = "0.8" } +log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } -parking_lot = { version = "0.12", features = ["arc_lock", "deadlock_detection"] } -parking_lot_core = { version = "0.9", default-features = false, features = ["deadlock_detection"] } petgraph = { version = "0.6" } phf = { version = "0.11", features = ["uncased"] } phf_shared = { version = "0.11", features = ["uncased"] } proc-macro2 = { version = "1", features = ["span-locations"] } -prost = { version = "0.11", features = ["no-recursion-limit"] } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12", features = ["no-recursion-limit"] } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } +prost-types = { version = "0.12" } rand = { version = "0.8", features = ["small_rng"] } rand_chacha = { version = "0.3" } rand_core = { version = "0.6", default-features = false, features = ["std"] } regex = { version = "1" } -regex-automata = { version = "0.3", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax = { version = "0.7" } -scopeguard = { version = "1" } +regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", 
"nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } +regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive", "rc"] } serde_json = { version = "1", features = ["alloc", "preserve_order", "raw_value"] } sha1 = { version = "0.10" } @@ -197,16 +189,7 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-trai syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } -tinyvec = { version = "1", features = ["alloc", "grab_spare_slice", "rustc_1_55"] } -tokio = { version = "1", features = ["full", "stats", "tracing"] } -tokio-stream = { git = "https://github.com/madsim-rs/tokio.git", rev = "fe39bb8e", features = ["fs", "net"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } toml_edit = { version = "0.19", features = ["serde"] } -tracing = { version = "0.1", features = ["log"] } -tracing-core = { version = "0.1" } -unicode-bidi = { version = "0.3" } -unicode-normalization = { version = "0.1" } -url = { version = "2", features = ["serde"] } -whoami = { version = "1" } ### END HAKARI SECTION