From 7bcb908039547d7b762d73b7d63f3d02e9b00a7b Mon Sep 17 00:00:00 2001 From: xxchan Date: Mon, 16 Dec 2024 11:12:52 +0800 Subject: [PATCH 1/8] fix: alter shared source fresh schema will make it non-shared (#19802) Signed-off-by: xxchan --- .../kafka/alter/add_column_shared.slt | 35 +++--- .../kafka/protobuf/alter_source_shared.slt | 10 ++ src/connector/src/with_options.rs | 6 ++ .../src/handler/alter_source_with_sr.rs | 15 +-- src/frontend/src/handler/create_source.rs | 101 +++++++++++++----- src/frontend/src/handler/create_table.rs | 17 +-- 6 files changed, 132 insertions(+), 52 deletions(-) diff --git a/e2e_test/source_inline/kafka/alter/add_column_shared.slt b/e2e_test/source_inline/kafka/alter/add_column_shared.slt index 45454df818afb..bbb03c178fa2f 100644 --- a/e2e_test/source_inline/kafka/alter/add_column_shared.slt +++ b/e2e_test/source_inline/kafka/alter/add_column_shared.slt @@ -50,6 +50,16 @@ alter source s add column v3 varchar; # New MV will have v3. +# Check it should still be shared source +query +explain create materialized view mv_after_alter as select * from s; +---- +StreamMaterialize { columns: [v1, v2, v3, _row_id(hidden)], stream_key: [_row_id], pk_columns: [_row_id], pk_conflict: NoCheck } +└─StreamProject { exprs: [v1, v2, v3, _row_id] } + └─StreamRowIdGen { row_id_index: 5 } + └─StreamSourceScan { columns: [v1, v2, _rw_kafka_timestamp, _rw_kafka_partition, _rw_kafka_offset, _row_id, v3] } + + statement ok create materialized view mv_after_alter as select * from s; @@ -106,16 +116,6 @@ select * from mv_after_alter; 7 g g1 8 h h1 -query error -select * from mv_after_alter_2; ----- -db error: ERROR: Failed to run the query - -Caused by these errors (recent errors listed first): - 1: Catalog error - 2: table or source not found: mv_after_alter_2 - - # Batch select from source will have v3. @@ -146,6 +146,19 @@ select * from mv_before_alter; 8 h +query ?? rowsort +select * from mv_before_alter; +---- +1 a +2 b +3 c +4 d +5 e +6 f +7 g +8 h + + statement ok drop source s cascade; @@ -195,5 +208,3 @@ drop source s cascade; system ok rpk topic delete shared_source_alter; - -# TODO: test alter source with schema registry diff --git a/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt b/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt index e4edec6d535dc..5301eda7679b1 100644 --- a/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt +++ b/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt @@ -54,6 +54,16 @@ sleep 5s statement ok ALTER SOURCE src_user REFRESH SCHEMA; +# Check it should still be shared source +query +EXPLAIN CREATE MATERIALIZED VIEW mv_user_more AS SELECT * FROM src_user; +---- +StreamMaterialize { columns: [id, name, address, city, gender, sc, _rw_kafka_timestamp, age, _row_id(hidden)], stream_key: [_row_id], pk_columns: [_row_id], pk_conflict: NoCheck } +└─StreamProject { exprs: [id, name, address, city, gender, sc, _rw_kafka_timestamp, age, _row_id] } + └─StreamRowIdGen { row_id_index: 9 } + └─StreamSourceScan { columns: [id, name, address, city, gender, sc, _rw_kafka_timestamp, _rw_kafka_partition, _rw_kafka_offset, _row_id, age] } + + statement ok CREATE MATERIALIZED VIEW mv_user_more AS SELECT * FROM src_user; diff --git a/src/connector/src/with_options.rs b/src/connector/src/with_options.rs index 6c857de148fad..59310e5a15509 100644 --- a/src/connector/src/with_options.rs +++ b/src/connector/src/with_options.rs @@ -230,3 +230,9 @@ impl TryFrom<&WithOptionsSecResolved> for Option { } } } + +impl Get for WithOptionsSecResolved { + fn get(&self, key: &str) -> Option<&String> { + self.inner.get(key) + } +} diff --git a/src/frontend/src/handler/alter_source_with_sr.rs b/src/frontend/src/handler/alter_source_with_sr.rs index d4cec17b8b460..41dcc43b0f8f6 100644 --- a/src/frontend/src/handler/alter_source_with_sr.rs +++ b/src/frontend/src/handler/alter_source_with_sr.rs @@ -31,15 +31,14 @@ use risingwave_sqlparser::ast::{ use risingwave_sqlparser::parser::Parser; use super::alter_table_column::schema_has_schema_registry; -use super::create_source::{ - bind_columns_from_source, generate_stream_graph_for_source, validate_compatibility, -}; +use super::create_source::{generate_stream_graph_for_source, validate_compatibility}; use super::util::SourceSchemaCompatExt; use super::{HandlerArgs, RwPgResponse}; use crate::catalog::root_catalog::SchemaPath; use crate::catalog::source_catalog::SourceCatalog; use crate::catalog::{DatabaseId, SchemaId}; use crate::error::{ErrorCode, Result}; +use crate::handler::create_source::{bind_columns_from_source, CreateSourceType}; use crate::session::SessionImpl; use crate::utils::resolve_secret_ref_in_with_options; use crate::{Binder, WithOptions}; @@ -164,8 +163,13 @@ pub async fn refresh_sr_and_get_columns_diff( bail_not_implemented!("altering a cdc source is not supported"); } - let (Some(columns_from_resolve_source), source_info) = - bind_columns_from_source(session, format_encode, Either::Right(&with_properties)).await? + let (Some(columns_from_resolve_source), source_info) = bind_columns_from_source( + session, + format_encode, + Either::Right(&with_properties), + CreateSourceType::from_with_properties(session, &with_properties), + ) + .await? else { // Source without schema registry is rejected. unreachable!("source without schema registry is rejected") @@ -277,7 +281,6 @@ pub async fn handle_alter_source_with_sr( source.version += 1; let pb_source = source.to_prost(schema_id, database_id); - let catalog_writer = session.catalog_writer()?; if source.info.is_shared() { let graph = generate_stream_graph_for_source(handler_args, source.clone())?; diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index 09aebe2be26f0..a6e9ca9b1d93d 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -302,12 +302,71 @@ fn get_name_strategy_or_default(name_strategy: Option) -> Result Self { + if with_properties.is_shareable_cdc_connector() { + CreateSourceType::SharedCdc + } else if with_properties.is_shareable_non_cdc_connector() + && session + .env() + .streaming_config() + .developer + .enable_shared_source + && session.config().streaming_use_shared_source() + { + CreateSourceType::SharedNonCdc + } else { + CreateSourceType::NonShared + } + } + + pub fn is_shared(&self) -> bool { + matches!( + self, + CreateSourceType::SharedCdc | CreateSourceType::SharedNonCdc + ) + } +} + /// Resolves the schema of the source from external schema file. /// See for more information. /// /// Note: the returned schema strictly corresponds to the schema. /// Other special columns like additional columns (`INCLUDE`), and `row_id` column are not included. -pub(crate) async fn bind_columns_from_source( +pub async fn bind_columns_from_source( + session: &SessionImpl, + format_encode: &FormatEncodeOptions, + with_properties: Either<&WithOptions, &WithOptionsSecResolved>, + create_source_type: CreateSourceType, +) -> Result<(Option>, StreamSourceInfo)> { + let (columns_from_resolve_source, mut source_info) = + if create_source_type == CreateSourceType::SharedCdc { + bind_columns_from_source_for_cdc(session, format_encode)? + } else { + bind_columns_from_source_for_non_cdc(session, format_encode, with_properties).await? + }; + if create_source_type.is_shared() { + // Note: this field should be called is_shared. Check field doc for more details. + source_info.cdc_source_job = true; + source_info.is_distributed = create_source_type == CreateSourceType::SharedNonCdc; + } + Ok((columns_from_resolve_source, source_info)) +} + +async fn bind_columns_from_source_for_non_cdc( session: &SessionImpl, format_encode: &FormatEncodeOptions, with_properties: Either<&WithOptions, &WithOptionsSecResolved>, @@ -1542,9 +1601,7 @@ pub async fn bind_create_source_or_table_with_connector( source_info: StreamSourceInfo, include_column_options: IncludeOption, col_id_gen: &mut ColumnIdGenerator, - // `true` for "create source", `false` for "create table with connector" - is_create_source: bool, - is_shared_non_cdc: bool, + create_source_type: CreateSourceType, source_rate_limit: Option, ) -> Result<(SourceCatalog, DatabaseId, SchemaId)> { let session = &handler_args.session; @@ -1553,6 +1610,7 @@ pub async fn bind_create_source_or_table_with_connector( let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name.clone())?; + let is_create_source = create_source_type != CreateSourceType::Table; if !is_create_source && with_properties.is_iceberg_connector() { return Err(ErrorCode::BindError( "can't CREATE TABLE with iceberg connector\n\nHint: use CREATE SOURCE instead" @@ -1609,7 +1667,7 @@ pub async fn bind_create_source_or_table_with_connector( // For shared sources, we will include partition and offset cols in the SourceExecutor's *output*, to be used by the SourceBackfillExecutor. // For shared CDC source, the schema is different. See debezium_cdc_source_schema, CDC_BACKFILL_TABLE_ADDITIONAL_COLUMNS - if is_shared_non_cdc { + if create_source_type == CreateSourceType::SharedNonCdc { let (columns_exist, additional_columns) = source_add_partition_offset_cols( &columns, &with_properties.get_connector().unwrap(), @@ -1748,26 +1806,14 @@ pub async fn handle_create_source( let format_encode = stmt.format_encode.into_v2_with_warning(); let with_properties = bind_connector_props(&handler_args, &format_encode, true)?; - let create_cdc_source_job = with_properties.is_shareable_cdc_connector(); - let is_shared_non_cdc = with_properties.is_shareable_non_cdc_connector() - && session - .env() - .streaming_config() - .developer - .enable_shared_source - && session.config().streaming_use_shared_source(); - let is_shared = create_cdc_source_job || is_shared_non_cdc; - - let (columns_from_resolve_source, mut source_info) = if create_cdc_source_job { - bind_columns_from_source_for_cdc(&session, &format_encode)? - } else { - bind_columns_from_source(&session, &format_encode, Either::Left(&with_properties)).await? - }; - if is_shared { - // Note: this field should be called is_shared. Check field doc for more details. - source_info.cdc_source_job = true; - source_info.is_distributed = !create_cdc_source_job; - } + let create_source_type = CreateSourceType::from_with_properties(&session, &*with_properties); + let (columns_from_resolve_source, source_info) = bind_columns_from_source( + &session, + &format_encode, + Either::Left(&with_properties), + create_source_type, + ) + .await?; let mut col_id_gen = ColumnIdGenerator::new_initial(); let (source_catalog, database_id, schema_id) = bind_create_source_or_table_with_connector( @@ -1783,8 +1829,7 @@ pub async fn handle_create_source( source_info, stmt.include_column_options, &mut col_id_gen, - true, - is_shared_non_cdc, + create_source_type, overwrite_options.source_rate_limit, ) .await?; @@ -1802,7 +1847,7 @@ pub async fn handle_create_source( let catalog_writer = session.catalog_writer()?; - if is_shared { + if create_source_type.is_shared() { let graph = generate_stream_graph_for_source(handler_args, source_catalog)?; catalog_writer.create_source(source, Some(graph)).await?; } else { diff --git a/src/frontend/src/handler/create_table.rs b/src/frontend/src/handler/create_table.rs index 1520374d7e503..1fa960da971d9 100644 --- a/src/frontend/src/handler/create_table.rs +++ b/src/frontend/src/handler/create_table.rs @@ -59,6 +59,7 @@ use risingwave_sqlparser::ast::{ use risingwave_sqlparser::parser::{IncludeOption, Parser}; use thiserror_ext::AsReport; +use super::create_source::{bind_columns_from_source, CreateSourceType}; use super::{create_sink, create_source, RwPgResponse}; use crate::binder::{bind_data_type, bind_struct_field, Clause, SecureCompareContext}; use crate::catalog::root_catalog::SchemaPath; @@ -68,8 +69,8 @@ use crate::catalog::{check_valid_column_name, ColumnId, DatabaseId, SchemaId}; use crate::error::{ErrorCode, Result, RwError}; use crate::expr::{Expr, ExprImpl, ExprRewriter}; use crate::handler::create_source::{ - bind_columns_from_source, bind_connector_props, bind_create_source_or_table_with_connector, - bind_source_watermark, handle_addition_columns, UPSTREAM_SOURCE_KEY, + bind_connector_props, bind_create_source_or_table_with_connector, bind_source_watermark, + handle_addition_columns, UPSTREAM_SOURCE_KEY, }; use crate::handler::HandlerArgs; use crate::optimizer::plan_node::generic::{CdcScanOptions, SourceNodeKind}; @@ -497,8 +498,13 @@ pub(crate) async fn gen_create_table_plan_with_source( let session = &handler_args.session; let with_properties = bind_connector_props(&handler_args, &format_encode, false)?; - let (columns_from_resolve_source, source_info) = - bind_columns_from_source(session, &format_encode, Either::Left(&with_properties)).await?; + let (columns_from_resolve_source, source_info) = bind_columns_from_source( + session, + &format_encode, + Either::Left(&with_properties), + CreateSourceType::Table, + ) + .await?; let overwrite_options = OverwriteOptions::new(&mut handler_args); let rate_limit = overwrite_options.source_rate_limit; @@ -515,8 +521,7 @@ pub(crate) async fn gen_create_table_plan_with_source( source_info, include_column_options, &mut col_id_gen, - false, - false, + CreateSourceType::Table, rate_limit, ) .await?; From 6759a081c8e789f30ce24747218320a06356d9a8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 03:25:45 +0000 Subject: [PATCH 2/8] chore(deps): Bump sysinfo from 0.32.0 to 0.33.0 (#19801) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- src/common/Cargo.toml | 2 +- src/utils/resource_util/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e6e8ec35c5dc6..69cb010feae83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2936,9 +2936,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core2" @@ -14251,9 +14251,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" [[package]] name = "sysinfo" -version = "0.32.0" +version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b5ae3f4f7d64646c46c4cae4e3f01d1c5d255c7406fdd7c7f999a94e488791" +checksum = "948512566b1895f93b1592c7574baeb2de842f224f2aab158799ecadb8ebbb46" dependencies = [ "core-foundation-sys", "libc", diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index 4b002bd8084c6..0aa9498a7acc2 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -102,7 +102,7 @@ stacker = "0.1" static_assertions = "1" strum = "0.26" strum_macros = "0.26" -sysinfo = { version = "0.32", default-features = false, features = ["system"] } +sysinfo = { version = "0.33", default-features = false, features = ["system"] } thiserror = "1" thiserror-ext = { workspace = true } tinyvec = { version = "1", features = ["rustc_1_55", "grab_spare_slice"] } diff --git a/src/utils/resource_util/Cargo.toml b/src/utils/resource_util/Cargo.toml index 4f41d0274f93e..9680c5054bc31 100644 --- a/src/utils/resource_util/Cargo.toml +++ b/src/utils/resource_util/Cargo.toml @@ -13,7 +13,7 @@ normal = ["workspace-hack"] [dependencies] fs-err = "3" -sysinfo = { version = "0.32", default-features = false, features = ["system"] } +sysinfo = { version = "0.33", default-features = false, features = ["system"] } thiserror-ext = { workspace = true } tracing = "0.1" From 6b12555285f77aa4a4862b5c437ee5afd1abb4a6 Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Mon, 16 Dec 2024 11:28:35 +0800 Subject: [PATCH 3/8] feat(ci): more meta backend agnostic e2e tests in `main-cron` (#19595) Signed-off-by: Bugen Zhao --- ci/docker-compose.yml | 3 +- ci/workflows/main-cron.yml | 243 +++++++++--------- ci/workflows/pull-request.yml | 15 +- risedev.yml | 48 ++-- src/risedevtool/src/task/meta_node_service.rs | 22 +- 5 files changed, 161 insertions(+), 170 deletions(-) diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml index a9cb770e9d024..dda6de44382d1 100644 --- a/ci/docker-compose.yml +++ b/ci/docker-compose.yml @@ -110,7 +110,8 @@ services: volumes: - ..:/risingwave - pg-mysql-backend-test-env: + # Standard environment for CI, including MySQL and Postgres for metadata. + ci-standard-env: image: public.ecr.aws/w1p7b4n3/rw-build-env:v20240911 depends_on: - mysql diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index 5a60e3f42d4f0..d046c9db321bc 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -22,10 +22,11 @@ sql-backend: &sql-backend backend: "postgres" # PGPASSWORD=postgres psql -h db -p 5432 -U postgres -d rwmeta endpoint: "postgres://postgres:postgres@db:5432/rwmeta" - - with: - backend: "mysql" - # mysql -h mysql -P 3306 -u root -p123456 -D rwmeta - endpoint: "mysql://root:123456@mysql:3306/rwmeta" + # Temporarily disable tests for mysql backend as there are unresolved issues. + # - with: + # backend: "mysql" + # # mysql -h mysql-meta -P 3306 -u root -p123456 -D rwmeta + # endpoint: "mysql://root:123456@mysql-meta:3306/rwmeta" env: RISEDEV_SQL_ENDPOINT: "{{matrix.endpoint}}" @@ -33,7 +34,7 @@ docker-compose-common: &docker-compose-common config: ci/docker-compose.yml mount-buildkite-agent: true propagate-environment: true - run: rw-build-env + run: ci-standard-env steps: - label: "build" @@ -93,89 +94,65 @@ steps: timeout_in_minutes: 10 retry: *auto-retry - - label: "end-to-end test (release, {{matrix.backend}} backend)" - key: "e2e-test-release" - <<: *sql-backend - command: "ci/scripts/cron-e2e-test.sh -p ci-release -m ci-3streaming-2serving-3fe" - if: | - !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-e2e-test" - || build.env("CI_STEPS") =~ /(^|,)e2e-tests?(,|$$)/ - depends_on: - - "build" - - "build-other" - - "docslt" - plugins: - - docker-compose#v5.5.0: - <<: *docker-compose-common - run: pg-mysql-backend-test-env - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 30 - retry: *auto-retry - - - label: "slow end-to-end test (release)" - key: "slow-e2e-test-release" - command: "ci/scripts/slow-e2e-test.sh -p ci-release -m ci-3streaming-2serving-3fe" - if: | - !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-slow-e2e-tests" - || build.env("CI_STEPS") =~ /(^|,)slow-e2e-tests?(,|$$)/ - depends_on: - - "build" - - "build-other" - plugins: - - docker-compose#v5.5.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 8 - retry: *auto-retry - - - label: "meta backup test (release)" - key: "e2e-meta-backup-test-release" - command: "ci/scripts/run-meta-backup-test.sh -p ci-release -m ci-3streaming-2serving-3fe" - if: | - !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-e2e-meta-backup-test" - || build.env("CI_STEPS") =~ /(^|,)e2e-tests?(,|$$)/ - depends_on: - - "build" - - "build-other" - - "docslt" - plugins: - - docker-compose#v5.5.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 45 - retry: *auto-retry - - - label: "end-to-end test (parallel) (release)" - key: "e2e-test-release-parallel" - command: "ci/scripts/e2e-test-parallel.sh -p ci-release" - if: | - !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-e2e-parallel-tests" - || build.env("CI_STEPS") =~ /(^|,)e2e-parallel-tests?(,|$$)/ - depends_on: - - "build" - - "docslt" - plugins: - - seek-oss/aws-sm#v2.3.2: - env: - BUILDKITE_ANALYTICS_TOKEN: buildkite-build-analytics-sqllogictest-token - - docker-compose#v5.5.0: - run: rw-build-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - test-collector#v1.0.0: - files: "*-junit.xml" - format: "junit" - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 13 - retry: *auto-retry + - group: "end-to-end test (release)" + steps: + - label: "end-to-end test ({{matrix.backend}} backend)" + key: "e2e-test-release" + <<: *sql-backend + command: "ci/scripts/cron-e2e-test.sh -p ci-release -m ci-3streaming-2serving-3fe" + if: | + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-e2e-test" + || build.env("CI_STEPS") =~ /(^|,)e2e-tests?(,|$$)/ + depends_on: + - "build" + - "build-other" + - "docslt" + plugins: + - docker-compose#v5.5.0: *docker-compose-common + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 30 + retry: *auto-retry + + - label: "slow end-to-end test ({{matrix.backend}} backend)" + key: "slow-e2e-test-release" + <<: *sql-backend + command: "ci/scripts/slow-e2e-test.sh -p ci-release -m ci-3streaming-2serving-3fe" + if: | + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-slow-e2e-tests" + || build.env("CI_STEPS") =~ /(^|,)slow-e2e-tests?(,|$$)/ + depends_on: + - "build" + - "build-other" + plugins: + - docker-compose#v5.5.0: *docker-compose-common + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 8 + retry: *auto-retry + + - label: "end-to-end test (parallel, {{matrix.backend}} backend)" + key: "e2e-test-release-parallel" + <<: *sql-backend + command: "ci/scripts/e2e-test-parallel.sh -p ci-release" + if: | + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-e2e-parallel-tests" + || build.env("CI_STEPS") =~ /(^|,)e2e-parallel-tests?(,|$$)/ + depends_on: + - "build" + - "docslt" + plugins: + - seek-oss/aws-sm#v2.3.2: + env: + BUILDKITE_ANALYTICS_TOKEN: buildkite-build-analytics-sqllogictest-token + - docker-compose#v5.5.0: *docker-compose-common + - test-collector#v1.0.0: + files: "*-junit.xml" + format: "junit" + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 13 + retry: *auto-retry - label: "end-to-end test (parallel, in-memory) (release)" key: "e2e-test-release-parallel-memory" @@ -196,62 +173,84 @@ steps: timeout_in_minutes: 12 retry: *auto-retry - - label: "end-to-end source test (release)" - key: "e2e-test-release-source" - command: "ci/scripts/e2e-source-test.sh -p ci-release" - if: | - !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-e2e-source-tests" - || build.env("CI_STEPS") =~ /(^|,)e2e-source-tests?(,|$$)/ - depends_on: - - "build" - - "build-other" - plugins: - - docker-compose#v5.5.0: - run: source-test-env - config: ci/docker-compose.yml - mount-buildkite-agent: true - - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 15 - retry: *auto-retry + - group: "end-to-end connector test (release)" + steps: + - label: "end-to-end source test ({{matrix.backend}} backend)" + key: "e2e-test-release-source" + <<: *sql-backend + command: "ci/scripts/e2e-source-test.sh -p ci-release" + if: | + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-e2e-source-tests" + || build.env("CI_STEPS") =~ /(^|,)e2e-source-tests?(,|$$)/ + depends_on: + - "build" + - "build-other" + plugins: + - docker-compose#v5.5.0: + <<: *docker-compose-common + run: source-test-env + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 15 + retry: *auto-retry + + - label: "end-to-end sink test ({{matrix.backend}} backend)" + key: "e2e-test-release-sink" + <<: *sql-backend + command: "ci/scripts/e2e-sink-test.sh -p ci-release" + if: | + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + || build.pull_request.labels includes "ci/run-e2e-sink-tests" + || build.env("CI_STEPS") =~ /(^|,)e2e-sink-tests?(,|$$)/ + depends_on: + - "build" + - "build-other" + plugins: + - docker-compose#v5.5.0: + <<: *docker-compose-common + run: sink-test-env + - ./ci/plugins/upload-failure-logs + timeout_in_minutes: 35 + retry: *auto-retry - - label: "end-to-end sink test (release)" - key: "e2e-test-release-sink" - command: "ci/scripts/e2e-sink-test.sh -p ci-release" + - label: "fuzz test" + key: "fuzz-test" + command: "ci/scripts/cron-fuzz-test.sh -p ci-release" if: | !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-e2e-sink-tests" - || build.env("CI_STEPS") =~ /(^|,)e2e-sink-tests?(,|$$)/ + || build.pull_request.labels includes "ci/run-sqlsmith-fuzzing-tests" + || build.env("CI_STEPS") =~ /(^|,)sqlsmith-fuzzing-tests?(,|$$)/ depends_on: - "build" - - "build-other" + - "build-simulation" plugins: + - ./ci/plugins/swapfile - docker-compose#v5.5.0: - run: sink-test-env + run: rw-build-env config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 35 + timeout_in_minutes: 20 retry: *auto-retry - - label: "fuzz test" - key: "fuzz-test" - command: "ci/scripts/cron-fuzz-test.sh -p ci-release" + - label: "meta backup test (release)" + key: "e2e-meta-backup-test-release" + command: "ci/scripts/run-meta-backup-test.sh -p ci-release -m ci-3streaming-2serving-3fe" if: | !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null - || build.pull_request.labels includes "ci/run-sqlsmith-fuzzing-tests" - || build.env("CI_STEPS") =~ /(^|,)sqlsmith-fuzzing-tests?(,|$$)/ + || build.pull_request.labels includes "ci/run-e2e-meta-backup-test" + || build.env("CI_STEPS") =~ /(^|,)e2e-tests?(,|$$)/ depends_on: - "build" - - "build-simulation" + - "build-other" + - "docslt" plugins: - - ./ci/plugins/swapfile - docker-compose#v5.5.0: run: rw-build-env config: ci/docker-compose.yml mount-buildkite-agent: true - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 20 + timeout_in_minutes: 45 retry: *auto-retry # The timeout should be strictly more than timeout in `pull-request.yml`. diff --git a/ci/workflows/pull-request.yml b/ci/workflows/pull-request.yml index 9ffc14228add1..65ebde255de55 100644 --- a/ci/workflows/pull-request.yml +++ b/ci/workflows/pull-request.yml @@ -33,10 +33,11 @@ other-sql-backend: &other-sql-backend label: "postgres" # PGPASSWORD=postgres psql -h db -p 5432 -U postgres -d rwmeta endpoint: "postgres://postgres:postgres@db:5432/rwmeta" - - with: - label: "mysql" - # mysql -h mysql -P 3306 -u root -p123456 -D rwmeta - endpoint: "mysql://root:123456@mysql:3306/rwmeta" + # Temporarily disable tests for mysql backend as there are unresolved issues. + # - with: + # label: "mysql" + # # mysql -h mysql-meta -P 3306 -u root -p123456 -D rwmeta + # endpoint: "mysql://root:123456@mysql-meta:3306/rwmeta" env: RISEDEV_SQL_ENDPOINT: "{{matrix.endpoint}}" @@ -44,7 +45,7 @@ docker-compose-common: &docker-compose-common config: ci/docker-compose.yml mount-buildkite-agent: true propagate-environment: true - run: rw-build-env + run: ci-standard-env steps: - label: "check ci image rebuild" @@ -840,9 +841,7 @@ steps: - "build-other" - "docslt" plugins: - - docker-compose#v5.5.0: - <<: *docker-compose-common - run: pg-mysql-backend-test-env + - docker-compose#v5.5.0: *docker-compose-common - ./ci/plugins/upload-failure-logs timeout_in_minutes: 32 retry: *auto-retry diff --git a/risedev.yml b/risedev.yml index b8fa94c1faaa6..1896cc111021b 100644 --- a/risedev.yml +++ b/risedev.yml @@ -572,9 +572,8 @@ profile: config-path: src/config/ci.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true - use: frontend @@ -596,9 +595,8 @@ profile: config-path: src/config/ci.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -618,9 +616,8 @@ profile: config-path: src/config/ci-longer-streaming-upload-timeout.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -640,9 +637,8 @@ profile: config-path: src/config/ci-longer-streaming-upload-timeout.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -666,9 +662,8 @@ profile: - use: minio api-requests-max: 30 api-requests-deadline: 3s - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -690,9 +685,8 @@ profile: - use: minio api-requests-max: 30 api-requests-deadline: 2s - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -714,9 +708,8 @@ profile: config-path: src/config/ci.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -773,9 +766,8 @@ profile: ci-3cn-3fe-opendal-fs-backend: config-path: src/config/ci.toml steps: - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: opendal engine: fs bucket: "/tmp/rw_ci" @@ -856,9 +848,8 @@ profile: config-path: src/config/ci.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true - use: frontend @@ -895,9 +886,8 @@ profile: config-path: src/config/ci-recovery.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true - use: frontend @@ -929,9 +919,8 @@ profile: config-path: src/config/ci.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true - use: frontend @@ -942,9 +931,8 @@ profile: config-path: src/config/ci-compaction-test.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true total-memory-bytes: 17179869184 @@ -955,9 +943,8 @@ profile: config-path: src/config/ci-recovery.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true - use: frontend @@ -967,9 +954,8 @@ profile: config-path: src/config/ci-recovery.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node port: 5687 exporter-port: 1222 @@ -989,9 +975,8 @@ profile: config-path: src/config/ci-recovery.toml steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node enable-tiered-cache: true - use: frontend @@ -1051,9 +1036,8 @@ profile: config-path: "src/config/ci-backfill.toml" steps: - use: minio - - use: sqlite - use: meta-node - meta-backend: sqlite + meta-backend: env - use: compute-node - use: frontend - use: compactor diff --git a/src/risedevtool/src/task/meta_node_service.rs b/src/risedevtool/src/task/meta_node_service.rs index 721f33573fa7d..60e681007fbda 100644 --- a/src/risedevtool/src/task/meta_node_service.rs +++ b/src/risedevtool/src/task/meta_node_service.rs @@ -20,7 +20,6 @@ use std::sync::LazyLock; use anyhow::{anyhow, bail, Context, Result}; use itertools::Itertools; use sqlx::{ConnectOptions, Database}; -use tempfile::NamedTempFile; use url::Url; use super::{risingwave_cmd, ExecuteContext, Task}; @@ -48,13 +47,20 @@ fn sql_endpoint_from_env() -> String { ); endpoint } else { - let temp_path = NamedTempFile::with_suffix(".db").unwrap().into_temp_path(); - let temp_sqlite_endpoint = format!("sqlite://{}?mode=rwc", temp_path.to_string_lossy()); + // `meta-backend: env` is specified, but env var is not set. + // Act as if `meta-backend: sqlite` is specified. + // Not using a temporary file because we want to persist the data across restarts. + let prefix_data = env::var("PREFIX_DATA").unwrap(); + let dir = PathBuf::from(&prefix_data).join("meta-backend-env-fallback-sqlite"); + fs_err::create_dir_all(&dir).unwrap(); + + let path = dir.join("metadata.db"); + let sqlite_endpoint = format!("sqlite://{}?mode=rwc", path.to_string_lossy()); tracing::warn!( - "env RISEDEV_SQL_ENDPOINT not set, use temporary sqlite `{}`", - temp_sqlite_endpoint + "env RISEDEV_SQL_ENDPOINT not set, use fallback sqlite `{}`", + sqlite_endpoint ); - temp_sqlite_endpoint + sqlite_endpoint } }); @@ -365,7 +371,9 @@ fn initialize_meta_store() -> Result<(), anyhow::Error> { // SQLite in-memory database does not need initialization. } else { let filename = options.get_filename(); - fs_err::write(filename, b"").context("failed to empty SQLite file")?; + if std::fs::exists(filename)? { + fs_err::write(filename, b"").context("failed to empty SQLite file")?; + } } return Ok(()); From 1e24ca482c29c08ba3aeb31ee44f672e3eaaef05 Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:06:42 +0800 Subject: [PATCH 4/8] fix(meta): fix order when restoring metadata (#19791) --- src/meta/src/backup_restore/restore_impl/v2.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/meta/src/backup_restore/restore_impl/v2.rs b/src/meta/src/backup_restore/restore_impl/v2.rs index fc1009f519dd3..1050c1d7ecab4 100644 --- a/src/meta/src/backup_restore/restore_impl/v2.rs +++ b/src/meta/src/backup_restore/restore_impl/v2.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::cmp; + use itertools::Itertools; use risingwave_backup::error::{BackupError, BackupResult}; use risingwave_backup::meta_snapshot::MetaSnapshot; @@ -108,8 +110,21 @@ impl Writer for WriterModelV2ToMetaStoreV2 { insert_models(metadata.worker_properties.clone(), db).await?; insert_models(metadata.users.clone(), db).await?; // The sort is required to pass table's foreign key check. + use risingwave_meta_model::object::ObjectType; insert_models( - metadata.objects.iter().sorted_by_key(|o| o.oid).cloned(), + metadata + .objects + .iter() + .sorted_by(|a, b| match (a.obj_type, b.obj_type) { + (ObjectType::Database, ObjectType::Database) => a.oid.cmp(&b.oid), + (ObjectType::Database, _) => cmp::Ordering::Less, + (_, ObjectType::Database) => cmp::Ordering::Greater, + (ObjectType::Schema, ObjectType::Schema) => a.oid.cmp(&b.oid), + (ObjectType::Schema, _) => cmp::Ordering::Less, + (_, ObjectType::Schema) => cmp::Ordering::Greater, + (_, _) => a.oid.cmp(&b.oid), + }) + .cloned(), db, ) .await?; From 45fcf8f33c3a5eb71dcf31c7f33d1ee80f7a832d Mon Sep 17 00:00:00 2001 From: xxchan Date: Mon, 16 Dec 2024 12:35:38 +0800 Subject: [PATCH 5/8] refactor(frontend): split create_source into smaller mods (#19803) Signed-off-by: xxchan --- .../src/parser/debezium/debezium_parser.rs | 25 +- .../src/handler/alter_source_with_sr.rs | 5 +- .../src/handler/alter_table_column.rs | 19 +- .../src/handler/alter_table_with_sr.rs | 3 +- src/frontend/src/handler/create_source.rs | 1053 +---------------- .../create_source/additional_column.rs | 130 ++ .../handler/create_source/external_schema.rs | 441 +++++++ .../create_source/external_schema/avro.rs | 52 + .../create_source/external_schema/debezium.rs | 24 + .../create_source/external_schema/iceberg.rs | 101 ++ .../create_source/external_schema/json.rs | 70 ++ .../create_source/external_schema/nexmark.rs | 69 ++ .../create_source/external_schema/protobuf.rs | 48 + .../src/handler/create_source/validate.rs | 246 ++++ .../plan_node/stream_cdc_table_scan.rs | 2 +- 15 files changed, 1229 insertions(+), 1059 deletions(-) create mode 100644 src/frontend/src/handler/create_source/additional_column.rs create mode 100644 src/frontend/src/handler/create_source/external_schema.rs create mode 100644 src/frontend/src/handler/create_source/external_schema/avro.rs create mode 100644 src/frontend/src/handler/create_source/external_schema/debezium.rs create mode 100644 src/frontend/src/handler/create_source/external_schema/iceberg.rs create mode 100644 src/frontend/src/handler/create_source/external_schema/json.rs create mode 100644 src/frontend/src/handler/create_source/external_schema/nexmark.rs create mode 100644 src/frontend/src/handler/create_source/external_schema/protobuf.rs create mode 100644 src/frontend/src/handler/create_source/validate.rs diff --git a/src/connector/src/parser/debezium/debezium_parser.rs b/src/connector/src/parser/debezium/debezium_parser.rs index 3986593920770..fe917de7d3696 100644 --- a/src/connector/src/parser/debezium/debezium_parser.rs +++ b/src/connector/src/parser/debezium/debezium_parser.rs @@ -15,6 +15,8 @@ use std::collections::BTreeMap; use risingwave_common::bail; +use risingwave_common::catalog::{ColumnCatalog, ColumnDesc, ColumnId}; +use risingwave_common::types::DataType; use super::simd_json_parser::DebeziumJsonAccessBuilder; use super::{DebeziumAvroAccessBuilder, DebeziumAvroParserConfig}; @@ -28,6 +30,20 @@ use crate::parser::{ }; use crate::source::{SourceColumnDesc, SourceContext, SourceContextRef}; +/// Note: these columns are added in `SourceStreamChunkRowWriter::do_action`. +/// May also look for the usage of `SourceColumnType`. +pub fn debezium_cdc_source_schema() -> Vec { + let columns = vec![ + ColumnCatalog { + column_desc: ColumnDesc::named("payload", ColumnId::placeholder(), DataType::Jsonb), + is_hidden: false, + }, + ColumnCatalog::offset_column(), + ColumnCatalog::cdc_table_name_column(), + ]; + columns +} + #[derive(Debug)] pub struct DebeziumParser { key_builder: AccessBuilderImpl, @@ -192,7 +208,7 @@ mod tests { use std::ops::Deref; use std::sync::Arc; - use risingwave_common::catalog::{ColumnCatalog, ColumnDesc, ColumnId}; + use risingwave_common::catalog::{ColumnCatalog, ColumnDesc, ColumnId, CDC_SOURCE_COLUMN_NUM}; use risingwave_common::row::Row; use risingwave_common::types::Timestamptz; use risingwave_pb::plan_common::{ @@ -327,4 +343,11 @@ mod tests { _ => panic!("unexpected parse result: {:?}", res), } } + + #[tokio::test] + async fn test_cdc_source_job_schema() { + let columns = debezium_cdc_source_schema(); + // make sure it doesn't broken by future PRs + assert_eq!(CDC_SOURCE_COLUMN_NUM, columns.len() as u32); + } } diff --git a/src/frontend/src/handler/alter_source_with_sr.rs b/src/frontend/src/handler/alter_source_with_sr.rs index 41dcc43b0f8f6..05548351492b9 100644 --- a/src/frontend/src/handler/alter_source_with_sr.rs +++ b/src/frontend/src/handler/alter_source_with_sr.rs @@ -30,8 +30,9 @@ use risingwave_sqlparser::ast::{ }; use risingwave_sqlparser::parser::Parser; -use super::alter_table_column::schema_has_schema_registry; -use super::create_source::{generate_stream_graph_for_source, validate_compatibility}; +use super::create_source::{ + generate_stream_graph_for_source, schema_has_schema_registry, validate_compatibility, +}; use super::util::SourceSchemaCompatExt; use super::{HandlerArgs, RwPgResponse}; use crate::catalog::root_catalog::SchemaPath; diff --git a/src/frontend/src/handler/alter_table_column.rs b/src/frontend/src/handler/alter_table_column.rs index 15554e919c77a..0342004f6b1b5 100644 --- a/src/frontend/src/handler/alter_table_column.rs +++ b/src/frontend/src/handler/alter_table_column.rs @@ -29,12 +29,12 @@ use risingwave_pb::ddl_service::TableJobType; use risingwave_pb::stream_plan::stream_node::PbNodeBody; use risingwave_pb::stream_plan::{ProjectNode, StreamFragmentGraph}; use risingwave_sqlparser::ast::{ - AlterTableOperation, ColumnDef, ColumnOption, DataType as AstDataType, Encode, - FormatEncodeOptions, Ident, ObjectName, Statement, StructField, TableConstraint, + AlterTableOperation, ColumnDef, ColumnOption, DataType as AstDataType, Ident, ObjectName, + Statement, StructField, TableConstraint, }; use risingwave_sqlparser::parser::Parser; -use super::create_source::get_json_schema_location; +use super::create_source::schema_has_schema_registry; use super::create_table::{generate_stream_graph_for_replace_table, ColumnIdGenerator}; use super::util::SourceSchemaCompatExt; use super::{HandlerArgs, RwPgResponse}; @@ -45,7 +45,7 @@ use crate::expr::{Expr, ExprImpl, InputRef, Literal}; use crate::handler::create_sink::{fetch_incoming_sinks, insert_merger_to_union_with_project}; use crate::handler::create_table::bind_table_constraints; use crate::session::SessionImpl; -use crate::{Binder, TableCatalog, WithOptions}; +use crate::{Binder, TableCatalog}; /// Used in auto schema change process pub async fn get_new_table_definition_for_cdc_table( @@ -475,17 +475,6 @@ pub async fn handle_alter_table_column( Ok(PgResponse::empty_result(StatementType::ALTER_TABLE)) } -pub fn schema_has_schema_registry(schema: &FormatEncodeOptions) -> bool { - match schema.row_encode { - Encode::Avro | Encode::Protobuf => true, - Encode::Json => { - let mut options = WithOptions::try_from(schema.row_options()).unwrap(); - matches!(get_json_schema_location(options.inner_mut()), Ok(Some(_))) - } - _ => false, - } -} - pub fn fetch_table_catalog_for_alter( session: &SessionImpl, table_name: &ObjectName, diff --git a/src/frontend/src/handler/alter_table_with_sr.rs b/src/frontend/src/handler/alter_table_with_sr.rs index d0daa6f5a82a5..7d6bdab5bf1b3 100644 --- a/src/frontend/src/handler/alter_table_with_sr.rs +++ b/src/frontend/src/handler/alter_table_with_sr.rs @@ -21,7 +21,8 @@ use risingwave_sqlparser::parser::Parser; use thiserror_ext::AsReport; use super::alter_source_with_sr::alter_definition_format_encode; -use super::alter_table_column::{fetch_table_catalog_for_alter, schema_has_schema_registry}; +use super::alter_table_column::fetch_table_catalog_for_alter; +use super::create_source::schema_has_schema_registry; use super::util::SourceSchemaCompatExt; use super::{get_replace_table_plan, HandlerArgs, RwPgResponse}; use crate::error::{ErrorCode, Result}; diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index a6e9ca9b1d93d..2981a96423edf 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -18,6 +18,9 @@ use std::sync::LazyLock; use anyhow::{anyhow, Context}; use either::Either; +use external_schema::debezium::extract_debezium_avro_table_pk_columns; +use external_schema::iceberg::check_iceberg_source; +use external_schema::nexmark::check_nexmark_schema; use itertools::Itertools; use maplit::{convert_args, hashmap, hashset}; use pgwire::pg_response::{PgResponse, StatementType}; @@ -99,138 +102,16 @@ use crate::utils::{ }; use crate::{bind_data_type, build_graph, OptimizerContext, WithOptions, WithOptionsSecResolved}; -/// Map a JSON schema to a relational schema -async fn extract_json_table_schema( - schema_config: &Option<(AstString, bool)>, - with_properties: &BTreeMap, - format_encode_options: &mut BTreeMap, -) -> Result>> { - match schema_config { - None => Ok(None), - Some((schema_location, use_schema_registry)) => { - let schema_registry_auth = use_schema_registry.then(|| { - let auth = SchemaRegistryAuth::from(&*format_encode_options); - try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_USERNAME); - try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_PASSWORD); - auth - }); - Ok(Some( - fetch_json_schema_and_map_to_columns( - &schema_location.0, - schema_registry_auth, - with_properties, - ) - .await? - .into_iter() - .map(|col| ColumnCatalog { - column_desc: col.into(), - is_hidden: false, - }) - .collect_vec(), - )) - } - } -} - -/// Note: these columns are added in `SourceStreamChunkRowWriter::do_action`. -/// May also look for the usage of `SourceColumnType`. -pub fn debezium_cdc_source_schema() -> Vec { - let columns = vec![ - ColumnCatalog { - column_desc: ColumnDesc::named("payload", ColumnId::placeholder(), DataType::Jsonb), - is_hidden: false, - }, - ColumnCatalog::offset_column(), - ColumnCatalog::cdc_table_name_column(), - ]; - columns -} - -fn json_schema_infer_use_schema_registry(schema_config: &Option<(AstString, bool)>) -> bool { - match schema_config { - None => false, - Some((_, use_registry)) => *use_registry, - } -} - -/// Map an Avro schema to a relational schema. -async fn extract_avro_table_schema( - info: &StreamSourceInfo, - with_properties: &WithOptionsSecResolved, - format_encode_options: &mut BTreeMap, - is_debezium: bool, -) -> Result> { - let parser_config = SpecificParserConfig::new(info, with_properties)?; - try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_USERNAME); - try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_PASSWORD); - consume_aws_config_from_options(format_encode_options); - - let vec_column_desc = if is_debezium { - let conf = DebeziumAvroParserConfig::new(parser_config.encoding_config).await?; - conf.map_to_columns()? - } else { - if let risingwave_connector::parser::EncodingProperties::Avro(avro_props) = - &parser_config.encoding_config - && matches!(avro_props.schema_location, SchemaLocation::File { .. }) - && format_encode_options - .get("with_deprecated_file_header") - .is_none_or(|v| v != "true") - { - bail_not_implemented!(issue = 12871, "avro without schema registry"); - } - let conf = AvroParserConfig::new(parser_config.encoding_config).await?; - conf.map_to_columns()? - }; - Ok(vec_column_desc - .into_iter() - .map(|col| ColumnCatalog { - column_desc: col.into(), - is_hidden: false, - }) - .collect_vec()) -} - -async fn extract_debezium_avro_table_pk_columns( - info: &StreamSourceInfo, - with_properties: &WithOptionsSecResolved, -) -> Result> { - let parser_config = SpecificParserConfig::new(info, with_properties)?; - let conf = DebeziumAvroParserConfig::new(parser_config.encoding_config).await?; - Ok(conf.extract_pks()?.drain(..).map(|c| c.name).collect()) -} - -/// Map a protobuf schema to a relational schema. -async fn extract_protobuf_table_schema( - schema: &ProtobufSchema, - with_properties: &WithOptionsSecResolved, - format_encode_options: &mut BTreeMap, -) -> Result> { - let info = StreamSourceInfo { - proto_message_name: schema.message_name.0.clone(), - row_schema_location: schema.row_schema_location.0.clone(), - use_schema_registry: schema.use_schema_registry, - format: FormatType::Plain.into(), - row_encode: EncodeType::Protobuf.into(), - format_encode_options: format_encode_options.clone(), - ..Default::default() - }; - let parser_config = SpecificParserConfig::new(&info, with_properties)?; - try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_USERNAME); - try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_PASSWORD); - consume_aws_config_from_options(format_encode_options); - - let conf = ProtobufParserConfig::new(parser_config.encoding_config).await?; - - let column_descs = conf.map_to_columns()?; - - Ok(column_descs - .into_iter() - .map(|col| ColumnCatalog { - column_desc: col.into(), - is_hidden: false, - }) - .collect_vec()) -} +mod external_schema; +pub use external_schema::{ + bind_columns_from_source, get_schema_location, schema_has_schema_registry, +}; +mod validate; +pub use validate::validate_compatibility; +use validate::{ALLOWED_CONNECTION_CONNECTOR, ALLOWED_CONNECTION_SCHEMA_REGISTRY}; +mod additional_column; +use additional_column::check_and_add_timestamp_column; +pub use additional_column::handle_addition_columns; fn non_generated_sql_columns(columns: &[ColumnDef]) -> Vec { columns @@ -260,48 +141,6 @@ fn consume_aws_config_from_options(format_encode_options: &mut BTreeMap, -) -> Result> { - let schema_location = try_consume_string_from_options(format_encode_options, "schema.location"); - let schema_registry = try_consume_string_from_options(format_encode_options, "schema.registry"); - match (schema_location, schema_registry) { - (None, None) => Ok(None), - (None, Some(schema_registry)) => Ok(Some((schema_registry, true))), - (Some(schema_location), None) => Ok(Some((schema_location, false))), - (Some(_), Some(_)) => Err(RwError::from(ProtocolError( - "only need either the schema location or the schema registry".to_owned(), - ))), - } -} - -fn get_schema_location( - format_encode_options: &mut BTreeMap, -) -> Result<(AstString, bool)> { - let schema_location = try_consume_string_from_options(format_encode_options, "schema.location"); - let schema_registry = try_consume_string_from_options(format_encode_options, "schema.registry"); - match (schema_location, schema_registry) { - (None, None) => Err(RwError::from(ProtocolError( - "missing either a schema location or a schema registry".to_owned(), - ))), - (None, Some(schema_registry)) => Ok((schema_registry, true)), - (Some(schema_location), None) => Ok((schema_location, false)), - (Some(_), Some(_)) => Err(RwError::from(ProtocolError( - "only need either the schema location or the schema registry".to_owned(), - ))), - } -} - -#[inline] -fn get_name_strategy_or_default(name_strategy: Option) -> Result> { - match name_strategy { - None => Ok(None), - Some(name) => Ok(Some(name_strategy_from_str(name.0.as_str()) - .ok_or_else(|| RwError::from(ProtocolError(format!("\ - expect strategy name in topic_name_strategy, record_name_strategy and topic_record_name_strategy, but got {}", name))))? as i32)), - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CreateSourceType { SharedCdc, @@ -341,435 +180,6 @@ impl CreateSourceType { } } -/// Resolves the schema of the source from external schema file. -/// See for more information. -/// -/// Note: the returned schema strictly corresponds to the schema. -/// Other special columns like additional columns (`INCLUDE`), and `row_id` column are not included. -pub async fn bind_columns_from_source( - session: &SessionImpl, - format_encode: &FormatEncodeOptions, - with_properties: Either<&WithOptions, &WithOptionsSecResolved>, - create_source_type: CreateSourceType, -) -> Result<(Option>, StreamSourceInfo)> { - let (columns_from_resolve_source, mut source_info) = - if create_source_type == CreateSourceType::SharedCdc { - bind_columns_from_source_for_cdc(session, format_encode)? - } else { - bind_columns_from_source_for_non_cdc(session, format_encode, with_properties).await? - }; - if create_source_type.is_shared() { - // Note: this field should be called is_shared. Check field doc for more details. - source_info.cdc_source_job = true; - source_info.is_distributed = create_source_type == CreateSourceType::SharedNonCdc; - } - Ok((columns_from_resolve_source, source_info)) -} - -async fn bind_columns_from_source_for_non_cdc( - session: &SessionImpl, - format_encode: &FormatEncodeOptions, - with_properties: Either<&WithOptions, &WithOptionsSecResolved>, -) -> Result<(Option>, StreamSourceInfo)> { - const MESSAGE_NAME_KEY: &str = "message"; - const KEY_MESSAGE_NAME_KEY: &str = "key.message"; - const NAME_STRATEGY_KEY: &str = "schema.registry.name.strategy"; - - let options_with_secret = match with_properties { - Either::Left(options) => { - let (sec_resolve_props, connection_type, _) = resolve_connection_ref_and_secret_ref( - options.clone(), - session, - TelemetryDatabaseObject::Source, - )?; - if !ALLOWED_CONNECTION_CONNECTOR.contains(&connection_type) { - return Err(RwError::from(ProtocolError(format!( - "connection type {:?} is not allowed, allowed types: {:?}", - connection_type, ALLOWED_CONNECTION_CONNECTOR - )))); - } - - sec_resolve_props - } - Either::Right(options_with_secret) => options_with_secret.clone(), - }; - - let is_kafka: bool = options_with_secret.is_kafka_connector(); - - // todo: need to resolve connection ref for schema registry - let (sec_resolve_props, connection_type, schema_registry_conn_ref) = - resolve_connection_ref_and_secret_ref( - WithOptions::try_from(format_encode.row_options())?, - session, - TelemetryDatabaseObject::Source, - )?; - ensure_connection_type_allowed(connection_type, &ALLOWED_CONNECTION_SCHEMA_REGISTRY)?; - - let (format_encode_options, format_encode_secret_refs) = sec_resolve_props.into_parts(); - // Need real secret to access the schema registry - let mut format_encode_options_to_consume = LocalSecretManager::global().fill_secrets( - format_encode_options.clone(), - format_encode_secret_refs.clone(), - )?; - - fn get_key_message_name(options: &mut BTreeMap) -> Option { - consume_string_from_options(options, KEY_MESSAGE_NAME_KEY) - .map(|ele| Some(ele.0)) - .unwrap_or(None) - } - fn get_sr_name_strategy_check( - options: &mut BTreeMap, - use_sr: bool, - ) -> Result> { - let name_strategy = get_name_strategy_or_default(try_consume_string_from_options( - options, - NAME_STRATEGY_KEY, - ))?; - if !use_sr && name_strategy.is_some() { - return Err(RwError::from(ProtocolError( - "schema registry name strategy only works with schema registry enabled".to_owned(), - ))); - } - Ok(name_strategy) - } - - let mut stream_source_info = StreamSourceInfo { - format: format_to_prost(&format_encode.format) as i32, - row_encode: row_encode_to_prost(&format_encode.row_encode) as i32, - format_encode_options, - format_encode_secret_refs, - connection_id: schema_registry_conn_ref, - ..Default::default() - }; - - if format_encode.format == Format::Debezium { - try_consume_string_from_options(&mut format_encode_options_to_consume, DEBEZIUM_IGNORE_KEY); - } - - let columns = match (&format_encode.format, &format_encode.row_encode) { - (Format::Native, Encode::Native) - | (Format::Plain, Encode::Bytes) - | (Format::DebeziumMongo, Encode::Json) => None, - (Format::Plain, Encode::Protobuf) | (Format::Upsert, Encode::Protobuf) => { - let (row_schema_location, use_schema_registry) = - get_schema_location(&mut format_encode_options_to_consume)?; - let protobuf_schema = ProtobufSchema { - message_name: consume_string_from_options( - &mut format_encode_options_to_consume, - MESSAGE_NAME_KEY, - )?, - row_schema_location, - use_schema_registry, - }; - let name_strategy = get_sr_name_strategy_check( - &mut format_encode_options_to_consume, - protobuf_schema.use_schema_registry, - )?; - - stream_source_info.use_schema_registry = protobuf_schema.use_schema_registry; - stream_source_info - .row_schema_location - .clone_from(&protobuf_schema.row_schema_location.0); - stream_source_info - .proto_message_name - .clone_from(&protobuf_schema.message_name.0); - stream_source_info.key_message_name = - get_key_message_name(&mut format_encode_options_to_consume); - stream_source_info.name_strategy = - name_strategy.unwrap_or(PbSchemaRegistryNameStrategy::Unspecified as i32); - - Some( - extract_protobuf_table_schema( - &protobuf_schema, - &options_with_secret, - &mut format_encode_options_to_consume, - ) - .await?, - ) - } - (format @ (Format::Plain | Format::Upsert | Format::Debezium), Encode::Avro) => { - if format_encode_options_to_consume - .remove(AWS_GLUE_SCHEMA_ARN_KEY) - .is_none() - { - // Legacy logic that assumes either `schema.location` or confluent `schema.registry`. - // The handling of newly added aws glue is centralized in `connector::parser`. - // TODO(xiangjinwu): move these option parsing to `connector::parser` as well. - - let (row_schema_location, use_schema_registry) = - get_schema_location(&mut format_encode_options_to_consume)?; - - if matches!(format, Format::Debezium) && !use_schema_registry { - return Err(RwError::from(ProtocolError( - "schema location for DEBEZIUM_AVRO row format is not supported".to_owned(), - ))); - } - - let message_name = try_consume_string_from_options( - &mut format_encode_options_to_consume, - MESSAGE_NAME_KEY, - ); - let name_strategy = get_sr_name_strategy_check( - &mut format_encode_options_to_consume, - use_schema_registry, - )?; - - stream_source_info.use_schema_registry = use_schema_registry; - stream_source_info - .row_schema_location - .clone_from(&row_schema_location.0); - stream_source_info.proto_message_name = - message_name.unwrap_or(AstString("".into())).0; - stream_source_info.key_message_name = - get_key_message_name(&mut format_encode_options_to_consume); - stream_source_info.name_strategy = - name_strategy.unwrap_or(PbSchemaRegistryNameStrategy::Unspecified as i32); - } - - Some( - extract_avro_table_schema( - &stream_source_info, - &options_with_secret, - &mut format_encode_options_to_consume, - matches!(format, Format::Debezium), - ) - .await?, - ) - } - (Format::Plain, Encode::Csv) => { - let chars = - consume_string_from_options(&mut format_encode_options_to_consume, "delimiter")?.0; - let delimiter = get_delimiter(chars.as_str()).context("failed to parse delimiter")?; - let has_header = try_consume_string_from_options( - &mut format_encode_options_to_consume, - "without_header", - ) - .map(|s| s.0 == "false") - .unwrap_or(true); - - if is_kafka && has_header { - return Err(RwError::from(ProtocolError( - "CSV HEADER is not supported when creating table with Kafka connector" - .to_owned(), - ))); - } - - stream_source_info.csv_delimiter = delimiter as i32; - stream_source_info.csv_has_header = has_header; - - None - } - // For parquet format, this step is implemented in parquet parser. - (Format::Plain, Encode::Parquet) => None, - ( - Format::Plain | Format::Upsert | Format::Maxwell | Format::Canal | Format::Debezium, - Encode::Json, - ) => { - if matches!( - format_encode.format, - Format::Plain | Format::Upsert | Format::Debezium - ) { - // Parse the value but throw it away. - // It would be too late to report error in `SpecificParserConfig::new`, - // which leads to recovery loop. - // TODO: rely on SpecificParserConfig::new to validate, like Avro - TimestamptzHandling::from_options(&format_encode_options_to_consume) - .map_err(|err| InvalidInputSyntax(err.message))?; - try_consume_string_from_options( - &mut format_encode_options_to_consume, - TimestamptzHandling::OPTION_KEY, - ); - } - - let schema_config = get_json_schema_location(&mut format_encode_options_to_consume)?; - stream_source_info.use_schema_registry = - json_schema_infer_use_schema_registry(&schema_config); - - extract_json_table_schema( - &schema_config, - &options_with_secret, - &mut format_encode_options_to_consume, - ) - .await? - } - (Format::None, Encode::None) => { - if options_with_secret.is_iceberg_connector() { - Some( - extract_iceberg_columns(&options_with_secret) - .await - .map_err(|err| ProtocolError(err.to_report_string()))?, - ) - } else { - None - } - } - (format, encoding) => { - return Err(RwError::from(ProtocolError(format!( - "Unknown combination {:?} {:?}", - format, encoding - )))); - } - }; - - if !format_encode_options_to_consume.is_empty() { - let err_string = format!( - "Get unknown format_encode_options for {:?} {:?}: {}", - format_encode.format, - format_encode.row_encode, - format_encode_options_to_consume - .keys() - .map(|k| k.to_string()) - .collect::>() - .join(","), - ); - session.notice_to_user(err_string); - } - Ok((columns, stream_source_info)) -} - -fn bind_columns_from_source_for_cdc( - session: &SessionImpl, - format_encode: &FormatEncodeOptions, -) -> Result<(Option>, StreamSourceInfo)> { - let with_options = WithOptions::try_from(format_encode.row_options())?; - if !with_options.connection_ref().is_empty() { - return Err(RwError::from(NotSupported( - "CDC connector does not support connection ref yet".to_owned(), - "Explicitly specify the connection in WITH clause".to_owned(), - ))); - } - let (format_encode_options, format_encode_secret_refs) = - resolve_secret_ref_in_with_options(with_options, session)?.into_parts(); - - // Need real secret to access the schema registry - let mut format_encode_options_to_consume = LocalSecretManager::global().fill_secrets( - format_encode_options.clone(), - format_encode_secret_refs.clone(), - )?; - - match (&format_encode.format, &format_encode.row_encode) { - (Format::Plain, Encode::Json) => (), - (format, encoding) => { - // Note: parser will also check this. Just be extra safe here - return Err(RwError::from(ProtocolError(format!( - "Row format for CDC connectors should be either omitted or set to `FORMAT PLAIN ENCODE JSON`, got: {:?} {:?}", - format, encoding - )))); - } - }; - - let columns = debezium_cdc_source_schema(); - let schema_config = get_json_schema_location(&mut format_encode_options_to_consume)?; - - let stream_source_info = StreamSourceInfo { - format: format_to_prost(&format_encode.format) as i32, - row_encode: row_encode_to_prost(&format_encode.row_encode) as i32, - format_encode_options, - use_schema_registry: json_schema_infer_use_schema_registry(&schema_config), - cdc_source_job: true, - is_distributed: false, - format_encode_secret_refs, - ..Default::default() - }; - if !format_encode_options_to_consume.is_empty() { - let err_string = format!( - "Get unknown format_encode_options for {:?} {:?}: {}", - format_encode.format, - format_encode.row_encode, - format_encode_options_to_consume - .keys() - .map(|k| k.to_string()) - .collect::>() - .join(","), - ); - session.notice_to_user(err_string); - } - Ok((Some(columns), stream_source_info)) -} - -// check the additional column compatibility with the format and encode -fn check_additional_column_compatibility( - column_def: &IncludeOptionItem, - format_encode: Option<&FormatEncodeOptions>, -) -> Result<()> { - // only allow header column have inner field - if column_def.inner_field.is_some() - && !column_def - .column_type - .real_value() - .eq_ignore_ascii_case("header") - { - return Err(RwError::from(ProtocolError(format!( - "Only header column can have inner field, but got {:?}", - column_def.column_type.real_value(), - )))); - } - - // Payload column only allowed when encode is JSON - if let Some(schema) = format_encode - && column_def - .column_type - .real_value() - .eq_ignore_ascii_case("payload") - && !matches!(schema.row_encode, Encode::Json) - { - return Err(RwError::from(ProtocolError(format!( - "INCLUDE payload is only allowed when using ENCODE JSON, but got ENCODE {:?}", - schema.row_encode - )))); - } - Ok(()) -} - -/// add connector-spec columns to the end of column catalog -pub fn handle_addition_columns( - format_encode: Option<&FormatEncodeOptions>, - with_properties: &BTreeMap, - mut additional_columns: IncludeOption, - columns: &mut Vec, - is_cdc_backfill_table: bool, -) -> Result<()> { - let connector_name = with_properties.get_connector().unwrap(); // there must be a connector in source - - if get_supported_additional_columns(connector_name.as_str(), is_cdc_backfill_table).is_none() - && !additional_columns.is_empty() - { - return Err(RwError::from(ProtocolError(format!( - "Connector {} accepts no additional column but got {:?}", - connector_name, additional_columns - )))); - } - - while let Some(item) = additional_columns.pop() { - check_additional_column_compatibility(&item, format_encode)?; - - let data_type = item - .header_inner_expect_type - .map(|dt| bind_data_type(&dt)) - .transpose()?; - if let Some(dt) = &data_type - && !matches!(dt, DataType::Bytea | DataType::Varchar) - { - return Err( - ErrorCode::BindError(format!("invalid additional column data type: {dt}")).into(), - ); - } - let col = build_additional_column_desc( - ColumnId::placeholder(), - connector_name.as_str(), - item.column_type.real_value().as_str(), - item.column_alias.map(|alias| alias.real_value()), - item.inner_field.as_deref(), - data_type.as_ref(), - true, - is_cdc_backfill_table, - )?; - columns.push(ColumnCatalog::visible(col)); - } - - Ok(()) -} - /// Bind columns from both source and sql defined. pub(crate) fn bind_all_columns( format_encode: &FormatEncodeOptions, @@ -1076,35 +486,6 @@ pub(crate) async fn bind_source_pk( Ok(res) } -// Add a hidden column `_rw_kafka_timestamp` to each message from Kafka source. -fn check_and_add_timestamp_column(with_properties: &WithOptions, columns: &mut Vec) { - if with_properties.is_kafka_connector() { - if columns.iter().any(|col| { - matches!( - col.column_desc.additional_column.column_type, - Some(AdditionalColumnType::Timestamp(_)) - ) - }) { - // already has timestamp column, no need to add a new one - return; - } - - // add a hidden column `_rw_kafka_timestamp` to each message from Kafka source - let col = build_additional_column_desc( - ColumnId::placeholder(), - KAFKA_CONNECTOR, - "timestamp", - Some(KAFKA_TIMESTAMP_COLUMN_NAME.to_owned()), - None, - None, - true, - false, - ) - .unwrap(); - columns.push(ColumnCatalog::hidden(col)); - } -} - pub(super) fn bind_source_watermark( session: &SessionImpl, name: String, @@ -1139,236 +520,6 @@ pub(super) fn bind_source_watermark( Ok(watermark_descs) } -static ALLOWED_CONNECTION_CONNECTOR: LazyLock> = LazyLock::new(|| { - hashset! { - PbConnectionType::Unspecified, - PbConnectionType::Kafka, - PbConnectionType::Iceberg, - } -}); - -static ALLOWED_CONNECTION_SCHEMA_REGISTRY: LazyLock> = - LazyLock::new(|| { - hashset! { - PbConnectionType::Unspecified, - PbConnectionType::SchemaRegistry, - } - }); - -// TODO: Better design if we want to support ENCODE KEY where we will have 4 dimensional array -static CONNECTORS_COMPATIBLE_FORMATS: LazyLock>>> = - LazyLock::new(|| { - convert_args!(hashmap!( - KAFKA_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes, Encode::Csv], - Format::Upsert => vec![Encode::Json, Encode::Avro, Encode::Protobuf], - Format::Debezium => vec![Encode::Json, Encode::Avro], - Format::Maxwell => vec![Encode::Json], - Format::Canal => vec![Encode::Json], - Format::DebeziumMongo => vec![Encode::Json], - ), - PULSAR_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes], - Format::Upsert => vec![Encode::Json, Encode::Avro], - Format::Debezium => vec![Encode::Json], - Format::Maxwell => vec![Encode::Json], - Format::Canal => vec![Encode::Json], - ), - KINESIS_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes], - Format::Upsert => vec![Encode::Json, Encode::Avro], - Format::Debezium => vec![Encode::Json], - Format::Maxwell => vec![Encode::Json], - Format::Canal => vec![Encode::Json], - ), - GOOGLE_PUBSUB_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes], - Format::Debezium => vec![Encode::Json], - Format::Maxwell => vec![Encode::Json], - Format::Canal => vec![Encode::Json], - ), - NEXMARK_CONNECTOR => hashmap!( - Format::Native => vec![Encode::Native], - Format::Plain => vec![Encode::Bytes], - ), - DATAGEN_CONNECTOR => hashmap!( - Format::Native => vec![Encode::Native], - Format::Plain => vec![Encode::Bytes, Encode::Json], - ), - S3_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Csv, Encode::Json], - ), - OPENDAL_S3_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Csv, Encode::Json, Encode::Parquet], - ), - GCS_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Csv, Encode::Json, Encode::Parquet], - ), - AZBLOB_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Csv, Encode::Json, Encode::Parquet], - ), - POSIX_FS_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Csv], - ), - MYSQL_CDC_CONNECTOR => hashmap!( - Format::Debezium => vec![Encode::Json], - // support source stream job - Format::Plain => vec![Encode::Json], - ), - POSTGRES_CDC_CONNECTOR => hashmap!( - Format::Debezium => vec![Encode::Json], - // support source stream job - Format::Plain => vec![Encode::Json], - ), - CITUS_CDC_CONNECTOR => hashmap!( - Format::Debezium => vec![Encode::Json], - ), - MONGODB_CDC_CONNECTOR => hashmap!( - Format::DebeziumMongo => vec![Encode::Json], - ), - NATS_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Bytes], - ), - MQTT_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json, Encode::Bytes], - ), - TEST_CONNECTOR => hashmap!( - Format::Plain => vec![Encode::Json], - ), - ICEBERG_CONNECTOR => hashmap!( - Format::None => vec![Encode::None], - ), - SQL_SERVER_CDC_CONNECTOR => hashmap!( - Format::Debezium => vec![Encode::Json], - // support source stream job - Format::Plain => vec![Encode::Json], - ), - )) - }); - -pub fn validate_license(connector: &str) -> Result<()> { - if connector == SQL_SERVER_CDC_CONNECTOR { - Feature::SqlServerCdcSource - .check_available() - .map_err(|e| anyhow::anyhow!(e))?; - } - Ok(()) -} - -pub fn validate_compatibility( - format_encode: &FormatEncodeOptions, - props: &mut BTreeMap, -) -> Result<()> { - let mut connector = props - .get_connector() - .ok_or_else(|| RwError::from(ProtocolError("missing field 'connector'".to_owned())))?; - - if connector == OPENDAL_S3_CONNECTOR { - // reject s3_v2 creation - return Err(RwError::from(Deprecated( - OPENDAL_S3_CONNECTOR.to_owned(), - S3_CONNECTOR.to_owned(), - ))); - } - if connector == S3_CONNECTOR { - // S3 connector is deprecated, use OPENDAL_S3_CONNECTOR instead - // do s3 -> s3_v2 migration - let entry = props.get_mut(UPSTREAM_SOURCE_KEY).unwrap(); - *entry = OPENDAL_S3_CONNECTOR.to_owned(); - connector = OPENDAL_S3_CONNECTOR.to_owned(); - } - - let compatible_formats = CONNECTORS_COMPATIBLE_FORMATS - .get(&connector) - .ok_or_else(|| { - RwError::from(ProtocolError(format!( - "connector {:?} is not supported, accept {:?}", - connector, - CONNECTORS_COMPATIBLE_FORMATS.keys() - ))) - })?; - - validate_license(&connector)?; - if connector != KAFKA_CONNECTOR { - let res = match (&format_encode.format, &format_encode.row_encode) { - (Format::Plain, Encode::Protobuf) | (Format::Plain, Encode::Avro) => { - let mut options = WithOptions::try_from(format_encode.row_options())?; - let (_, use_schema_registry) = get_schema_location(options.inner_mut())?; - use_schema_registry - } - (Format::Debezium, Encode::Avro) => true, - (_, _) => false, - }; - if res { - return Err(RwError::from(ProtocolError(format!( - "The {} must be kafka when schema registry is used", - UPSTREAM_SOURCE_KEY - )))); - } - } - - let compatible_encodes = compatible_formats - .get(&format_encode.format) - .ok_or_else(|| { - RwError::from(ProtocolError(format!( - "connector {} does not support format {:?}", - connector, format_encode.format - ))) - })?; - if !compatible_encodes.contains(&format_encode.row_encode) { - return Err(RwError::from(ProtocolError(format!( - "connector {} does not support format {:?} with encode {:?}", - connector, format_encode.format, format_encode.row_encode - )))); - } - - if connector == POSTGRES_CDC_CONNECTOR || connector == CITUS_CDC_CONNECTOR { - match props.get("slot.name") { - None => { - // Build a random slot name with UUID - // e.g. "rw_cdc_f9a3567e6dd54bf5900444c8b1c03815" - let uuid = uuid::Uuid::new_v4(); - props.insert("slot.name".into(), format!("rw_cdc_{}", uuid.simple())); - } - Some(slot_name) => { - // please refer to - // - https://github.com/debezium/debezium/blob/97956ce25b7612e3413d363658661896b7d2e0a2/debezium-connector-postgres/src/main/java/io/debezium/connector/postgresql/PostgresConnectorConfig.java#L1179 - // - https://doxygen.postgresql.org/slot_8c.html#afac399f07320b9adfd2c599cf822aaa3 - if !slot_name - .chars() - .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_') - || slot_name.len() > 63 - { - return Err(RwError::from(ProtocolError(format!( - "Invalid replication slot name: {:?}. Valid replication slot name must contain only digits, lowercase characters and underscores with length <= 63", - slot_name - )))); - } - } - } - - if !props.contains_key("schema.name") { - // Default schema name is "public" - props.insert("schema.name".into(), "public".into()); - } - if !props.contains_key("publication.name") { - // Default publication name is "rw_publication" - props.insert("publication.name".into(), "rw_publication".into()); - } - if !props.contains_key("publication.create.enable") { - // Default auto create publication if doesn't exist - props.insert("publication.create.enable".into(), "true".into()); - } - } - - if connector == SQL_SERVER_CDC_CONNECTOR && !props.contains_key("schema.name") { - // Default schema name is "dbo" - props.insert("schema.name".into(), "dbo".into()); - } - - Ok(()) -} - /// Performs early stage checking in frontend to see if the schema of the given `columns` is /// compatible with the connector extracted from the properties. /// @@ -1394,145 +545,6 @@ pub(super) async fn check_format_encode( } } -pub(super) fn check_nexmark_schema( - props: &WithOptionsSecResolved, - row_id_index: Option, - columns: &[ColumnCatalog], -) -> Result<()> { - let table_type = props - .get("nexmark.table.type") - .map(|t| t.to_ascii_lowercase()); - - let event_type = match table_type.as_deref() { - None => None, - Some("bid") => Some(EventType::Bid), - Some("auction") => Some(EventType::Auction), - Some("person") => Some(EventType::Person), - Some(t) => { - return Err(RwError::from(ProtocolError(format!( - "unsupported table type for nexmark source: {}", - t - )))) - } - }; - - // Ignore the generated columns and map the index of row_id column. - let user_defined_columns = columns.iter().filter(|c| !c.is_generated()); - let row_id_index = if let Some(index) = row_id_index { - let col_id = columns[index].column_id(); - user_defined_columns - .clone() - .position(|c| c.column_id() == col_id) - .unwrap() - .into() - } else { - None - }; - - let expected = get_event_data_types_with_names(event_type, row_id_index); - let user_defined = user_defined_columns - .map(|c| { - ( - c.column_desc.name.to_ascii_lowercase(), - c.column_desc.data_type.to_owned(), - ) - }) - .collect_vec(); - - if expected != user_defined { - let cmp = pretty_assertions::Comparison::new(&expected, &user_defined); - return Err(RwError::from(ProtocolError(format!( - "The schema of the nexmark source must specify all columns in order:\n{cmp}", - )))); - } - Ok(()) -} - -pub async fn extract_iceberg_columns( - with_properties: &WithOptionsSecResolved, -) -> anyhow::Result> { - let props = ConnectorProperties::extract(with_properties.clone(), true)?; - if let ConnectorProperties::Iceberg(properties) = props { - let table = properties.load_table_v2().await?; - let iceberg_schema: arrow_schema_iceberg::Schema = - iceberg::arrow::schema_to_arrow_schema(table.metadata().current_schema())?; - - let mut columns: Vec = iceberg_schema - .fields() - .iter() - .enumerate() - .map(|(i, field)| { - let column_desc = ColumnDesc::named( - field.name(), - ColumnId::new((i + 1).try_into().unwrap()), - IcebergArrowConvert.type_from_field(field).unwrap(), - ); - ColumnCatalog { - column_desc, - // hide the _row_id column for iceberg engine table - // This column is auto generated when users define a table without primary key - is_hidden: field.name() == ROWID_PREFIX, - } - }) - .collect(); - columns.push(ColumnCatalog::iceberg_sequence_num_column()); - - Ok(columns) - } else { - Err(anyhow!(format!( - "Invalid properties for iceberg source: {:?}", - props - ))) - } -} - -pub async fn check_iceberg_source( - props: &WithOptionsSecResolved, - columns: &[ColumnCatalog], -) -> anyhow::Result<()> { - let props = ConnectorProperties::extract(props.clone(), true)?; - let ConnectorProperties::Iceberg(properties) = props else { - return Err(anyhow!(format!( - "Invalid properties for iceberg source: {:?}", - props - ))); - }; - - let schema = Schema { - fields: columns - .iter() - .filter(|&c| c.column_desc.name != ICEBERG_SEQUENCE_NUM_COLUMN_NAME) - .cloned() - .map(|c| c.column_desc.into()) - .collect(), - }; - - let table = properties.load_table_v2().await?; - - let iceberg_schema = iceberg::arrow::schema_to_arrow_schema(table.metadata().current_schema())?; - - for f1 in schema.fields() { - if !iceberg_schema.fields.iter().any(|f2| f2.name() == &f1.name) { - return Err(anyhow::anyhow!(format!( - "Column {} not found in iceberg table", - f1.name - ))); - } - } - - let new_iceberg_field = iceberg_schema - .fields - .iter() - .filter(|f1| schema.fields.iter().any(|f2| f1.name() == &f2.name)) - .cloned() - .collect::>(); - let new_iceberg_schema = arrow_schema_iceberg::Schema::new(new_iceberg_field); - - risingwave_connector::sink::iceberg::try_matches_arrow_schema(&schema, &new_iceberg_schema)?; - - Ok(()) -} - pub fn bind_connector_props( handler_args: &HandlerArgs, format_encode: &FormatEncodeOptions, @@ -1875,46 +887,16 @@ pub(super) fn generate_stream_graph_for_source( Ok(graph) } -fn format_to_prost(format: &Format) -> FormatType { - match format { - Format::Native => FormatType::Native, - Format::Plain => FormatType::Plain, - Format::Upsert => FormatType::Upsert, - Format::Debezium => FormatType::Debezium, - Format::DebeziumMongo => FormatType::DebeziumMongo, - Format::Maxwell => FormatType::Maxwell, - Format::Canal => FormatType::Canal, - Format::None => FormatType::None, - } -} -fn row_encode_to_prost(row_encode: &Encode) -> EncodeType { - match row_encode { - Encode::Native => EncodeType::Native, - Encode::Json => EncodeType::Json, - Encode::Avro => EncodeType::Avro, - Encode::Protobuf => EncodeType::Protobuf, - Encode::Csv => EncodeType::Csv, - Encode::Bytes => EncodeType::Bytes, - Encode::Template => EncodeType::Template, - Encode::Parquet => EncodeType::Parquet, - Encode::None => EncodeType::None, - Encode::Text => EncodeType::Text, - } -} - #[cfg(test)] pub mod tests { use std::collections::HashMap; use std::sync::Arc; - use risingwave_common::catalog::{ - CDC_SOURCE_COLUMN_NUM, DEFAULT_DATABASE_NAME, DEFAULT_SCHEMA_NAME, ROWID_PREFIX, - }; + use risingwave_common::catalog::{DEFAULT_DATABASE_NAME, DEFAULT_SCHEMA_NAME, ROWID_PREFIX}; use risingwave_common::types::{DataType, StructType}; use crate::catalog::root_catalog::SchemaPath; use crate::catalog::source_catalog::SourceCatalog; - use crate::handler::create_source::debezium_cdc_source_schema; use crate::test_utils::{create_proto_file, LocalFrontend, PROTO_FILE_DATA}; const GET_COLUMN_FROM_CATALOG: fn(&Arc) -> HashMap<&str, DataType> = @@ -2087,13 +1069,6 @@ pub mod tests { .assert_debug_eq(&columns); } - #[tokio::test] - async fn test_cdc_source_job_schema() { - let columns = debezium_cdc_source_schema(); - // make sure it doesn't broken by future PRs - assert_eq!(CDC_SOURCE_COLUMN_NUM, columns.len() as u32); - } - #[tokio::test] async fn test_source_addition_columns() { // test derive include column for format plain diff --git a/src/frontend/src/handler/create_source/additional_column.rs b/src/frontend/src/handler/create_source/additional_column.rs new file mode 100644 index 0000000000000..7af5c4519be06 --- /dev/null +++ b/src/frontend/src/handler/create_source/additional_column.rs @@ -0,0 +1,130 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +// check the additional column compatibility with the format and encode +fn check_additional_column_compatibility( + column_def: &IncludeOptionItem, + format_encode: Option<&FormatEncodeOptions>, +) -> Result<()> { + // only allow header column have inner field + if column_def.inner_field.is_some() + && !column_def + .column_type + .real_value() + .eq_ignore_ascii_case("header") + { + return Err(RwError::from(ProtocolError(format!( + "Only header column can have inner field, but got {:?}", + column_def.column_type.real_value(), + )))); + } + + // Payload column only allowed when encode is JSON + if let Some(schema) = format_encode + && column_def + .column_type + .real_value() + .eq_ignore_ascii_case("payload") + && !matches!(schema.row_encode, Encode::Json) + { + return Err(RwError::from(ProtocolError(format!( + "INCLUDE payload is only allowed when using ENCODE JSON, but got ENCODE {:?}", + schema.row_encode + )))); + } + Ok(()) +} + +/// add connector-spec columns to the end of column catalog +pub fn handle_addition_columns( + format_encode: Option<&FormatEncodeOptions>, + with_properties: &BTreeMap, + mut additional_columns: IncludeOption, + columns: &mut Vec, + is_cdc_backfill_table: bool, +) -> Result<()> { + let connector_name = with_properties.get_connector().unwrap(); // there must be a connector in source + + if get_supported_additional_columns(connector_name.as_str(), is_cdc_backfill_table).is_none() + && !additional_columns.is_empty() + { + return Err(RwError::from(ProtocolError(format!( + "Connector {} accepts no additional column but got {:?}", + connector_name, additional_columns + )))); + } + + while let Some(item) = additional_columns.pop() { + check_additional_column_compatibility(&item, format_encode)?; + + let data_type = item + .header_inner_expect_type + .map(|dt| bind_data_type(&dt)) + .transpose()?; + if let Some(dt) = &data_type + && !matches!(dt, DataType::Bytea | DataType::Varchar) + { + return Err( + ErrorCode::BindError(format!("invalid additional column data type: {dt}")).into(), + ); + } + let col = build_additional_column_desc( + ColumnId::placeholder(), + connector_name.as_str(), + item.column_type.real_value().as_str(), + item.column_alias.map(|alias| alias.real_value()), + item.inner_field.as_deref(), + data_type.as_ref(), + true, + is_cdc_backfill_table, + )?; + columns.push(ColumnCatalog::visible(col)); + } + + Ok(()) +} + +// Add a hidden column `_rw_kafka_timestamp` to each message from Kafka source. +pub fn check_and_add_timestamp_column( + with_properties: &WithOptions, + columns: &mut Vec, +) { + if with_properties.is_kafka_connector() { + if columns.iter().any(|col| { + matches!( + col.column_desc.additional_column.column_type, + Some(AdditionalColumnType::Timestamp(_)) + ) + }) { + // already has timestamp column, no need to add a new one + return; + } + + // add a hidden column `_rw_kafka_timestamp` to each message from Kafka source + let col = build_additional_column_desc( + ColumnId::placeholder(), + KAFKA_CONNECTOR, + "timestamp", + Some(KAFKA_TIMESTAMP_COLUMN_NAME.to_owned()), + None, + None, + true, + false, + ) + .unwrap(); + columns.push(ColumnCatalog::hidden(col)); + } +} diff --git a/src/frontend/src/handler/create_source/external_schema.rs b/src/frontend/src/handler/create_source/external_schema.rs new file mode 100644 index 0000000000000..8d8288eec3a1c --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema.rs @@ -0,0 +1,441 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! bind columns from external schema + +use risingwave_connector::parser::debezium_cdc_source_schema; + +use super::*; + +mod json; +use json::*; +mod avro; +use avro::extract_avro_table_schema; +pub mod debezium; +pub mod iceberg; +use iceberg::extract_iceberg_columns; +mod protobuf; +use protobuf::extract_protobuf_table_schema; +pub mod nexmark; + +/// Resolves the schema of the source from external schema file. +/// See for more information. +/// +/// Note: the returned schema strictly corresponds to the schema. +/// Other special columns like additional columns (`INCLUDE`), and `row_id` column are not included. +pub async fn bind_columns_from_source( + session: &SessionImpl, + format_encode: &FormatEncodeOptions, + with_properties: Either<&WithOptions, &WithOptionsSecResolved>, + create_source_type: CreateSourceType, +) -> Result<(Option>, StreamSourceInfo)> { + let (columns_from_resolve_source, mut source_info) = + if create_source_type == CreateSourceType::SharedCdc { + bind_columns_from_source_for_cdc(session, format_encode)? + } else { + bind_columns_from_source_for_non_cdc(session, format_encode, with_properties).await? + }; + if create_source_type.is_shared() { + // Note: this field should be called is_shared. Check field doc for more details. + source_info.cdc_source_job = true; + source_info.is_distributed = create_source_type == CreateSourceType::SharedNonCdc; + } + Ok((columns_from_resolve_source, source_info)) +} + +async fn bind_columns_from_source_for_non_cdc( + session: &SessionImpl, + format_encode: &FormatEncodeOptions, + with_properties: Either<&WithOptions, &WithOptionsSecResolved>, +) -> Result<(Option>, StreamSourceInfo)> { + const MESSAGE_NAME_KEY: &str = "message"; + const KEY_MESSAGE_NAME_KEY: &str = "key.message"; + const NAME_STRATEGY_KEY: &str = "schema.registry.name.strategy"; + + let options_with_secret = match with_properties { + Either::Left(options) => { + let (sec_resolve_props, connection_type, _) = resolve_connection_ref_and_secret_ref( + options.clone(), + session, + TelemetryDatabaseObject::Source, + )?; + if !ALLOWED_CONNECTION_CONNECTOR.contains(&connection_type) { + return Err(RwError::from(ProtocolError(format!( + "connection type {:?} is not allowed, allowed types: {:?}", + connection_type, ALLOWED_CONNECTION_CONNECTOR + )))); + } + + sec_resolve_props + } + Either::Right(options_with_secret) => options_with_secret.clone(), + }; + + let is_kafka: bool = options_with_secret.is_kafka_connector(); + + // todo: need to resolve connection ref for schema registry + let (sec_resolve_props, connection_type, schema_registry_conn_ref) = + resolve_connection_ref_and_secret_ref( + WithOptions::try_from(format_encode.row_options())?, + session, + TelemetryDatabaseObject::Source, + )?; + ensure_connection_type_allowed(connection_type, &ALLOWED_CONNECTION_SCHEMA_REGISTRY)?; + + let (format_encode_options, format_encode_secret_refs) = sec_resolve_props.into_parts(); + // Need real secret to access the schema registry + let mut format_encode_options_to_consume = LocalSecretManager::global().fill_secrets( + format_encode_options.clone(), + format_encode_secret_refs.clone(), + )?; + + fn get_key_message_name(options: &mut BTreeMap) -> Option { + consume_string_from_options(options, KEY_MESSAGE_NAME_KEY) + .map(|ele| Some(ele.0)) + .unwrap_or(None) + } + fn get_sr_name_strategy_check( + options: &mut BTreeMap, + use_sr: bool, + ) -> Result> { + let name_strategy = get_name_strategy_or_default(try_consume_string_from_options( + options, + NAME_STRATEGY_KEY, + ))?; + if !use_sr && name_strategy.is_some() { + return Err(RwError::from(ProtocolError( + "schema registry name strategy only works with schema registry enabled".to_owned(), + ))); + } + Ok(name_strategy) + } + + let mut stream_source_info = StreamSourceInfo { + format: format_to_prost(&format_encode.format) as i32, + row_encode: row_encode_to_prost(&format_encode.row_encode) as i32, + format_encode_options, + format_encode_secret_refs, + connection_id: schema_registry_conn_ref, + ..Default::default() + }; + + if format_encode.format == Format::Debezium { + try_consume_string_from_options(&mut format_encode_options_to_consume, DEBEZIUM_IGNORE_KEY); + } + + let columns = match (&format_encode.format, &format_encode.row_encode) { + (Format::Native, Encode::Native) + | (Format::Plain, Encode::Bytes) + | (Format::DebeziumMongo, Encode::Json) => None, + (Format::Plain, Encode::Protobuf) | (Format::Upsert, Encode::Protobuf) => { + let (row_schema_location, use_schema_registry) = + get_schema_location(&mut format_encode_options_to_consume)?; + let protobuf_schema = ProtobufSchema { + message_name: consume_string_from_options( + &mut format_encode_options_to_consume, + MESSAGE_NAME_KEY, + )?, + row_schema_location, + use_schema_registry, + }; + let name_strategy = get_sr_name_strategy_check( + &mut format_encode_options_to_consume, + protobuf_schema.use_schema_registry, + )?; + + stream_source_info.use_schema_registry = protobuf_schema.use_schema_registry; + stream_source_info + .row_schema_location + .clone_from(&protobuf_schema.row_schema_location.0); + stream_source_info + .proto_message_name + .clone_from(&protobuf_schema.message_name.0); + stream_source_info.key_message_name = + get_key_message_name(&mut format_encode_options_to_consume); + stream_source_info.name_strategy = + name_strategy.unwrap_or(PbSchemaRegistryNameStrategy::Unspecified as i32); + + Some( + extract_protobuf_table_schema( + &protobuf_schema, + &options_with_secret, + &mut format_encode_options_to_consume, + ) + .await?, + ) + } + (format @ (Format::Plain | Format::Upsert | Format::Debezium), Encode::Avro) => { + if format_encode_options_to_consume + .remove(AWS_GLUE_SCHEMA_ARN_KEY) + .is_none() + { + // Legacy logic that assumes either `schema.location` or confluent `schema.registry`. + // The handling of newly added aws glue is centralized in `connector::parser`. + // TODO(xiangjinwu): move these option parsing to `connector::parser` as well. + + let (row_schema_location, use_schema_registry) = + get_schema_location(&mut format_encode_options_to_consume)?; + + if matches!(format, Format::Debezium) && !use_schema_registry { + return Err(RwError::from(ProtocolError( + "schema location for DEBEZIUM_AVRO row format is not supported".to_owned(), + ))); + } + + let message_name = try_consume_string_from_options( + &mut format_encode_options_to_consume, + MESSAGE_NAME_KEY, + ); + let name_strategy = get_sr_name_strategy_check( + &mut format_encode_options_to_consume, + use_schema_registry, + )?; + + stream_source_info.use_schema_registry = use_schema_registry; + stream_source_info + .row_schema_location + .clone_from(&row_schema_location.0); + stream_source_info.proto_message_name = + message_name.unwrap_or(AstString("".into())).0; + stream_source_info.key_message_name = + get_key_message_name(&mut format_encode_options_to_consume); + stream_source_info.name_strategy = + name_strategy.unwrap_or(PbSchemaRegistryNameStrategy::Unspecified as i32); + } + + Some( + extract_avro_table_schema( + &stream_source_info, + &options_with_secret, + &mut format_encode_options_to_consume, + matches!(format, Format::Debezium), + ) + .await?, + ) + } + (Format::Plain, Encode::Csv) => { + let chars = + consume_string_from_options(&mut format_encode_options_to_consume, "delimiter")?.0; + let delimiter = get_delimiter(chars.as_str()).context("failed to parse delimiter")?; + let has_header = try_consume_string_from_options( + &mut format_encode_options_to_consume, + "without_header", + ) + .map(|s| s.0 == "false") + .unwrap_or(true); + + if is_kafka && has_header { + return Err(RwError::from(ProtocolError( + "CSV HEADER is not supported when creating table with Kafka connector" + .to_owned(), + ))); + } + + stream_source_info.csv_delimiter = delimiter as i32; + stream_source_info.csv_has_header = has_header; + + None + } + // For parquet format, this step is implemented in parquet parser. + (Format::Plain, Encode::Parquet) => None, + ( + Format::Plain | Format::Upsert | Format::Maxwell | Format::Canal | Format::Debezium, + Encode::Json, + ) => { + if matches!( + format_encode.format, + Format::Plain | Format::Upsert | Format::Debezium + ) { + // Parse the value but throw it away. + // It would be too late to report error in `SpecificParserConfig::new`, + // which leads to recovery loop. + // TODO: rely on SpecificParserConfig::new to validate, like Avro + TimestamptzHandling::from_options(&format_encode_options_to_consume) + .map_err(|err| InvalidInputSyntax(err.message))?; + try_consume_string_from_options( + &mut format_encode_options_to_consume, + TimestamptzHandling::OPTION_KEY, + ); + } + + let schema_config = get_json_schema_location(&mut format_encode_options_to_consume)?; + stream_source_info.use_schema_registry = + json_schema_infer_use_schema_registry(&schema_config); + + extract_json_table_schema( + &schema_config, + &options_with_secret, + &mut format_encode_options_to_consume, + ) + .await? + } + (Format::None, Encode::None) => { + if options_with_secret.is_iceberg_connector() { + Some( + extract_iceberg_columns(&options_with_secret) + .await + .map_err(|err| ProtocolError(err.to_report_string()))?, + ) + } else { + None + } + } + (format, encoding) => { + return Err(RwError::from(ProtocolError(format!( + "Unknown combination {:?} {:?}", + format, encoding + )))); + } + }; + + if !format_encode_options_to_consume.is_empty() { + let err_string = format!( + "Get unknown format_encode_options for {:?} {:?}: {}", + format_encode.format, + format_encode.row_encode, + format_encode_options_to_consume + .keys() + .map(|k| k.to_string()) + .collect::>() + .join(","), + ); + session.notice_to_user(err_string); + } + Ok((columns, stream_source_info)) +} + +fn bind_columns_from_source_for_cdc( + session: &SessionImpl, + format_encode: &FormatEncodeOptions, +) -> Result<(Option>, StreamSourceInfo)> { + let with_options = WithOptions::try_from(format_encode.row_options())?; + if !with_options.connection_ref().is_empty() { + return Err(RwError::from(NotSupported( + "CDC connector does not support connection ref yet".to_owned(), + "Explicitly specify the connection in WITH clause".to_owned(), + ))); + } + let (format_encode_options, format_encode_secret_refs) = + resolve_secret_ref_in_with_options(with_options, session)?.into_parts(); + + // Need real secret to access the schema registry + let mut format_encode_options_to_consume = LocalSecretManager::global().fill_secrets( + format_encode_options.clone(), + format_encode_secret_refs.clone(), + )?; + + match (&format_encode.format, &format_encode.row_encode) { + (Format::Plain, Encode::Json) => (), + (format, encoding) => { + // Note: parser will also check this. Just be extra safe here + return Err(RwError::from(ProtocolError(format!( + "Row format for CDC connectors should be either omitted or set to `FORMAT PLAIN ENCODE JSON`, got: {:?} {:?}", + format, encoding + )))); + } + }; + + let columns = debezium_cdc_source_schema(); + let schema_config = get_json_schema_location(&mut format_encode_options_to_consume)?; + + let stream_source_info = StreamSourceInfo { + format: format_to_prost(&format_encode.format) as i32, + row_encode: row_encode_to_prost(&format_encode.row_encode) as i32, + format_encode_options, + use_schema_registry: json_schema_infer_use_schema_registry(&schema_config), + cdc_source_job: true, + is_distributed: false, + format_encode_secret_refs, + ..Default::default() + }; + if !format_encode_options_to_consume.is_empty() { + let err_string = format!( + "Get unknown format_encode_options for {:?} {:?}: {}", + format_encode.format, + format_encode.row_encode, + format_encode_options_to_consume + .keys() + .map(|k| k.to_string()) + .collect::>() + .join(","), + ); + session.notice_to_user(err_string); + } + Ok((Some(columns), stream_source_info)) +} + +fn format_to_prost(format: &Format) -> FormatType { + match format { + Format::Native => FormatType::Native, + Format::Plain => FormatType::Plain, + Format::Upsert => FormatType::Upsert, + Format::Debezium => FormatType::Debezium, + Format::DebeziumMongo => FormatType::DebeziumMongo, + Format::Maxwell => FormatType::Maxwell, + Format::Canal => FormatType::Canal, + Format::None => FormatType::None, + } +} +fn row_encode_to_prost(row_encode: &Encode) -> EncodeType { + match row_encode { + Encode::Native => EncodeType::Native, + Encode::Json => EncodeType::Json, + Encode::Avro => EncodeType::Avro, + Encode::Protobuf => EncodeType::Protobuf, + Encode::Csv => EncodeType::Csv, + Encode::Bytes => EncodeType::Bytes, + Encode::Template => EncodeType::Template, + Encode::Parquet => EncodeType::Parquet, + Encode::None => EncodeType::None, + Encode::Text => EncodeType::Text, + } +} + +pub fn get_schema_location( + format_encode_options: &mut BTreeMap, +) -> Result<(AstString, bool)> { + let schema_location = try_consume_string_from_options(format_encode_options, "schema.location"); + let schema_registry = try_consume_string_from_options(format_encode_options, "schema.registry"); + match (schema_location, schema_registry) { + (None, None) => Err(RwError::from(ProtocolError( + "missing either a schema location or a schema registry".to_owned(), + ))), + (None, Some(schema_registry)) => Ok((schema_registry, true)), + (Some(schema_location), None) => Ok((schema_location, false)), + (Some(_), Some(_)) => Err(RwError::from(ProtocolError( + "only need either the schema location or the schema registry".to_owned(), + ))), + } +} + +pub fn schema_has_schema_registry(schema: &FormatEncodeOptions) -> bool { + match schema.row_encode { + Encode::Avro | Encode::Protobuf => true, + Encode::Json => { + let mut options = WithOptions::try_from(schema.row_options()).unwrap(); + matches!(get_json_schema_location(options.inner_mut()), Ok(Some(_))) + } + _ => false, + } +} + +#[inline] +fn get_name_strategy_or_default(name_strategy: Option) -> Result> { + match name_strategy { + None => Ok(None), + Some(name) => Ok(Some(name_strategy_from_str(name.0.as_str()) + .ok_or_else(|| RwError::from(ProtocolError(format!("\ + expect strategy name in topic_name_strategy, record_name_strategy and topic_record_name_strategy, but got {}", name))))? as i32)), + } +} diff --git a/src/frontend/src/handler/create_source/external_schema/avro.rs b/src/frontend/src/handler/create_source/external_schema/avro.rs new file mode 100644 index 0000000000000..e6d3e0e7cd3a5 --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema/avro.rs @@ -0,0 +1,52 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +/// Map an Avro schema to a relational schema. +pub async fn extract_avro_table_schema( + info: &StreamSourceInfo, + with_properties: &WithOptionsSecResolved, + format_encode_options: &mut BTreeMap, + is_debezium: bool, +) -> Result> { + let parser_config = SpecificParserConfig::new(info, with_properties)?; + try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_USERNAME); + try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_PASSWORD); + consume_aws_config_from_options(format_encode_options); + + let vec_column_desc = if is_debezium { + let conf = DebeziumAvroParserConfig::new(parser_config.encoding_config).await?; + conf.map_to_columns()? + } else { + if let risingwave_connector::parser::EncodingProperties::Avro(avro_props) = + &parser_config.encoding_config + && matches!(avro_props.schema_location, SchemaLocation::File { .. }) + && format_encode_options + .get("with_deprecated_file_header") + .is_none_or(|v| v != "true") + { + bail_not_implemented!(issue = 12871, "avro without schema registry"); + } + let conf = AvroParserConfig::new(parser_config.encoding_config).await?; + conf.map_to_columns()? + }; + Ok(vec_column_desc + .into_iter() + .map(|col| ColumnCatalog { + column_desc: col.into(), + is_hidden: false, + }) + .collect_vec()) +} diff --git a/src/frontend/src/handler/create_source/external_schema/debezium.rs b/src/frontend/src/handler/create_source/external_schema/debezium.rs new file mode 100644 index 0000000000000..d02667016092f --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema/debezium.rs @@ -0,0 +1,24 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +pub async fn extract_debezium_avro_table_pk_columns( + info: &StreamSourceInfo, + with_properties: &WithOptionsSecResolved, +) -> Result> { + let parser_config = SpecificParserConfig::new(info, with_properties)?; + let conf = DebeziumAvroParserConfig::new(parser_config.encoding_config).await?; + Ok(conf.extract_pks()?.drain(..).map(|c| c.name).collect()) +} diff --git a/src/frontend/src/handler/create_source/external_schema/iceberg.rs b/src/frontend/src/handler/create_source/external_schema/iceberg.rs new file mode 100644 index 0000000000000..5ec0998a2c70e --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema/iceberg.rs @@ -0,0 +1,101 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +pub async fn extract_iceberg_columns( + with_properties: &WithOptionsSecResolved, +) -> anyhow::Result> { + let props = ConnectorProperties::extract(with_properties.clone(), true)?; + if let ConnectorProperties::Iceberg(properties) = props { + let table = properties.load_table_v2().await?; + let iceberg_schema: arrow_schema_iceberg::Schema = + ::iceberg::arrow::schema_to_arrow_schema(table.metadata().current_schema())?; + + let mut columns: Vec = iceberg_schema + .fields() + .iter() + .enumerate() + .map(|(i, field)| { + let column_desc = ColumnDesc::named( + field.name(), + ColumnId::new((i + 1).try_into().unwrap()), + IcebergArrowConvert.type_from_field(field).unwrap(), + ); + ColumnCatalog { + column_desc, + // hide the _row_id column for iceberg engine table + // This column is auto generated when users define a table without primary key + is_hidden: field.name() == ROWID_PREFIX, + } + }) + .collect(); + columns.push(ColumnCatalog::iceberg_sequence_num_column()); + + Ok(columns) + } else { + Err(anyhow!(format!( + "Invalid properties for iceberg source: {:?}", + props + ))) + } +} + +pub async fn check_iceberg_source( + props: &WithOptionsSecResolved, + columns: &[ColumnCatalog], +) -> anyhow::Result<()> { + let props = ConnectorProperties::extract(props.clone(), true)?; + let ConnectorProperties::Iceberg(properties) = props else { + return Err(anyhow!(format!( + "Invalid properties for iceberg source: {:?}", + props + ))); + }; + + let schema = Schema { + fields: columns + .iter() + .filter(|&c| c.column_desc.name != ICEBERG_SEQUENCE_NUM_COLUMN_NAME) + .cloned() + .map(|c| c.column_desc.into()) + .collect(), + }; + + let table = properties.load_table_v2().await?; + + let iceberg_schema = + ::iceberg::arrow::schema_to_arrow_schema(table.metadata().current_schema())?; + + for f1 in schema.fields() { + if !iceberg_schema.fields.iter().any(|f2| f2.name() == &f1.name) { + return Err(anyhow::anyhow!(format!( + "Column {} not found in iceberg table", + f1.name + ))); + } + } + + let new_iceberg_field = iceberg_schema + .fields + .iter() + .filter(|f1| schema.fields.iter().any(|f2| f1.name() == &f2.name)) + .cloned() + .collect::>(); + let new_iceberg_schema = arrow_schema_iceberg::Schema::new(new_iceberg_field); + + risingwave_connector::sink::iceberg::try_matches_arrow_schema(&schema, &new_iceberg_schema)?; + + Ok(()) +} diff --git a/src/frontend/src/handler/create_source/external_schema/json.rs b/src/frontend/src/handler/create_source/external_schema/json.rs new file mode 100644 index 0000000000000..1dc64f669b2cf --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema/json.rs @@ -0,0 +1,70 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +pub fn json_schema_infer_use_schema_registry(schema_config: &Option<(AstString, bool)>) -> bool { + match schema_config { + None => false, + Some((_, use_registry)) => *use_registry, + } +} + +/// Map a JSON schema to a relational schema +pub async fn extract_json_table_schema( + schema_config: &Option<(AstString, bool)>, + with_properties: &BTreeMap, + format_encode_options: &mut BTreeMap, +) -> Result>> { + match schema_config { + None => Ok(None), + Some((schema_location, use_schema_registry)) => { + let schema_registry_auth = use_schema_registry.then(|| { + let auth = SchemaRegistryAuth::from(&*format_encode_options); + try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_USERNAME); + try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_PASSWORD); + auth + }); + Ok(Some( + fetch_json_schema_and_map_to_columns( + &schema_location.0, + schema_registry_auth, + with_properties, + ) + .await? + .into_iter() + .map(|col| ColumnCatalog { + column_desc: col.into(), + is_hidden: false, + }) + .collect_vec(), + )) + } + } +} + +pub fn get_json_schema_location( + format_encode_options: &mut BTreeMap, +) -> Result> { + let schema_location = try_consume_string_from_options(format_encode_options, "schema.location"); + let schema_registry = try_consume_string_from_options(format_encode_options, "schema.registry"); + match (schema_location, schema_registry) { + (None, None) => Ok(None), + (None, Some(schema_registry)) => Ok(Some((schema_registry, true))), + (Some(schema_location), None) => Ok(Some((schema_location, false))), + (Some(_), Some(_)) => Err(RwError::from(ProtocolError( + "only need either the schema location or the schema registry".to_owned(), + ))), + } +} diff --git a/src/frontend/src/handler/create_source/external_schema/nexmark.rs b/src/frontend/src/handler/create_source/external_schema/nexmark.rs new file mode 100644 index 0000000000000..0d08424f5fa7f --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema/nexmark.rs @@ -0,0 +1,69 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +pub fn check_nexmark_schema( + props: &WithOptionsSecResolved, + row_id_index: Option, + columns: &[ColumnCatalog], +) -> Result<()> { + let table_type = props + .get("nexmark.table.type") + .map(|t| t.to_ascii_lowercase()); + + let event_type = match table_type.as_deref() { + None => None, + Some("bid") => Some(EventType::Bid), + Some("auction") => Some(EventType::Auction), + Some("person") => Some(EventType::Person), + Some(t) => { + return Err(RwError::from(ProtocolError(format!( + "unsupported table type for nexmark source: {}", + t + )))) + } + }; + + // Ignore the generated columns and map the index of row_id column. + let user_defined_columns = columns.iter().filter(|c| !c.is_generated()); + let row_id_index = if let Some(index) = row_id_index { + let col_id = columns[index].column_id(); + user_defined_columns + .clone() + .position(|c| c.column_id() == col_id) + .unwrap() + .into() + } else { + None + }; + + let expected = get_event_data_types_with_names(event_type, row_id_index); + let user_defined = user_defined_columns + .map(|c| { + ( + c.column_desc.name.to_ascii_lowercase(), + c.column_desc.data_type.to_owned(), + ) + }) + .collect_vec(); + + if expected != user_defined { + let cmp = pretty_assertions::Comparison::new(&expected, &user_defined); + return Err(RwError::from(ProtocolError(format!( + "The schema of the nexmark source must specify all columns in order:\n{cmp}", + )))); + } + Ok(()) +} diff --git a/src/frontend/src/handler/create_source/external_schema/protobuf.rs b/src/frontend/src/handler/create_source/external_schema/protobuf.rs new file mode 100644 index 0000000000000..7d3f8d2692c27 --- /dev/null +++ b/src/frontend/src/handler/create_source/external_schema/protobuf.rs @@ -0,0 +1,48 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +/// Map a protobuf schema to a relational schema. +pub async fn extract_protobuf_table_schema( + schema: &ProtobufSchema, + with_properties: &WithOptionsSecResolved, + format_encode_options: &mut BTreeMap, +) -> Result> { + let info = StreamSourceInfo { + proto_message_name: schema.message_name.0.clone(), + row_schema_location: schema.row_schema_location.0.clone(), + use_schema_registry: schema.use_schema_registry, + format: FormatType::Plain.into(), + row_encode: EncodeType::Protobuf.into(), + format_encode_options: format_encode_options.clone(), + ..Default::default() + }; + let parser_config = SpecificParserConfig::new(&info, with_properties)?; + try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_USERNAME); + try_consume_string_from_options(format_encode_options, SCHEMA_REGISTRY_PASSWORD); + consume_aws_config_from_options(format_encode_options); + + let conf = ProtobufParserConfig::new(parser_config.encoding_config).await?; + + let column_descs = conf.map_to_columns()?; + + Ok(column_descs + .into_iter() + .map(|col| ColumnCatalog { + column_desc: col.into(), + is_hidden: false, + }) + .collect_vec()) +} diff --git a/src/frontend/src/handler/create_source/validate.rs b/src/frontend/src/handler/create_source/validate.rs new file mode 100644 index 0000000000000..f4bd50c342466 --- /dev/null +++ b/src/frontend/src/handler/create_source/validate.rs @@ -0,0 +1,246 @@ +// Copyright 2024 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +pub static ALLOWED_CONNECTION_CONNECTOR: LazyLock> = + LazyLock::new(|| { + hashset! { + PbConnectionType::Unspecified, + PbConnectionType::Kafka, + PbConnectionType::Iceberg, + } + }); + +pub static ALLOWED_CONNECTION_SCHEMA_REGISTRY: LazyLock> = + LazyLock::new(|| { + hashset! { + PbConnectionType::Unspecified, + PbConnectionType::SchemaRegistry, + } + }); + +// TODO: Better design if we want to support ENCODE KEY where we will have 4 dimensional array +static CONNECTORS_COMPATIBLE_FORMATS: LazyLock>>> = + LazyLock::new(|| { + convert_args!(hashmap!( + KAFKA_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes, Encode::Csv], + Format::Upsert => vec![Encode::Json, Encode::Avro, Encode::Protobuf], + Format::Debezium => vec![Encode::Json, Encode::Avro], + Format::Maxwell => vec![Encode::Json], + Format::Canal => vec![Encode::Json], + Format::DebeziumMongo => vec![Encode::Json], + ), + PULSAR_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes], + Format::Upsert => vec![Encode::Json, Encode::Avro], + Format::Debezium => vec![Encode::Json], + Format::Maxwell => vec![Encode::Json], + Format::Canal => vec![Encode::Json], + ), + KINESIS_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes], + Format::Upsert => vec![Encode::Json, Encode::Avro], + Format::Debezium => vec![Encode::Json], + Format::Maxwell => vec![Encode::Json], + Format::Canal => vec![Encode::Json], + ), + GOOGLE_PUBSUB_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Avro, Encode::Bytes], + Format::Debezium => vec![Encode::Json], + Format::Maxwell => vec![Encode::Json], + Format::Canal => vec![Encode::Json], + ), + NEXMARK_CONNECTOR => hashmap!( + Format::Native => vec![Encode::Native], + Format::Plain => vec![Encode::Bytes], + ), + DATAGEN_CONNECTOR => hashmap!( + Format::Native => vec![Encode::Native], + Format::Plain => vec![Encode::Bytes, Encode::Json], + ), + S3_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Csv, Encode::Json], + ), + OPENDAL_S3_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Csv, Encode::Json, Encode::Parquet], + ), + GCS_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Csv, Encode::Json, Encode::Parquet], + ), + AZBLOB_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Csv, Encode::Json, Encode::Parquet], + ), + POSIX_FS_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Csv], + ), + MYSQL_CDC_CONNECTOR => hashmap!( + Format::Debezium => vec![Encode::Json], + // support source stream job + Format::Plain => vec![Encode::Json], + ), + POSTGRES_CDC_CONNECTOR => hashmap!( + Format::Debezium => vec![Encode::Json], + // support source stream job + Format::Plain => vec![Encode::Json], + ), + CITUS_CDC_CONNECTOR => hashmap!( + Format::Debezium => vec![Encode::Json], + ), + MONGODB_CDC_CONNECTOR => hashmap!( + Format::DebeziumMongo => vec![Encode::Json], + ), + NATS_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json, Encode::Protobuf, Encode::Bytes], + ), + MQTT_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json, Encode::Bytes], + ), + TEST_CONNECTOR => hashmap!( + Format::Plain => vec![Encode::Json], + ), + ICEBERG_CONNECTOR => hashmap!( + Format::None => vec![Encode::None], + ), + SQL_SERVER_CDC_CONNECTOR => hashmap!( + Format::Debezium => vec![Encode::Json], + // support source stream job + Format::Plain => vec![Encode::Json], + ), + )) + }); + +fn validate_license(connector: &str) -> Result<()> { + if connector == SQL_SERVER_CDC_CONNECTOR { + Feature::SqlServerCdcSource + .check_available() + .map_err(|e| anyhow::anyhow!(e))?; + } + Ok(()) +} + +pub fn validate_compatibility( + format_encode: &FormatEncodeOptions, + props: &mut BTreeMap, +) -> Result<()> { + let mut connector = props + .get_connector() + .ok_or_else(|| RwError::from(ProtocolError("missing field 'connector'".to_owned())))?; + + if connector == OPENDAL_S3_CONNECTOR { + // reject s3_v2 creation + return Err(RwError::from(Deprecated( + OPENDAL_S3_CONNECTOR.to_owned(), + S3_CONNECTOR.to_owned(), + ))); + } + if connector == S3_CONNECTOR { + // S3 connector is deprecated, use OPENDAL_S3_CONNECTOR instead + // do s3 -> s3_v2 migration + let entry = props.get_mut(UPSTREAM_SOURCE_KEY).unwrap(); + *entry = OPENDAL_S3_CONNECTOR.to_owned(); + connector = OPENDAL_S3_CONNECTOR.to_owned(); + } + + let compatible_formats = CONNECTORS_COMPATIBLE_FORMATS + .get(&connector) + .ok_or_else(|| { + RwError::from(ProtocolError(format!( + "connector {:?} is not supported, accept {:?}", + connector, + CONNECTORS_COMPATIBLE_FORMATS.keys() + ))) + })?; + + validate_license(&connector)?; + if connector != KAFKA_CONNECTOR { + let res = match (&format_encode.format, &format_encode.row_encode) { + (Format::Plain, Encode::Protobuf) | (Format::Plain, Encode::Avro) => { + let mut options = WithOptions::try_from(format_encode.row_options())?; + let (_, use_schema_registry) = get_schema_location(options.inner_mut())?; + use_schema_registry + } + (Format::Debezium, Encode::Avro) => true, + (_, _) => false, + }; + if res { + return Err(RwError::from(ProtocolError(format!( + "The {} must be kafka when schema registry is used", + UPSTREAM_SOURCE_KEY + )))); + } + } + + let compatible_encodes = compatible_formats + .get(&format_encode.format) + .ok_or_else(|| { + RwError::from(ProtocolError(format!( + "connector {} does not support format {:?}", + connector, format_encode.format + ))) + })?; + if !compatible_encodes.contains(&format_encode.row_encode) { + return Err(RwError::from(ProtocolError(format!( + "connector {} does not support format {:?} with encode {:?}", + connector, format_encode.format, format_encode.row_encode + )))); + } + + if connector == POSTGRES_CDC_CONNECTOR || connector == CITUS_CDC_CONNECTOR { + match props.get("slot.name") { + None => { + // Build a random slot name with UUID + // e.g. "rw_cdc_f9a3567e6dd54bf5900444c8b1c03815" + let uuid = uuid::Uuid::new_v4(); + props.insert("slot.name".into(), format!("rw_cdc_{}", uuid.simple())); + } + Some(slot_name) => { + // please refer to + // - https://github.com/debezium/debezium/blob/97956ce25b7612e3413d363658661896b7d2e0a2/debezium-connector-postgres/src/main/java/io/debezium/connector/postgresql/PostgresConnectorConfig.java#L1179 + // - https://doxygen.postgresql.org/slot_8c.html#afac399f07320b9adfd2c599cf822aaa3 + if !slot_name + .chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_') + || slot_name.len() > 63 + { + return Err(RwError::from(ProtocolError(format!( + "Invalid replication slot name: {:?}. Valid replication slot name must contain only digits, lowercase characters and underscores with length <= 63", + slot_name + )))); + } + } + } + + if !props.contains_key("schema.name") { + // Default schema name is "public" + props.insert("schema.name".into(), "public".into()); + } + if !props.contains_key("publication.name") { + // Default publication name is "rw_publication" + props.insert("publication.name".into(), "rw_publication".into()); + } + if !props.contains_key("publication.create.enable") { + // Default auto create publication if doesn't exist + props.insert("publication.create.enable".into(), "true".into()); + } + } + + if connector == SQL_SERVER_CDC_CONNECTOR && !props.contains_key("schema.name") { + // Default schema name is "dbo" + props.insert("schema.name".into(), "dbo".into()); + } + + Ok(()) +} diff --git a/src/frontend/src/optimizer/plan_node/stream_cdc_table_scan.rs b/src/frontend/src/optimizer/plan_node/stream_cdc_table_scan.rs index bce2825f39645..7ab2912bd1118 100644 --- a/src/frontend/src/optimizer/plan_node/stream_cdc_table_scan.rs +++ b/src/frontend/src/optimizer/plan_node/stream_cdc_table_scan.rs @@ -17,6 +17,7 @@ use pretty_xmlish::{Pretty, XmlNode}; use risingwave_common::catalog::Field; use risingwave_common::types::DataType; use risingwave_common::util::sort_util::OrderType; +use risingwave_connector::parser::debezium_cdc_source_schema; use risingwave_pb::stream_plan::stream_node::PbNodeBody; use risingwave_pb::stream_plan::PbStreamNode; @@ -25,7 +26,6 @@ use super::utils::{childless_record, Distill}; use super::{generic, ExprRewritable, PlanBase, PlanRef, StreamNode}; use crate::catalog::ColumnId; use crate::expr::{Expr, ExprImpl, ExprRewriter, ExprType, ExprVisitor, FunctionCall, InputRef}; -use crate::handler::create_source::debezium_cdc_source_schema; use crate::optimizer::plan_node::expr_visitable::ExprVisitable; use crate::optimizer::plan_node::utils::{IndicesDisplay, TableCatalogBuilder}; use crate::optimizer::property::{Distribution, DistributionDisplay}; From 594ef891d30067f99d48ce5e515a3134a1df7f9a Mon Sep 17 00:00:00 2001 From: xxchan Date: Mon, 16 Dec 2024 12:39:05 +0800 Subject: [PATCH 6/8] feat: replace icelake Transform with iceberg (#18625) Signed-off-by: xxchan --- Cargo.lock | 9 +- Cargo.toml | 6 +- .../connector_common/iceberg/mock_catalog.rs | 250 ++++++++++++++++++ .../src/connector_common/iceberg/mod.rs | 1 + src/connector/src/sink/iceberg/mod.rs | 8 + src/expr/impl/Cargo.toml | 2 +- src/expr/impl/src/scalar/external/iceberg.rs | 27 +- src/frontend/Cargo.toml | 1 - src/frontend/src/handler/create_sink.rs | 62 ++--- .../src/optimizer/plan_node/stream_sink.rs | 5 +- 10 files changed, 314 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 69cb010feae83..b242d74d2e06f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6305,7 +6305,7 @@ dependencies = [ [[package]] name = "iceberg" version = "0.3.0" -source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=e28726443a57028f7c7df11d6d385470dc484d46#e28726443a57028f7c7df11d6d385470dc484d46" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=53f786fb2141b51d10a173cbcb5595edd5aa52a6#53f786fb2141b51d10a173cbcb5595edd5aa52a6" dependencies = [ "anyhow", "apache-avro 0.17.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -6352,7 +6352,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-glue" version = "0.3.0" -source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=e28726443a57028f7c7df11d6d385470dc484d46#e28726443a57028f7c7df11d6d385470dc484d46" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=53f786fb2141b51d10a173cbcb5595edd5aa52a6#53f786fb2141b51d10a173cbcb5595edd5aa52a6" dependencies = [ "anyhow", "async-trait", @@ -6369,7 +6369,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-rest" version = "0.3.0" -source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=e28726443a57028f7c7df11d6d385470dc484d46#e28726443a57028f7c7df11d6d385470dc484d46" +source = "git+https://github.com/risingwavelabs/iceberg-rust.git?rev=53f786fb2141b51d10a173cbcb5595edd5aa52a6#53f786fb2141b51d10a173cbcb5595edd5aa52a6" dependencies = [ "async-trait", "chrono", @@ -11358,7 +11358,7 @@ dependencies = [ "ginepro", "hex", "hmac", - "icelake", + "iceberg", "itertools 0.13.0", "jsonbb", "linkme", @@ -11429,7 +11429,6 @@ dependencies = [ "futures-async-stream", "iana-time-zone", "iceberg", - "icelake", "itertools 0.13.0", "jsonbb", "linkme", diff --git a/Cargo.toml b/Cargo.toml index e1956166592c7..0644def23d5c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,9 +143,9 @@ icelake = { git = "https://github.com/risingwavelabs/icelake.git", rev = "0ec44f "prometheus", ] } # branch dev-rebase-main-20241030 -iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "e28726443a57028f7c7df11d6d385470dc484d46" } -iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "e28726443a57028f7c7df11d6d385470dc484d46" } -iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "e28726443a57028f7c7df11d6d385470dc484d46" } +iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "53f786fb2141b51d10a173cbcb5595edd5aa52a6" } +iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "53f786fb2141b51d10a173cbcb5595edd5aa52a6" } +iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "53f786fb2141b51d10a173cbcb5595edd5aa52a6" } opendal = "0.49" # used only by arrow-udf-flight arrow-flight = "53" diff --git a/src/connector/src/connector_common/iceberg/mock_catalog.rs b/src/connector/src/connector_common/iceberg/mock_catalog.rs index 1e4910d9ff5c6..b2723cf40dfbb 100644 --- a/src/connector/src/connector_common/iceberg/mock_catalog.rs +++ b/src/connector/src/connector_common/iceberg/mock_catalog.rs @@ -23,6 +23,7 @@ use opendal::services::Memory; use opendal::Operator; /// A mock catalog for iceberg used for plan test. +#[derive(Debug)] pub struct MockCatalog; impl MockCatalog { @@ -233,3 +234,252 @@ impl Catalog for MockCatalog { unimplemented!() } } + +mod v2 { + use std::collections::HashMap; + + use async_trait::async_trait; + use iceberg::io::FileIO; + use iceberg::spec::{ + NestedField, PrimitiveType, Schema, TableMetadataBuilder, Transform, Type, + UnboundPartitionField, UnboundPartitionSpec, + }; + use iceberg::table::Table as TableV2; + use iceberg::{ + Catalog as CatalogV2, Namespace, NamespaceIdent, TableCommit, TableCreation, TableIdent, + }; + + use super::MockCatalog; + + impl MockCatalog { + fn build_table_v2( + name: &str, + schema: Schema, + partition_spec: UnboundPartitionSpec, + ) -> TableV2 { + let file_io = FileIO::from_path("memory://").unwrap().build().unwrap(); + let table_creation = TableCreation { + name: "ignore".to_owned(), + location: Some("1".to_owned()), + schema, + partition_spec: Some(partition_spec), + sort_order: None, + properties: HashMap::new(), + }; + TableV2::builder() + .identifier(TableIdent::new( + NamespaceIdent::new("mock_namespace".to_owned()), + name.to_owned(), + )) + .file_io(file_io) + .metadata( + TableMetadataBuilder::from_table_creation(table_creation) + .unwrap() + .build() + .unwrap(), + ) + .build() + .unwrap() + } + + fn sparse_table_v2() -> TableV2 { + Self::build_table_v2( + Self::SPARSE_TABLE, + Schema::builder() + .with_fields(vec![ + NestedField::new(1, "v1", Type::Primitive(PrimitiveType::Int), true).into(), + NestedField::new(2, "v2", Type::Primitive(PrimitiveType::Long), true) + .into(), + NestedField::new(3, "v3", Type::Primitive(PrimitiveType::String), true) + .into(), + NestedField::new(4, "v4", Type::Primitive(PrimitiveType::Time), true) + .into(), + ]) + .build() + .unwrap(), + UnboundPartitionSpec::builder() + .with_spec_id(1) + .add_partition_fields(vec![ + UnboundPartitionField { + source_id: 1, + field_id: Some(5), + name: "f1".to_owned(), + transform: Transform::Identity, + }, + UnboundPartitionField { + source_id: 2, + field_id: Some(6), + name: "f2".to_owned(), + transform: Transform::Bucket(1), + }, + UnboundPartitionField { + source_id: 3, + field_id: Some(7), + name: "f3".to_owned(), + transform: Transform::Truncate(1), + }, + UnboundPartitionField { + source_id: 4, + field_id: Some(8), + name: "f4".to_owned(), + transform: Transform::Void, + }, + ]) + .unwrap() + .build(), + ) + } + + fn range_table_v2() -> TableV2 { + Self::build_table_v2( + Self::RANGE_TABLE, + Schema::builder() + .with_fields(vec![ + NestedField::new(1, "v1", Type::Primitive(PrimitiveType::Date), true) + .into(), + NestedField::new(2, "v2", Type::Primitive(PrimitiveType::Timestamp), true) + .into(), + NestedField::new( + 3, + "v3", + Type::Primitive(PrimitiveType::Timestamptz), + true, + ) + .into(), + NestedField::new( + 4, + "v4", + Type::Primitive(PrimitiveType::Timestamptz), + true, + ) + .into(), + ]) + .build() + .unwrap(), + UnboundPartitionSpec::builder() + .with_spec_id(1) + .add_partition_fields(vec![ + UnboundPartitionField { + source_id: 1, + field_id: Some(5), + name: "f1".to_owned(), + transform: Transform::Year, + }, + UnboundPartitionField { + source_id: 2, + field_id: Some(6), + name: "f2".to_owned(), + transform: Transform::Month, + }, + UnboundPartitionField { + source_id: 3, + field_id: Some(7), + name: "f3".to_owned(), + transform: Transform::Day, + }, + UnboundPartitionField { + source_id: 4, + field_id: Some(8), + name: "f4".to_owned(), + transform: Transform::Hour, + }, + ]) + .unwrap() + .build(), + ) + } + } + + #[async_trait] + impl CatalogV2 for MockCatalog { + /// List namespaces from table. + async fn list_namespaces( + &self, + _parent: Option<&NamespaceIdent>, + ) -> iceberg::Result> { + todo!() + } + + /// Create a new namespace inside the catalog. + async fn create_namespace( + &self, + _namespace: &iceberg::NamespaceIdent, + _properties: HashMap, + ) -> iceberg::Result { + todo!() + } + + /// Get a namespace information from the catalog. + async fn get_namespace(&self, _namespace: &NamespaceIdent) -> iceberg::Result { + todo!() + } + + /// Check if namespace exists in catalog. + async fn namespace_exists(&self, _namespace: &NamespaceIdent) -> iceberg::Result { + todo!() + } + + /// Drop a namespace from the catalog. + async fn drop_namespace(&self, _namespace: &NamespaceIdent) -> iceberg::Result<()> { + todo!() + } + + /// List tables from namespace. + async fn list_tables( + &self, + _namespace: &NamespaceIdent, + ) -> iceberg::Result> { + todo!() + } + + async fn update_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> iceberg::Result<()> { + todo!() + } + + /// Create a new table inside the namespace. + async fn create_table( + &self, + _namespace: &NamespaceIdent, + _creation: TableCreation, + ) -> iceberg::Result { + todo!() + } + + /// Load table from the catalog. + async fn load_table(&self, table: &TableIdent) -> iceberg::Result { + match table.name.as_ref() { + Self::SPARSE_TABLE => Ok(Self::sparse_table_v2()), + Self::RANGE_TABLE => Ok(Self::range_table_v2()), + _ => unimplemented!("table {} not found", table.name()), + } + } + + /// Drop a table from the catalog. + async fn drop_table(&self, _table: &TableIdent) -> iceberg::Result<()> { + todo!() + } + + /// Check if a table exists in the catalog. + async fn table_exists(&self, table: &TableIdent) -> iceberg::Result { + match table.name.as_ref() { + Self::SPARSE_TABLE => Ok(true), + Self::RANGE_TABLE => Ok(true), + _ => Ok(false), + } + } + + /// Rename a table in the catalog. + async fn rename_table(&self, _src: &TableIdent, _dest: &TableIdent) -> iceberg::Result<()> { + todo!() + } + + /// Update a table to the catalog. + async fn update_table(&self, _commit: TableCommit) -> iceberg::Result { + todo!() + } + } +} diff --git a/src/connector/src/connector_common/iceberg/mod.rs b/src/connector/src/connector_common/iceberg/mod.rs index d945e6449958b..c1a80a4d4fcb5 100644 --- a/src/connector/src/connector_common/iceberg/mod.rs +++ b/src/connector/src/connector_common/iceberg/mod.rs @@ -630,6 +630,7 @@ mod v2 { java_catalog_props, ) } + "mock" => Ok(Arc::new(mock_catalog::MockCatalog {})), _ => { bail!( "Unsupported catalog type: {}, only support `storage`, `rest`, `hive`, `jdbc`, `glue`", diff --git a/src/connector/src/sink/iceberg/mod.rs b/src/connector/src/sink/iceberg/mod.rs index 3679cd5ab298f..8242144918930 100644 --- a/src/connector/src/sink/iceberg/mod.rs +++ b/src/connector/src/sink/iceberg/mod.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use anyhow::{anyhow, Context}; use async_trait::async_trait; +use iceberg::table::Table as TableV2; use iceberg::{Catalog as CatalogV2, NamespaceIdent, TableCreation, TableIdent}; use icelake::catalog::CatalogRef; use icelake::io_v2::input_wrapper::{DeltaWriter, RecordBatchWriter}; @@ -185,6 +186,13 @@ impl IcebergConfig { .map_err(Into::into) } + pub async fn load_table_v2(&self) -> Result { + self.common + .load_table_v2(&self.java_catalog_props) + .await + .map_err(Into::into) + } + pub fn full_table_name_v2(&self) -> Result { self.common.full_table_name_v2().map_err(Into::into) } diff --git a/src/expr/impl/Cargo.toml b/src/expr/impl/Cargo.toml index 87b27a9e670de..c7b1974d8699a 100644 --- a/src/expr/impl/Cargo.toml +++ b/src/expr/impl/Cargo.toml @@ -45,7 +45,7 @@ futures-util = "0.3" ginepro = "0.8" hex = "0.4" hmac = "0.12" -icelake = { workspace = true } +iceberg = { workspace = true } itertools = { workspace = true } jsonbb = { workspace = true } linkme = { workspace = true } diff --git a/src/expr/impl/src/scalar/external/iceberg.rs b/src/expr/impl/src/scalar/external/iceberg.rs index c6881c5228e43..797a87510623e 100644 --- a/src/expr/impl/src/scalar/external/iceberg.rs +++ b/src/expr/impl/src/scalar/external/iceberg.rs @@ -19,9 +19,8 @@ use std::str::FromStr; use std::sync::Arc; use anyhow::anyhow; -use icelake::types::{ - create_transform_function, Any as IcelakeDataType, BoxedTransformFunction, Transform, -}; +use iceberg::spec::{PrimitiveType, Transform, Type as IcebergType}; +use iceberg::transform::{create_transform_function, BoxedTransformFunction}; use risingwave_common::array::arrow::{arrow_schema_iceberg, IcebergArrowConvert}; use risingwave_common::array::{ArrayRef, DataChunk}; use risingwave_common::ensure; @@ -81,7 +80,7 @@ fn build(return_type: DataType, mut children: Vec) -> Result) -> Result) -> Result) -> Result + (matches!(transform_type, Transform::Day) && matches!(actual_res_type, IcebergType::Primitive(PrimitiveType::Int))), ExprError::InvalidParam { name: "return type in iceberg_transform", reason: format!( - "Expect return type {:?} but got {:?}", - expect_res_type, actual_res_type + "Expect return type {:?} but got {:?}, RisingWave return type is {:?}, input type is {:?}, transform type is {:?}", + expect_res_type, + actual_res_type, + return_type, + (input_type, input_arrow_type), + transform_type ) .into() } diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 9dfb31a141cc2..34964e4c3438f 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -39,7 +39,6 @@ futures = { version = "0.3", default-features = false, features = ["alloc"] } futures-async-stream = { workspace = true } iana-time-zone = "0.1" iceberg = { workspace = true } -icelake = { workspace = true } itertools = { workspace = true } jsonbb = { workspace = true } linkme = { workspace = true } diff --git a/src/frontend/src/handler/create_sink.rs b/src/frontend/src/handler/create_sink.rs index c4d6793444104..c89050b331d3f 100644 --- a/src/frontend/src/handler/create_sink.rs +++ b/src/frontend/src/handler/create_sink.rs @@ -379,62 +379,56 @@ async fn get_partition_compute_info_for_iceberg( if _iceberg_config.create_table_if_not_exists { return Ok(None); } - let table = _iceberg_config.load_table().await?; - let Some(partition_spec) = table.current_table_metadata().current_partition_spec().ok() - else { - return Ok(None); - }; + let table = _iceberg_config.load_table_v2().await?; + let partition_spec = table.metadata().default_partition_spec(); if partition_spec.is_unpartitioned() { return Ok(None); } + use iceberg::spec::Transform; // Separate the partition spec into two parts: sparse partition and range partition. // Sparse partition means that the data distribution is more sparse at a given time. // Range partition means that the data distribution is likely same at a given time. // Only compute the partition and shuffle by them for the sparse partition. - let has_sparse_partition = partition_spec.fields.iter().any(|f| match f.transform { + let has_sparse_partition = partition_spec.fields().iter().any(|f| match f.transform { // Sparse partition - icelake::types::Transform::Identity - | icelake::types::Transform::Truncate(_) - | icelake::types::Transform::Bucket(_) => true, + Transform::Identity | Transform::Truncate(_) | Transform::Bucket(_) => true, // Range partition - icelake::types::Transform::Year - | icelake::types::Transform::Month - | icelake::types::Transform::Day - | icelake::types::Transform::Hour - | icelake::types::Transform::Void => false, + Transform::Year + | Transform::Month + | Transform::Day + | Transform::Hour + | Transform::Void => false, + // unknown + Transform::Unknown => false, }); if !has_sparse_partition { return Ok(None); } - let arrow_type: ArrowDataType = table - .current_partition_type() - .map_err(|err| RwError::from(ErrorCode::SinkError(err.into())))? - .try_into() + let schema = table.metadata().current_schema(); + let partition_type = partition_spec + .partition_type(schema) + .map_err(|err| RwError::from(ErrorCode::SinkError(err.into())))?; + let arrow_type: ArrowDataType = iceberg::arrow::type_to_arrow_type(&partition_type.into()) .map_err(|_| { RwError::from(ErrorCode::SinkError( "Fail to convert iceberg partition type to arrow type".into(), )) })?; - let Some(schema) = table.current_table_metadata().current_schema().ok() else { - return Ok(None); - }; - let partition_fields = partition_spec - .fields - .iter() - .map(|f| { - let source_f = - schema - .look_up_field_by_id(f.source_column_id) - .ok_or(RwError::from(ErrorCode::SinkError( - "Fail to look up iceberg partition field".into(), - )))?; - Ok((source_f.name.clone(), f.transform)) - }) - .collect::>>()?; + let partition_fields = + partition_spec + .fields() + .iter() + .map(|f| { + let source_f = schema.field_by_id(f.source_id).ok_or(RwError::from( + ErrorCode::SinkError("Fail to look up iceberg partition field".into()), + ))?; + Ok((source_f.name.clone(), f.transform)) + }) + .collect::>>()?; let ArrowDataType::Struct(partition_type) = arrow_type else { return Err(RwError::from(ErrorCode::SinkError( diff --git a/src/frontend/src/optimizer/plan_node/stream_sink.rs b/src/frontend/src/optimizer/plan_node/stream_sink.rs index 2023b7936439a..61572fe179005 100644 --- a/src/frontend/src/optimizer/plan_node/stream_sink.rs +++ b/src/frontend/src/optimizer/plan_node/stream_sink.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use anyhow::anyhow; use fixedbitset::FixedBitSet; -use icelake::types::Transform; +use iceberg::spec::Transform; use itertools::Itertools; use pretty_xmlish::{Pretty, XmlNode}; use risingwave_common::catalog::{ColumnCatalog, CreateType}; @@ -612,13 +612,12 @@ impl ExprVisitable for StreamSink {} #[cfg(test)] mod test { - use icelake::types::Transform; use risingwave_common::catalog::{ColumnCatalog, ColumnDesc, ColumnId}; use risingwave_common::types::{DataType, StructType}; use risingwave_common::util::iter_util::ZipEqDebug; use risingwave_pb::expr::expr_node::Type; - use super::IcebergPartitionInfo; + use super::{IcebergPartitionInfo, *}; use crate::expr::{Expr, ExprImpl}; fn create_column_catalog() -> Vec { From 0b1c0e70b7f5b318471557d157d9b7b94d6d013f Mon Sep 17 00:00:00 2001 From: xxchan Date: Mon, 16 Dec 2024 15:22:02 +0800 Subject: [PATCH 7/8] fix: session variable should not affect shared source (#19807) Signed-off-by: xxchan --- .../kafka/protobuf/alter_source_shared.slt | 19 +++++-------------- .../src/handler/alter_source_with_sr.rs | 2 +- src/frontend/src/handler/create_source.rs | 15 +++++++++++++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt b/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt index 5301eda7679b1..658d4fa95c6a0 100644 --- a/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt +++ b/e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt @@ -26,30 +26,21 @@ CREATE MATERIALIZED VIEW mv_user AS SELECT * FROM src_user; statement ok CREATE MATERIALIZED VIEW mv_user_2 AS SELECT * FROM src_user; -statement ok -CREATE TABLE t_user WITH ( - ${RISEDEV_KAFKA_WITH_OPTIONS_COMMON}, - topic = 'pb_alter_source_shared_test', - scan.startup.mode = 'earliest' -) -FORMAT PLAIN ENCODE PROTOBUF( - schema.registry = '${RISEDEV_SCHEMA_REGISTRY_URL}', - message = 'test.User' -); - # age is new field statement error SELECT age FROM mv_user; -statement error -SELECT age FROM t_user; - # Push more events with extended fields system ok python3 e2e_test/source_inline/kafka/protobuf/pb.py "${RISEDEV_KAFKA_BOOTSTRAP_SERVERS}" "${RISEDEV_SCHEMA_REGISTRY_URL}" "pb_alter_source_shared_test" 5 user_with_more_fields sleep 5s + +# set session variable shouldn't affect existing source. +statement ok +set streaming_use_shared_source to false; + # Refresh source schema statement ok ALTER SOURCE src_user REFRESH SCHEMA; diff --git a/src/frontend/src/handler/alter_source_with_sr.rs b/src/frontend/src/handler/alter_source_with_sr.rs index 05548351492b9..196d4a7eaf39e 100644 --- a/src/frontend/src/handler/alter_source_with_sr.rs +++ b/src/frontend/src/handler/alter_source_with_sr.rs @@ -168,7 +168,7 @@ pub async fn refresh_sr_and_get_columns_diff( session, format_encode, Either::Right(&with_properties), - CreateSourceType::from_with_properties(session, &with_properties), + CreateSourceType::for_replace(original_source), ) .await? else { diff --git a/src/frontend/src/handler/create_source.rs b/src/frontend/src/handler/create_source.rs index 2981a96423edf..91abf9acf6c99 100644 --- a/src/frontend/src/handler/create_source.rs +++ b/src/frontend/src/handler/create_source.rs @@ -152,7 +152,8 @@ pub enum CreateSourceType { } impl CreateSourceType { - pub fn from_with_properties( + /// Note: shouldn't be used for `ALTER SOURCE`, since session variables should not affect existing source. We should respect the original type instead. + pub fn for_newly_created( session: &SessionImpl, with_properties: &impl WithPropertiesExt, ) -> Self { @@ -172,6 +173,16 @@ impl CreateSourceType { } } + pub fn for_replace(catalog: &SourceCatalog) -> Self { + if !catalog.info.is_shared() { + CreateSourceType::NonShared + } else if catalog.with_properties.is_shareable_cdc_connector() { + CreateSourceType::SharedCdc + } else { + CreateSourceType::SharedNonCdc + } + } + pub fn is_shared(&self) -> bool { matches!( self, @@ -818,7 +829,7 @@ pub async fn handle_create_source( let format_encode = stmt.format_encode.into_v2_with_warning(); let with_properties = bind_connector_props(&handler_args, &format_encode, true)?; - let create_source_type = CreateSourceType::from_with_properties(&session, &*with_properties); + let create_source_type = CreateSourceType::for_newly_created(&session, &*with_properties); let (columns_from_resolve_source, source_info) = bind_columns_from_source( &session, &format_encode, From 492e2d7d03ccfb530692649503a3e74f8624844c Mon Sep 17 00:00:00 2001 From: Shanicky Chen Date: Mon, 16 Dec 2024 16:47:13 +0800 Subject: [PATCH 8/8] fix: add mview_definition in scaled actors (#19784) Signed-off-by: Shanicky Chen --- src/meta/src/controller/scale.rs | 88 ++++++++++++++++++++++++++++++-- src/meta/src/stream/scale.rs | 8 +-- 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/src/meta/src/controller/scale.rs b/src/meta/src/controller/scale.rs index 65dd58ff1d34e..84c743e437d17 100644 --- a/src/meta/src/controller/scale.rs +++ b/src/meta/src/controller/scale.rs @@ -22,10 +22,12 @@ use risingwave_connector::source::{SplitImpl, SplitMetaData}; use risingwave_meta_model::actor::ActorStatus; use risingwave_meta_model::actor_dispatcher::DispatcherType; use risingwave_meta_model::fragment::DistributionType; -use risingwave_meta_model::prelude::{Actor, ActorDispatcher, Fragment, StreamingJob}; +use risingwave_meta_model::prelude::{ + Actor, ActorDispatcher, Fragment, Sink, Source, StreamingJob, Table, +}; use risingwave_meta_model::{ - actor, actor_dispatcher, fragment, streaming_job, ActorId, ActorMapping, ActorUpstreamActors, - ConnectorSplits, FragmentId, I32Array, ObjectId, VnodeBitmap, + actor, actor_dispatcher, fragment, sink, source, streaming_job, table, ActorId, ActorMapping, + ActorUpstreamActors, ConnectorSplits, FragmentId, I32Array, ObjectId, VnodeBitmap, }; use risingwave_meta_model_migration::{ Alias, CommonTableExpression, Expr, IntoColumnRef, QueryStatementBuilder, SelectStatement, @@ -165,7 +167,7 @@ pub struct RescheduleWorkingSet { pub fragment_downstreams: HashMap>, pub fragment_upstreams: HashMap>, - pub related_jobs: HashMap, + pub related_jobs: HashMap, } async fn resolve_no_shuffle_query( @@ -192,6 +194,67 @@ where Ok(result) } +async fn resolve_streaming_job_definition( + txn: &C, + job_ids: &HashSet, +) -> MetaResult> +where + C: ConnectionTrait, +{ + let job_ids = job_ids.iter().cloned().collect_vec(); + + // including table, materialized view, index + let common_job_definitions: Vec<(ObjectId, String)> = Table::find() + .select_only() + .columns([ + table::Column::TableId, + #[cfg(not(debug_assertions))] + table::Column::Name, + #[cfg(debug_assertions)] + table::Column::Definition, + ]) + .filter(table::Column::TableId.is_in(job_ids.clone())) + .into_tuple() + .all(txn) + .await?; + + let sink_definitions: Vec<(ObjectId, String)> = Sink::find() + .select_only() + .columns([ + sink::Column::SinkId, + #[cfg(not(debug_assertions))] + sink::Column::Name, + #[cfg(debug_assertions)] + sink::Column::Definition, + ]) + .filter(sink::Column::SinkId.is_in(job_ids.clone())) + .into_tuple() + .all(txn) + .await?; + + let source_definitions: Vec<(ObjectId, String)> = Source::find() + .select_only() + .columns([ + source::Column::SourceId, + #[cfg(not(debug_assertions))] + source::Column::Name, + #[cfg(debug_assertions)] + source::Column::Definition, + ]) + .filter(source::Column::SourceId.is_in(job_ids.clone())) + .into_tuple() + .all(txn) + .await?; + + let definitions: HashMap = common_job_definitions + .into_iter() + .chain(sink_definitions.into_iter()) + .chain(source_definitions.into_iter()) + .collect(); + + Ok(definitions) +} + impl CatalogController { pub async fn resolve_working_set_for_reschedule_fragments( &self, @@ -339,6 +402,9 @@ impl CatalogController { let related_job_ids: HashSet<_> = fragments.values().map(|fragment| fragment.job_id).collect(); + let related_job_definitions = + resolve_streaming_job_definition(txn, &related_job_ids).await?; + let related_jobs = StreamingJob::find() .filter(streaming_job::Column::JobId.is_in(related_job_ids)) .all(txn) @@ -346,7 +412,19 @@ impl CatalogController { let related_jobs = related_jobs .into_iter() - .map(|job| (job.job_id, job)) + .map(|job| { + let job_id = job.job_id; + ( + job_id, + ( + job, + related_job_definitions + .get(&job_id) + .cloned() + .unwrap_or("".to_owned()), + ), + ) + }) .collect(); Ok(RescheduleWorkingSet { diff --git a/src/meta/src/stream/scale.rs b/src/meta/src/stream/scale.rs index 2da8c0fae3e1b..578ee101d0f27 100644 --- a/src/meta/src/stream/scale.rs +++ b/src/meta/src/stream/scale.rs @@ -590,6 +590,9 @@ impl ScaleController { vnode_bitmap, } = actors.first().unwrap().clone(); + let (related_job, job_definition) = + related_jobs.get(&job_id).expect("job not found"); + let fragment = CustomFragmentInfo { fragment_id: fragment_id as _, fragment_type_mask: fragment_type_mask as _, @@ -603,8 +606,7 @@ impl ScaleController { dispatcher, upstream_actor_id, vnode_bitmap: vnode_bitmap.map(|b| b.to_protobuf()), - // todo, we need to fill this part - mview_definition: "".to_owned(), + mview_definition: job_definition.to_owned(), expr_context: expr_contexts .get(&actor_id) .cloned() @@ -617,8 +619,6 @@ impl ScaleController { fragment_to_table.insert(fragment_id as _, TableId::from(job_id as u32)); - let related_job = related_jobs.get(&job_id).expect("job not found"); - fragment_state.insert( fragment_id, table_fragments::PbState::from(related_job.job_status),