From 65c2f19365740f5b165f5fc988b79c4f8fba3576 Mon Sep 17 00:00:00 2001 From: azexcy <13588031592@qq.com> Date: Sun, 29 Oct 2023 20:14:21 +0800 Subject: [PATCH 1/9] Add CDC document of build and usage --- .../shardingsphere-proxy/cdc/_index.cn.md | 8 + .../shardingsphere-proxy/cdc/_index.en.md | 8 + .../shardingsphere-proxy/cdc/build.cn.md | 270 ++++++++++++ .../shardingsphere-proxy/cdc/build.en.md | 7 + .../shardingsphere-proxy/cdc/usage.cn.md | 393 ++++++++++++++++++ .../shardingsphere-proxy/cdc/usage.en.md | 6 + 6 files changed, 692 insertions(+) create mode 100644 docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md create mode 100644 docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md create mode 100644 docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md create mode 100644 docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md create mode 100644 docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md create mode 100644 docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md new file mode 100644 index 0000000000000..2242d67887914 --- /dev/null +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md @@ -0,0 +1,8 @@ ++++ +title = "CDC" +weight = 9 ++++ + +## 简介 + +用户可以通过 ShardingSphere 的 CDC 功能进行数据同步,也可用于 ETL,目前支持 openGauss、MySQL 和 PostgreSQL diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md new file mode 100644 index 0000000000000..f27d81dad8214 --- /dev/null +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md @@ -0,0 +1,8 @@ ++++ +title = "CDC" +weight = 9 ++++ + +## Introduction + +Users can synchronize data through ShardingSphere's CDC feature, which can 
also be used for ETL and currently supports openGauss, MySQL and PostgreSQL. \ No newline at end of file diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md new file mode 100644 index 0000000000000..f335bd4ba4a95 --- /dev/null +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -0,0 +1,270 @@ ++++ +title = "运行部署" +weight = 1 ++++ + +## 背景信息 + +ShardingSphere CDC 分为两个部分,一个是 CDC Server,另一个是 CDC Client。 CDC Server 和 ShardingSphere-Proxy 目前是一同部署的。 + +用户可以在自己的项目中引入 CDC Client,实现数据的消费逻辑。 + +## 约束条件 + +- 纯 Java 开发,JDK 建议 1.8 或以上版本。 +- CDC Server 要求 ShardingSphere-Proxy 使用集群模式,目前支持 ZooKeeper 作为注册中心。 +- CDC 只同步数据,不会同步表结构,目前也不支持 DDL 的语句同步。 +- CDC 增量会按照事务的维度输出数据, 如果要开启 XA 事务的兼容,则 openGauss 和 ShardingSphere-Proxy 都需要 GLT 模块 + +## CDC 功能介绍 + +CDC 服务端的逻辑可以参考 https://github.com/apache/shardingsphere/tree/master/kernel/data-pipeline/scenario/cdc/core 中的实现。 + +### CDC 协议 + +CDC 协议使用 Protobuf,对应的 Protobuf 类型是根据 Java 中的类型来映射的。 + +CDC 协议的数据类型和 openGauss 之间的映射关系如下 + +| openGauss 类型 | Java 数据类型 | CDC 对应的 protobuf 类型 | 备注 | +|-------------------------------------|--------------------|---------------------|--------------------------| +| INT1、INT2、INT4 | Integer | int32 | | +| INT8 | Long | int64 | | +| NUMERIC | BigDecimal | string | | +| FLOAT4 | Float | float | | +| FLOAT8 | Double | double | | +| BOOLEAN | Boolean | bool | | +| CHAR、VARCHAR、TEXT、CLOB | String | string | | +| BLOB、RAW、BYTEA | byte[] | bytes | | +| DATE | java.util.Date | Timestamp | | +| DATE | java.sql.Date | int64 | 这种情况下返回从1970-01-01 以来的天数 | +| TIMESTAMP,TIMESTAMPTZ、SMALLDATETIME | java.sql.Timestamp | Timestamp | 不带时区信息 | +| TIME,TIMETZ | java.sql.Time | int64 | 代表当天的纳秒数(时区无关) | +| INTERVAL、reltime、abstime | String | string | | +| point、lseg、box、path、polygon、circle | String | string | | +| cidr、inet、macaddr | String | string | | +| tsvector | String | 
string | | +| UUID | String | string | | +| JSON、JSONB | String | string | | +| HLL | String | string | | +| 范围类型(int4range等) | String | string | | +| HASH16、HASH32 | String | string | | + +> 需要注意对时间类型的处理,为了屏蔽时区的差异,CDC 返回的数据都是时区无关的 + +CDC Server 中 Java 类型转 Protobuf 类型的工具类:ColumnValueConvertUtils,[源码地址](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/core/src/main/java/org/apache/shardingsphere/data/pipeline/cdc/util/ColumnValueConvertUtils.java) + +对应的 CDC Client 中有 Protobuf 类型转换成 Java 类型的工具类 ProtobufAnyValueConverter,[源码地址](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/client/src/main/java/org/apache/shardingsphere/data/pipeline/cdc/client/util/ProtobufAnyValueConverter.java) + +## CDC Server 部署步骤 + +这里以 openGauss 数据库为例,介绍 CDC Server 的部署步骤。 + +由于 CDC Server 内置于 ShardingSphere-Proxy,所以需要获取 ShardingSphere-Proxy。详情请参见 [proxy 启动手册](/cn/user-manual/shardingsphere-proxy/startup/bin/)。 + +### 配置 GLT 模块(可选) + +官网发布的二进制包默认不包含 GLT 模块,不保证跨库事务完整性,如果使用的是包含 GLT 功能的 openGauss 数据库,则可以额外引入 GLT 模块,保证跨库事务的完整性。 + +目前有两种方式引入 GLT 模块,并且需要在 server.yaml 中也进行相应的配置。 + +#### 1. 源码编译安装 + +1. 准备代码环境,提前下载或者使用 Git clone,从 Github 下载 [ShardingSphere](https://github.com/apache/shardingsphere.git) 源码。 +2. 删除 kernel/global-clock/type/tso/core/pom.xml 中 shardingsphere-global-clock-tso-provider-redis 依赖的 `provided` 标签和 kernel/global-clock/type/tso/provider/redis/pom.xml 中 jedis 的 `provided` 标签 +3. 编译 ShardingSphere-Proxy,具体编译步骤请参考 [ShardingSphere 编译手册](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere)。 + +#### 2. 直接引入 GLT 依赖 + +可以从 maven 仓库中引入 + +1. [shardingsphere-global-clock-tso-provider-redis](https://mvnrepository.com/artifact/io.github.greycode/shardingsphere-global-clock-tso-provider-redis),需要和 ShardingSphere-Proxy 版本一致 +2. [jedis](https://mvnrepository.com/artifact/redis.clients/jedis), 推荐使用 4.3.1 版本 + +### CDC Server 使用手册 + +1. 
修改配置文件 `conf/server.yaml`,打开 CDC 功能。 目前 `mode` 必须是 `Cluster`,需要提前启动对应的注册中心。如果 GLT provider 使用 Redis,需要提前启动 Redis。 + +配置示例: + +```yaml +mode: + type: Cluster + repository: + type: ZooKeeper + props: + namespace: open_cdc + server-lists: localhost:2181 + retryIntervalMilliseconds: 500 + timeToLiveSeconds: 60 + maxRetries: 3 + operationTimeoutMilliseconds: 500 + +authority: + users: + # 这里的用户名和密码在 CDC Client 的认证中也会用到 + - user: root@% + password: root + - user: proxy + password: Proxy@123 + privilege: + type: ALL_PERMITTED + +# 开启 GLT 的时候也需要打开分布式事务 +#transaction: +# defaultType: XA +# providerType: Atomikos + +# GLT 模块配置,如果不需要 GLT 模块,可以不配置 +#globalClock: +# enabled: true +# type: TSO +# provider: redis +# props: +# host: 127.0.0.1 +# port: 6379 + + +props: + cdc-server-port: 33071 # CDC Server 端口,必须配置 + proxy-frontend-database-protocol-type: openGauss + # 省略其他配置 + ...... +``` + +2. 引入 JDBC 驱动。 + +proxy 已包含 PostgreSQL JDBC 驱动。 + +如果后端连接以下数据库,请下载相应 JDBC 驱动 jar 包,并将其放入 `${shardingsphere-proxy}/ext-lib` 目录。 + +| 数据库 | JDBC 驱动 | 参考 | +|-----------|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| +| MySQL | [mysql-connector-java-5.1.49.jar]( https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.49/mysql-connector-java-5.1.49.jar ) | [Connector/J Versions]( https://dev.mysql.com/doc/connector-j/5.1/en/connector-j-versions.html ) | +| openGauss | [opengauss-jdbc-3.0.0.jar]( https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.0.0/opengauss-jdbc-3.0.0.jar ) | | + +4. 启动 ShardingSphere-Proxy: + +``` +sh bin/start.sh +``` + +5. 
查看 proxy 日志 `logs/stdout.log`,看到日志中出现: + +``` +[INFO ] [main] o.a.s.p.frontend.ShardingSphereProxy - ShardingSphere-Proxy Cluster mode started successfully +``` + +确认启动成功。 + +## CDC Client 使用手册 + +用户可以通过 CDC Client 和服务端进行交互,CDC Client 的依赖很轻,只包含 netty 以及 CDC 协议相关的依赖。 + +有两种方式可以引入 CDC Client + +1. 源码编译,CDC Client 在编译 Proxy 的时候会一同编译,在 kernel/data-pipeline/scenario/cdc/client/target 目录下可以找到编译后的 jar 文件 +2. 从 maven 仓库获取,[Shardingsphere Data Pipeline CDC Client](https://mvnrepository.com/artifact/io.github.greycode/shardingsphere-data-pipeline-cdc-client) + +### CDC Client 介绍 + +`org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` 是 CDC Client 的入口类,用户可以通过该类和 CDC Server 进行交互。主要的核心方法如下。 + +| 方法名 | 返回值 | 说明 | +|----------------------------------------------------| --- | --- | +| await() | void | 阻塞 CDC 线程,await channel 关闭 | +| close() | void | 关闭 channel | +| connect() | void | 和服务端进行连接 | +| login (CDCLoginParameter parameter) | void | 登录验证 | +| startStreaming (StartStreamingParameter parameter) | java.lang.String (CDC 任务唯一标识) | 开启 CDC 订阅 | +| restartStreaming (java.lang.String streamingId) | void | 重启订阅 | +| stopStreaming (java.lang.String streamingId) | void | 停止订阅 | + + +### CDC Client 使用示例 + +1. 引入 CDC Client + +```xml +<dependency> + <groupId>io.github.greycode</groupId> + <artifactId>shardingsphere-data-pipeline-cdc-client</artifactId> + <version>${version}</version> +</dependency> +``` + +2. 启动 CDC Client + +这里先介绍下 `CDCClientConfiguration` 参数,构造 CDCClient 的时候需要传入该参数,该参数包含了 CDC Server 的地址,端口,以及 CDC 数据的消费逻辑。 + +```java +@RequiredArgsConstructor +@Getter +public final class CDCClientConfiguration { + + // CDC 的地址,和Proxy一致 + private final String address; + + // CDC 端口,和 server.yaml 的一致 + private final int port; + + // 数据消费的逻辑, 需要用户自行实现 + private final Consumer<List<Record>> dataConsumer; + + // 异常处理 handler,有个默认的实现 org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler,也可以自行实现相应的处理逻辑,比如出现错误后重连,或者停止 + private final ExceptionHandler exceptionHandler; + + // 超时时间,超过这个时间没收到服务器的响应,会认为请求失败。 + private final int timeoutMills; + ...... 
+} +``` + +下面是一个简单的启动 CDC Client 的示例。 + +```java +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient; +import org.apache.shardingsphere.data.pipeline.cdc.client.config.CDCClientConfiguration; +import org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler; +import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.CDCLoginParameter; +import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.StartStreamingParameter; +import org.apache.shardingsphere.data.pipeline.cdc.protocol.request.StreamDataRequestBody.SchemaTable; + +import java.util.Collections; + +@Slf4j +public final class Bootstrap { + + @SneakyThrows(InterruptedException.class) + public static void main(final String[] args) { + // TODO records 的消费逻辑需要用户自行实现,这里只是简单打印下 + CDCClientConfiguration clientConfig = new CDCClientConfiguration("127.0.0.1", 33071, records -> log.info("records: {}", records), new LoggerExceptionHandler()); + try (CDCClient cdcClient = new CDCClient(clientConfig)) { + // 1. 先调用 connect 连接到 CDC Server + cdcClient.connect(); + // 2. 调用登陆的逻辑,用户名密码和 server.yaml 配置文件中的一致 + cdcClient.login(new CDCLoginParameter("root", "root")); + // 3. 开启 CDC 数据订阅,用户只需要传入逻辑库和逻辑表,不需要关注底层数据分片情况,CDC Server 会将数据聚合后推送 + String streamingId = cdcClient.startStreaming(new StartStreamingParameter("sharding_db", Collections.singleton(SchemaTable.newBuilder().setTable("t_order").build()), true)); + log.info("Streaming id={}", streamingId); + // stopStreaming 和 restartStreaming 非必需的操作,分别表示停止订阅和重启订阅 + // cdcClient.stopStreaming(streamingId); + // cdcClient.restartStreaming(streamingId); + // 4. 这里是阻塞线程,确保 CDC Client 一直运行。 + cdcClient.await(); + } + } +} +``` + +主要有4个步骤 +1. 构造 CDCClient,传入 CDCClientConfiguration +2. 调用 CDCClient.connect,这一步是和 CDC Server 建立连接 +3. 调用 CDCClient.login,使用 server.yaml 中配置好的用户名和密码登录 +4. 
调用 CDCClient.startStreaming,开启订阅,需要保证订阅的库和表在 ShardingSphere-Proxy 存在,否则会报错。 + +> CDCClient.await 是阻塞主线程,非必需的步骤,用其他方式也可以,只要保证 CDC 线程一直在工作就行。 + +如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md new file mode 100644 index 0000000000000..eae2a595db983 --- /dev/null +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md @@ -0,0 +1,7 @@ ++++ +title = "Build" +weight = 1 ++++ + +TODO + diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md new file mode 100644 index 0000000000000..9f7d617a85af6 --- /dev/null +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -0,0 +1,393 @@ ++++ +title = "使用手册" +weight = 2 ++++ + +## openGauss 使用手册 + +### 环境要求 + +支持的 openGauss 版本:2.x ~ 3.x。 + +### 权限要求 + +1. 调整源端 WAL 配置。 + +`postgresql.conf` 示例配置: +``` +wal_level = logical +max_wal_senders = 10 +max_replication_slots = 10 +wal_sender_timeout = 0 +max_connections = 600 +``` + +详情请参见 [Write Ahead Log](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/settings.html) 和 [Replication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/sending-server.html)。 + +2. 
赋予源端 openGauss 账号 replication 权限。 + +`pg_hba.conf` 示例配置: + +``` +host replication repl_acct 0.0.0.0/0 md5 +# 0.0.0.0/0 表示允许任意 IP 地址访问,可以根据实际情况调整成 CDC Server 的 IP 地址 +``` + +详情请参见 [Configuring Client Access Authentication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/configuring-client-access-authentication.html) 和 [Example: Logic Replication Code](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/example-logic-replication-code.html)。 + +3. 赋予 openGauss 账号 DDL DML 权限。 + +如果使用非超级管理员账号,要求该账号在用到的数据库上,具备 CREATE 和 CONNECT 的权限。 + +示例: +```sql +GRANT CREATE, CONNECT ON DATABASE source_ds TO cdc_user; +``` + +还需要账号对迁移的表和 schema 具备访问权限,以 test schema 下的 t_order 表为例。 + +```sql +\c source_ds + +GRANT USAGE ON SCHEMA test TO GROUP cdc_user; +GRANT SELECT ON TABLE test.t_order TO cdc_user; +``` + +openGauss 有 OWNER 的概念,如果是数据库,SCHEMA,表的 OWNER,则可以省略对应的授权步骤。 + +openGauss 不允许普通账户在 public schema 下操作。所以如果迁移的表在 public schema 下,需要额外授权。 + +```sql +GRANT ALL PRIVILEGES TO cdc_user; +``` + +详情请参见 [openGauss GRANT](https://docs.opengauss.org/zh/docs/2.0.1/docs/Developerguide/GRANT.html) + + +### 完整流程示例 + +#### 前提条件 + +1. 在 MySQL 已准备好源端库、表、数据。 + +```sql +DROP DATABASE IF EXISTS migration_ds_0; +CREATE DATABASE migration_ds_0 DEFAULT CHARSET utf8; + +USE migration_ds_0; + +CREATE TABLE t_order (order_id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (order_id)); + +INSERT INTO t_order (order_id, user_id, status) VALUES (1,2,'ok'),(2,4,'ok'),(3,6,'ok'),(4,1,'ok'),(5,3,'ok'),(6,5,'ok'); +``` + +2. 在 MySQL 准备目标端库。 + +```sql +DROP DATABASE IF EXISTS migration_ds_10; +CREATE DATABASE migration_ds_10 DEFAULT CHARSET utf8; + +DROP DATABASE IF EXISTS migration_ds_11; +CREATE DATABASE migration_ds_11 DEFAULT CHARSET utf8; + +DROP DATABASE IF EXISTS migration_ds_12; +CREATE DATABASE migration_ds_12 DEFAULT CHARSET utf8; +``` + +#### 操作步骤 + +1. 
在 proxy 新建逻辑数据库并配置好存储单元和规则。 + +```sql +CREATE DATABASE sharding_db; + +USE sharding_db + +REGISTER STORAGE UNIT ds_2 ( + URL="jdbc:mysql://127.0.0.1:3306/migration_ds_10?serverTimezone=UTC&useSSL=false", + USER="root", + PASSWORD="root", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +), ds_3 ( + URL="jdbc:mysql://127.0.0.1:3306/migration_ds_11?serverTimezone=UTC&useSSL=false", + USER="root", + PASSWORD="root", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +), ds_4 ( + URL="jdbc:mysql://127.0.0.1:3306/migration_ds_12?serverTimezone=UTC&useSSL=false", + USER="root", + PASSWORD="root", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +); + +CREATE SHARDING TABLE RULE t_order( +STORAGE_UNITS(ds_2,ds_3,ds_4), +SHARDING_COLUMN=order_id, +TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="6")), +KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) +); +``` + +如果是迁移到异构数据库,那目前需要在 proxy 执行建表语句。 + +2. 在 proxy 配置源端存储单元。 + +```sql +REGISTER MIGRATION SOURCE STORAGE UNIT ds_0 ( + URL="jdbc:mysql://127.0.0.1:3306/migration_ds_0?serverTimezone=UTC&useSSL=false", + USER="root", + PASSWORD="root", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +); +``` + +3. 启动数据迁移。 + +```sql +MIGRATE TABLE ds_0.t_order INTO t_order; +``` + +或者指定目标端逻辑库: + +```sql +MIGRATE TABLE ds_0.t_order INTO sharding_db.t_order; +``` + +4. 
查看数据迁移作业列表。 + +```sql +SHOW MIGRATION LIST; +``` + +示例结果: +``` ++--------------------------------------------+--------------+----------------+--------+---------------------+-----------+ +| id | tables | job_item_count | active | create_time | stop_time | ++--------------------------------------------+--------------+----------------+--------+---------------------+-----------+ +| j0102p00002333dcb3d9db141cef14bed6fbf1ab54 | ds_0.t_order | 1 | true | 2023-09-20 14:41:32 | NULL | ++--------------------------------------------+--------------+----------------+--------+---------------------+-----------+ +``` + +5. 查看数据迁移详情。 + +```sql +SHOW MIGRATION STATUS 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54'; +``` + +示例结果: +``` ++------+-------------+--------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+---------------+ +| item | data_source | tables | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | error_message | ++------+-------------+--------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+---------------+ +| 0 | ds_0 | ds_0.t_order | EXECUTE_INCREMENTAL_TASK | true | 6 | 100 | | | ++------+-------------+--------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+---------------+ +``` + +6. 
执行数据一致性校验。 + +```sql +CHECK MIGRATION 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54' BY TYPE (NAME='DATA_MATCH'); +``` + +数据一致性校验算法类型来自: +```sql +SHOW MIGRATION CHECK ALGORITHMS; +``` + +示例结果: +``` ++-------------+--------------+--------------------------------------------------------------+----------------------------+ +| type | type_aliases | supported_database_types | description | ++-------------+--------------+--------------------------------------------------------------+----------------------------+ +| CRC32_MATCH | | MySQL,MariaDB,H2 | Match CRC32 of records. | +| DATA_MATCH | | SQL92,MySQL,PostgreSQL,openGauss,Oracle,SQLServer,MariaDB,H2 | Match raw data of records. | ++-------------+--------------+--------------------------------------------------------------+----------------------------+ +``` + +目标端开启数据加密的情况需要使用`DATA_MATCH`。 + +异构迁移需要使用`DATA_MATCH`。 + +查询数据一致性校验进度: +```sql +SHOW MIGRATION CHECK STATUS 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54'; +``` + +示例结果: +``` ++--------------+--------+---------------------+--------+-------------------------------+-----------------------------+--------------------------+-------------------------+-------------------------+------------------+----------------+-----------------+---------------+ +| tables | result | check_failed_tables | active | inventory_finished_percentage | inventory_remaining_seconds | incremental_idle_seconds | check_begin_time | check_end_time | duration_seconds | algorithm_type | algorithm_props | error_message | ++--------------+--------+---------------------+--------+-------------------------------+-----------------------------+--------------------------+-------------------------+-------------------------+------------------+----------------+-----------------+---------------+ +| ds_0.t_order | true | | false | 100 | 0 | | 2023-09-20 14:45:31.992 | 2023-09-20 14:45:33.519 | 1 | DATA_MATCH | | | 
++--------------+--------+---------------------+--------+-------------------------------+-----------------------------+--------------------------+-------------------------+-------------------------+------------------+----------------+-----------------+---------------+ +``` + +7. 完成作业。 + +```sql +COMMIT MIGRATION 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54'; +``` + +更多 DistSQL 请参见 [RAL #数据迁移](/cn/user-manual/shardingsphere-proxy/distsql/syntax/ral/#%E6%95%B0%E6%8D%AE%E8%BF%81%E7%A7%BB)。 + +#### 前提条件 + +1. 准备好 CDC 源端的库、表、数据。 + +```sql +DROP DATABASE IF EXISTS ds_0; +CREATE DATABASE ds_0; + +DROP DATABASE IF EXISTS ds_1; +CREATE DATABASE ds_1; +``` + +#### 操作步骤 + +1. 在 `server.yaml` 中开启 CDC 功能。 + +```yaml +mode: + type: Cluster + repository: + type: ZooKeeper + props: + namespace: cdc + server-lists: localhost:2181 + retryIntervalMilliseconds: 500 + timeToLiveSeconds: 60 + maxRetries: 3 + operationTimeoutMilliseconds: 500 + +authority: + users: + - user: root@% + password: root + privilege: + type: ALL_PERMITTED + +#开启 GLT 功能参考 CDC 部署手册 +#transaction: +# defaultType: XA +# providerType: Atomikos +# +#globalClock: +# enabled: true +# type: TSO +# provider: redis +# props: +# host: 127.0.0.1 +# port: 6379 + +props: + system-log-level: INFO + check-table-metadata-enabled: false + proxy-default-port: 3307 # Proxy default port. + cdc-server-port: 33071 # CDC server port + proxy-frontend-database-protocol-type: openGauss +``` + +2. 在 proxy 新建逻辑数据库并配置好存储单元和规则。 + +2.1. 创建逻辑库。 + +```sql +CREATE DATABASE sharding_db; + +\c sharding_db +``` +2.2. 注册存储单元。 + +```sql +REGISTER STORAGE UNIT ds_0 ( + URL="jdbc:opengauss://127.0.0.1:5432/ds_0", + USER="gaussdb", + PASSWORD="Root@123", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +), ds_1 ( + URL="jdbc:opengauss://127.0.0.1:5432/ds_1", + USER="gaussdb", + PASSWORD="Root@123", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +); +``` + +2.3. 
创建分片规则。 + +```sql +CREATE SHARDING TABLE RULE t_order( +STORAGE_UNITS(ds_0,ds_1), +SHARDING_COLUMN=id, +TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="2")), +KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) +); +``` + +2.4. 创建表和初始化数据 + +在 proxy 执行建表语句。 + +```sql +CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); + +INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +``` + +3. 启动 CDC Client + +引入 CDC Client 依赖,然后按照 [Example](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/client/src/test/java/org/apache/shardingsphere/data/pipeline/cdc/client/example/Bootstrap.java) 启动 CDC Client。 + +观察 CDC Client 启动后,是否有如下的日志 + +``` + records: [before { + name: "id" + value { + type_url: "type.googleapis.com/google.protobuf.Empty" + } +} +``` + +4. 通过 DistSQL 查看 CDC 任务状态 + +CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以在 proxy 中执行对应的 DistSQL 查看 CDC 任务的运行情况 + +1. 查看 CDC 任务列表 + +SHOW STREAMING LIST; + +运行结果 + +``` +sharding_db=> SHOW STREAMING LIST; + id | database | tables | job_item_count | active | create_time | stop_time +--------------------------------------------+-------------+---------+----------------+--------+---------------------+----------- + j0302p0000702a83116fcee83f70419ca5e2993791 | sharding_db | t_order | 1 | true | 2023-10-27 22:01:27 | +(1 row) +``` + +2. 
查看 CDC 任务详情 + +SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; + +运行结果 + +``` +sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; + item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | error_message +------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------- + 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | true | 1 | 100 | 101 | + 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | true | 2 | 100 | 100 | +(2 rows) +``` + +3. 删除 CDC 任务 + +DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; + +此时也会删除 openGauss 物理库上的 replication slots + +``` +sharding_db=> DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; +SUCCESS +``` \ No newline at end of file diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md new file mode 100644 index 0000000000000..2e6bde2fefb48 --- /dev/null +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md @@ -0,0 +1,6 @@ ++++ +title = "Manual" +weight = 2 ++++ + +TODO \ No newline at end of file From 18632b0a7c8ff96e4357199b065acfe45aa2a7ff Mon Sep 17 00:00:00 2001 From: azexcy <13588031592@qq.com> Date: Mon, 30 Oct 2023 15:10:43 +0800 Subject: [PATCH 2/9] Improve doc --- .../shardingsphere-proxy/cdc/build.cn.md | 126 +------- .../shardingsphere-proxy/cdc/usage.cn.md | 277 +++++++----------- 2 files changed, 111 insertions(+), 292 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index f335bd4ba4a95..daa0b6bcbf96c 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -14,47 +14,7 @@ ShardingSphere 
CDC 分为两个部分,一个是 CDC Server,另一个是 CDC - 纯 JAVA 开发,JDK 建议 1.8 或以上版本。 - CDC Server 要求 SharingSphere-Proxy 使用集群模式,目前支持 ZooKeeper 作为注册中心。 - CDC 只同步数据,不会同步表结构,目前也不支持 DDL 的语句同步。 -- CDC 增量会按照事务的维度输出数据, 如果要开启 XA 事务的兼容,则 openGauss 和 ShardingSphere-Proxy 都需要 GLT 模块 - -## CDC 功能介绍 - -CDC 服务端的逻辑可以参考 https://github.com/apache/shardingsphere/tree/master/kernel/data-pipeline/scenario/cdc/core 中的实现。 - -### CDC 协议 - -CDC 协议使用 Protobuf,对应的 Protobuf 类型是根据 Java 中的类型来映射的,CDC 的数据类型和 openGauss 之间的映射关系如下 - -CDC 协议的数据类型和 openGauss 之间的映射关系如下 - -| openGauss 类型 | Java 数据类型 | CDC 对应的 protobuf 类型 | 备注 | -|-------------------------------------|--------------------|---------------------|--------------------------| -| INT1、INT2、INT4 | Integer | int32 | | -| INT8 | Long | int64 | | -| NUMERIC | BigDecimal | string | | -| FLOAT4 | Float | float | | -| FLOAT8 | Double | double | | -| BOOLEAN | Boolean | bool | | -| CHAR、VARCHAR、TEXT、CLOB | String | string | | -| BLOB、RAW、BYTEA | byte[] | bytes | | -| DATE | java.util.Date | Timestamp | | -| DATE | java.sql.Date | int64 | 这种情况下返回从1970-01-01 以来的天数 | -| TIMESTAMP,TIMESTAMPTZ、SMALLDATETIME | java.sql.Timestamp | Timestamp | 不带时区信息 | -| TIME,TIMETZ | java.sql.Time | int64 | 代表当天的纳秒数(时区无关) | -| INTERVAL、reltime、abstime | String | string | | -| point、lseg、box、path、polygon、circle | String | string | | -| cidr、inet、macaddr | String | string | | -| tsvector | String | string | | -| UUID | String | string | | -| JSON、JSONB | String | string | | -| HLL | String | string | | -| 范围类型(int4range等) | String | string | | -| HASH16、HASH32 | String | string | | - -> 需要注意对时间类型的处理,为了屏蔽时区的差异,CDC 返回的数据都是时区无关的 - -CDC Server 中 Java 类型转 Protobuf 类型的工具类:ColumnValueConvertUtils,[源码地址](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/core/src/main/java/org/apache/shardingsphere/data/pipeline/cdc/util/ColumnValueConvertUtils.java) - -对应的 CDC Client 中有 Protobuf 类型转换成 Java 类型的工具类 
ProtobufAnyValueConverter,[源码地址](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/client/src/main/java/org/apache/shardingsphere/data/pipeline/cdc/client/util/ProtobufAnyValueConverter.java) +- CDC 增量阶段会按照事务的维度输出数据, 如果要开启 XA 事务的兼容,则 openGauss 和 ShardingSphere-Proxy 都需要 GLT 模块 ## CDC Server 部署步骤 @@ -78,8 +38,8 @@ CDC Server 中 Java 类型转 Protobuf 类型的工具类:ColumnValueConvertUt 可以从 maven 仓库中引入 -1. [shardingsphere-global-clock-tso-provider-redis](https://mvnrepository.com/artifact/io.github.greycode/shardingsphere-global-clock-tso-provider-redis),需要和 ShardingSphere-Proxy 版本一致 -2. [jedis](https://mvnrepository.com/artifact/redis.clients/jedis), 推荐使用 4.3.1 版本 +1. [shardingsphere-global-clock-tso-provider-redis](https://repo1.maven.org/maven2/org/apache/shardingsphere/shardingsphere-global-clock-tso-provider-redis),下载和 ShardingSphere-Proxy 同名版本 +2. [jedis-4.3.1](https://repo1.maven.org/maven2/redis/clients/jedis/4.3.1/jedis-4.3.1.jar) ### CDC Server 使用手册 @@ -140,8 +100,8 @@ proxy 已包含 PostgreSQL JDBC 驱动。 | 数据库 | JDBC 驱动 | 参考 | |-----------|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| -| MySQL | [mysql-connector-java-5.1.49.jar]( https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.49/mysql-connector-java-5.1.49.jar ) | [Connector/J Versions]( https://dev.mysql.com/doc/connector-j/5.1/en/connector-j-versions.html ) | -| openGauss | [opengauss-jdbc-3.0.0.jar]( https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.0.0/opengauss-jdbc-3.0.0.jar ) | | +| MySQL | [mysql-connector-java-8.0.11.jar]( https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.11/mysql-connector-java-8.0.11.jar ) | [Connector/J Versions]( https://dev.mysql.com/doc/connector-j/en/connector-j-versions.html ) | +| openGauss | 
[opengauss-jdbc-3.1.1-og.jar]( https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar ) | | 4. 启动 ShardingSphere-Proxy: @@ -183,6 +143,8 @@ sh bin/start.sh ### CDC Client 使用示例 +目前 CDC Client 只提供了 Java API,用户需要自行实现数据的消费逻辑。 + 1. 引入 CDC Client ```xml @@ -195,76 +157,4 @@ sh bin/start.sh 2. 启动 CDC Client -这里先介绍下 `CDCClientConfiguration` 参数,构造 CDCClient 的时候需要传入该参数,该参数包含了 CDC Server 的地址,端口,以及 CDC 数据的消费逻辑。 - -```java -@RequiredArgsConstructor -@Getter -public final class CDCClientConfiguration { - - // CDC 的地址,和Proxy一致 - private final String address; - - // CDC 端口,和 server.yaml 的一致 - private final int port; - - // 数据消费的逻辑, 需要用户自行实现 - private final Consumer> dataConsumer; - - // 异常处理 handler,有个默认的实现 org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler,也可以自行实现相应的处理逻辑,比如出现错误后重连,或者停止 - private final ExceptionHandler exceptionHandler; - - // 超时时间,超过这个时间没收到服务器的响应,会认为请求失败。 - private final int timeoutMills; - ...... -} -``` - -下面是一个简单的启动 CDC Client 的示例。 - -```java -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; -import org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient; -import org.apache.shardingsphere.data.pipeline.cdc.client.config.CDCClientConfiguration; -import org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler; -import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.CDCLoginParameter; -import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.StartStreamingParameter; -import org.apache.shardingsphere.data.pipeline.cdc.protocol.request.StreamDataRequestBody.SchemaTable; - -import java.util.Collections; - -@Slf4j -public final class Bootstrap { - - @SneakyThrows(InterruptedException.class) - public static void main(final String[] args) { - // TODO records 的消费逻辑需要用户自行实现,这里只是简单打印下 - CDCClientConfiguration clientConfig = new CDCClientConfiguration("127.0.0.1", 33071, records -> log.info("records: {}", records), new 
LoggerExceptionHandler()); - try (CDCClient cdcClient = new CDCClient(clientConfig)) { - // 1. 先调用 connect 连接到 CDC Server - cdcClient.connect(); - // 2. 调用登陆的逻辑,用户名密码和 server.yaml 配置文件中的一致 - cdcClient.login(new CDCLoginParameter("root", "root")); - // 3. 开启 CDC 数据订阅,用户只需要传入逻辑库和逻辑表,不需要关注底层数据分片情况,CDC Server 会将数据聚合后推送 - String streamingId = cdcClient.startStreaming(new StartStreamingParameter("sharding_db", Collections.singleton(SchemaTable.newBuilder().setTable("t_order").build()), true)); - log.info("Streaming id={}", streamingId); - // stopStreaming 和 restartStreaming 非必需的操作,分别表示停止订阅和重启订阅 - // cdcClient.stopStreaming(streamingId); - // cdcClient.restartStreaming(streamingId); - // 4. 这里是阻塞线程,确保 CDC Client 一直运行。 - cdcClient.await(); - } - } -} -``` - -主要有4个步骤 -1. 构造 CDCClient,传入 CDCClientConfiguration -2. 调用 CDCClient.connect,这一步是和 CDC Server 建立连接 -3. 调用 CDCClient.login,使用 server.yaml 中配置好的用户名和密码登录 -4. 调用 CDCClient.startStreaming,开启订阅,需要保证订阅的库和表在 ShardingSphere-Proxy 存在,否则会报错。 - -> CDCClient.await 是阻塞主线程,非必需的步骤,用其他方式也可以,只要保证 CDC 线程一直在工作就行。 - -如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) +参考 [Example](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/client/src/test/java/org/apache/shardingsphere/data/pipeline/cdc/client/example/Bootstrap.java) 启动 CDC Client。 diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md index 9f7d617a85af6..a6520253b10b7 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -3,6 +3,38 @@ title = "使用手册" weight = 2 +++ +## CDC 功能介绍 + +### CDC 协议介绍 + +CDC 协议使用 Protobuf,对应的 Protobuf 类型是根据 Java 中的类型来映射的。 + 
+这里以 openGauss 为例,CDC 协议的数据类型和数据库类型的映射关系如下 + +| openGauss 类型 | Java 数据类型 | CDC 对应的 protobuf 类型 | 备注 | +|------------------------------------------|--------------------|---------------------|----------------| +| INT1、INT2、INT4 | Integer | int32 | | +| INT8 | Long | int64 | | +| NUMERIC | BigDecimal | string | | +| FLOAT4 | Float | float | | +| FLOAT8 | Double | double | | +| BOOLEAN | Boolean | bool | | +| CHAR、VARCHAR、TEXT、CLOB | String | string | | +| BLOB、RAW、BYTEA | byte[] | bytes | | +| DATE、TIMESTAMP,TIMESTAMPTZ、SMALLDATETIME | java.sql.Timestamp | Timestamp | 不带时区信息 | +| TIME,TIMETZ | java.sql.Time | int64 | 代表当天的纳秒数(时区无关) | +| INTERVAL、reltime、abstime | String | string | | +| point、lseg、box、path、polygon、circle | String | string | | +| cidr、inet、macaddr | String | string | | +| tsvector | String | string | | +| UUID | String | string | | +| JSON、JSONB | String | string | | +| HLL | String | string | | +| 范围类型(int4range等) | String | string | | +| HASH16、HASH32 | String | string | | + +> 需要注意对时间类型的处理,为了屏蔽时区的差异,CDC 返回的数据都是时区无关的 + ## openGauss 使用手册 ### 环境要求 @@ -63,175 +95,10 @@ GRANT ALL PRIVILEGES TO cdc_user; 详情请参见 [openGauss GRANT](https://docs.opengauss.org/zh/docs/2.0.1/docs/Developerguide/GRANT.html) - ### 完整流程示例 #### 前提条件 -1. 在 MySQL 已准备好源端库、表、数据。 - -```sql -DROP DATABASE IF EXISTS migration_ds_0; -CREATE DATABASE migration_ds_0 DEFAULT CHARSET utf8; - -USE migration_ds_0; - -CREATE TABLE t_order (order_id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (order_id)); - -INSERT INTO t_order (order_id, user_id, status) VALUES (1,2,'ok'),(2,4,'ok'),(3,6,'ok'),(4,1,'ok'),(5,3,'ok'),(6,5,'ok'); -``` - -2. 
在 MySQL 准备目标端库。 - -```sql -DROP DATABASE IF EXISTS migration_ds_10; -CREATE DATABASE migration_ds_10 DEFAULT CHARSET utf8; - -DROP DATABASE IF EXISTS migration_ds_11; -CREATE DATABASE migration_ds_11 DEFAULT CHARSET utf8; - -DROP DATABASE IF EXISTS migration_ds_12; -CREATE DATABASE migration_ds_12 DEFAULT CHARSET utf8; -``` - -#### 操作步骤 - -1. 在 proxy 新建逻辑数据库并配置好存储单元和规则。 - -```sql -CREATE DATABASE sharding_db; - -USE sharding_db - -REGISTER STORAGE UNIT ds_2 ( - URL="jdbc:mysql://127.0.0.1:3306/migration_ds_10?serverTimezone=UTC&useSSL=false", - USER="root", - PASSWORD="root", - PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") -), ds_3 ( - URL="jdbc:mysql://127.0.0.1:3306/migration_ds_11?serverTimezone=UTC&useSSL=false", - USER="root", - PASSWORD="root", - PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") -), ds_4 ( - URL="jdbc:mysql://127.0.0.1:3306/migration_ds_12?serverTimezone=UTC&useSSL=false", - USER="root", - PASSWORD="root", - PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") -); - -CREATE SHARDING TABLE RULE t_order( -STORAGE_UNITS(ds_2,ds_3,ds_4), -SHARDING_COLUMN=order_id, -TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="6")), -KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) -); -``` - -如果是迁移到异构数据库,那目前需要在 proxy 执行建表语句。 - -2. 在 proxy 配置源端存储单元。 - -```sql -REGISTER MIGRATION SOURCE STORAGE UNIT ds_0 ( - URL="jdbc:mysql://127.0.0.1:3306/migration_ds_0?serverTimezone=UTC&useSSL=false", - USER="root", - PASSWORD="root", - PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") -); -``` - -3. 启动数据迁移。 - -```sql -MIGRATE TABLE ds_0.t_order INTO t_order; -``` - -或者指定目标端逻辑库: - -```sql -MIGRATE TABLE ds_0.t_order INTO sharding_db.t_order; -``` - -4. 
查看数据迁移作业列表。 - -```sql -SHOW MIGRATION LIST; -``` - -示例结果: -``` -+--------------------------------------------+--------------+----------------+--------+---------------------+-----------+ -| id | tables | job_item_count | active | create_time | stop_time | -+--------------------------------------------+--------------+----------------+--------+---------------------+-----------+ -| j0102p00002333dcb3d9db141cef14bed6fbf1ab54 | ds_0.t_order | 1 | true | 2023-09-20 14:41:32 | NULL | -+--------------------------------------------+--------------+----------------+--------+---------------------+-----------+ -``` - -5. 查看数据迁移详情。 - -```sql -SHOW MIGRATION STATUS 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54'; -``` - -示例结果: -``` -+------+-------------+--------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+---------------+ -| item | data_source | tables | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | error_message | -+------+-------------+--------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+---------------+ -| 0 | ds_0 | ds_0.t_order | EXECUTE_INCREMENTAL_TASK | true | 6 | 100 | | | -+------+-------------+--------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+---------------+ -``` - -6. 
执行数据一致性校验。 - -```sql -CHECK MIGRATION 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54' BY TYPE (NAME='DATA_MATCH'); -``` - -数据一致性校验算法类型来自: -```sql -SHOW MIGRATION CHECK ALGORITHMS; -``` - -示例结果: -``` -+-------------+--------------+--------------------------------------------------------------+----------------------------+ -| type | type_aliases | supported_database_types | description | -+-------------+--------------+--------------------------------------------------------------+----------------------------+ -| CRC32_MATCH | | MySQL,MariaDB,H2 | Match CRC32 of records. | -| DATA_MATCH | | SQL92,MySQL,PostgreSQL,openGauss,Oracle,SQLServer,MariaDB,H2 | Match raw data of records. | -+-------------+--------------+--------------------------------------------------------------+----------------------------+ -``` - -目标端开启数据加密的情况需要使用`DATA_MATCH`。 - -异构迁移需要使用`DATA_MATCH`。 - -查询数据一致性校验进度: -```sql -SHOW MIGRATION CHECK STATUS 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54'; -``` - -示例结果: -``` -+--------------+--------+---------------------+--------+-------------------------------+-----------------------------+--------------------------+-------------------------+-------------------------+------------------+----------------+-----------------+---------------+ -| tables | result | check_failed_tables | active | inventory_finished_percentage | inventory_remaining_seconds | incremental_idle_seconds | check_begin_time | check_end_time | duration_seconds | algorithm_type | algorithm_props | error_message | -+--------------+--------+---------------------+--------+-------------------------------+-----------------------------+--------------------------+-------------------------+-------------------------+------------------+----------------+-----------------+---------------+ -| ds_0.t_order | true | | false | 100 | 0 | | 2023-09-20 14:45:31.992 | 2023-09-20 14:45:33.519 | 1 | DATA_MATCH | | | 
-+--------------+--------+---------------------+--------+-------------------------------+-----------------------------+--------------------------+-------------------------+-------------------------+------------------+----------------+-----------------+---------------+ -``` - -7. 完成作业。 - -```sql -COMMIT MIGRATION 'j0102p00002333dcb3d9db141cef14bed6fbf1ab54'; -``` - -更多 DistSQL 请参见 [RAL #数据迁移](/cn/user-manual/shardingsphere-proxy/distsql/syntax/ral/#%E6%95%B0%E6%8D%AE%E8%BF%81%E7%A7%BB)。 - -#### 前提条件 - 1. 准备好 CDC 源端的库、表、数据。 ```sql @@ -335,19 +202,81 @@ INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'o 3. 启动 CDC Client -引入 CDC Client 依赖,然后按照 [Example](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/client/src/test/java/org/apache/shardingsphere/data/pipeline/cdc/client/example/Bootstrap.java) 启动 CDC Client。 - -观察 CDC Client 启动后,是否有如下的日志 - +先引入 CDC Client 依赖,在代码 + +这里先介绍下 `CDCClientConfiguration` 参数,构造 CDCClient 的时候需要传入该参数,该参数包含了 CDC Server 的地址,端口,以及 CDC 数据的消费逻辑。 + +```java +@RequiredArgsConstructor +@Getter +public final class CDCClientConfiguration { + + // CDC 的地址,和Proxy一致 + private final String address; + + // CDC 端口,和 server.yaml 的一致 + private final int port; + + // 数据消费的逻辑, 需要用户自行实现 + private final Consumer> dataConsumer; + + // 异常处理 handler,有个默认的实现 org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler,也可以自行实现相应的处理逻辑,比如出现错误后重连,或者停止 + private final ExceptionHandler exceptionHandler; + + // 超时时间,超过这个时间没收到服务器的响应,会认为请求失败。 + private final int timeoutMills; +} ``` - records: [before { - name: "id" - value { - type_url: "type.googleapis.com/google.protobuf.Empty" - } + +下面是一个简单的启动 CDC Client 的示例。 + +```java +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient; +import org.apache.shardingsphere.data.pipeline.cdc.client.config.CDCClientConfiguration; +import 
org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler; +import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.CDCLoginParameter; +import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.StartStreamingParameter; +import org.apache.shardingsphere.data.pipeline.cdc.protocol.request.StreamDataRequestBody.SchemaTable; + +import java.util.Collections; + +@Slf4j +public final class Bootstrap { + + @SneakyThrows(InterruptedException.class) + public static void main(final String[] args) { + // TODO records 的消费逻辑需要用户自行实现,这里只是简单打印下 + CDCClientConfiguration clientConfig = new CDCClientConfiguration("127.0.0.1", 33071, records -> log.info("records: {}", records), new LoggerExceptionHandler()); + try (CDCClient cdcClient = new CDCClient(clientConfig)) { + // 1. 先调用 connect 连接到 CDC Server + cdcClient.connect(); + // 2. 调用登陆的逻辑,用户名密码和 server.yaml 配置文件中的一致 + cdcClient.login(new CDCLoginParameter("root", "root")); + // 3. 开启 CDC 数据订阅,用户只需要传入逻辑库和逻辑表,不需要关注底层数据分片情况,CDC Server 会将数据聚合后推送 + String streamingId = cdcClient.startStreaming(new StartStreamingParameter("sharding_db", Collections.singleton(SchemaTable.newBuilder().setTable("t_order").build()), true)); + log.info("Streaming id={}", streamingId); + // stopStreaming 和 restartStreaming 非必需的操作,分别表示停止订阅和重启订阅 + // cdcClient.stopStreaming(streamingId); + // cdcClient.restartStreaming(streamingId); + // 4. 这里是阻塞线程,确保 CDC Client 一直运行。 + cdcClient.await(); + } + } } ``` +主要有4个步骤 +1. 构造 CDCClient,传入 CDCClientConfiguration +2. 调用 CDCClient.connect,这一步是和 CDC Server 建立连接 +3. 调用 CDCClient.login,使用 server.yaml 中配置好的用户名和密码登录 +4. 
调用 CDCClient.startStreaming,开启订阅,需要保证订阅的库和表在 ShardingSphere-Proxy 存在,否则会报错。 + +> CDCClient.await 是阻塞主线程,非必需的步骤,用其他方式也可以,只要保证 CDC 线程一直在工作就行。 + +如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) + 4. 通过 DistSQL 查看 CDC 任务状态 CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以在 proxy 中执行对应的 DistSQL 查看 CDC 任务的运行情况 From ab459b3570fe4c3db4b30953dae99207b7e1a9d0 Mon Sep 17 00:00:00 2001 From: azexcy <13588031592@qq.com> Date: Tue, 14 Nov 2023 14:22:14 +0800 Subject: [PATCH 3/9] Advise CDC doc --- .../shardingsphere-proxy/cdc/build.cn.md | 76 ++++----- .../shardingsphere-proxy/cdc/usage.cn.md | 158 +++++------------- 2 files changed, 74 insertions(+), 160 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index daa0b6bcbf96c..394366a07ea26 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -31,7 +31,8 @@ ShardingSphere CDC 分为两个部分,一个是 CDC Server,另一个是 CDC #### 1. 源码编译安装 1. 准备代码环境,提前下载或者使用 Git clone,从 Github 下载 [ShardingSphere](https://github.com/apache/shardingsphere.git) 源码。 -2. 删除 kernel/global-clock/type/tso/core/pom.xml 中 shardingsphere-global-clock-tso-provider-redis 依赖的 `provided` 标签和 kernel/global-clock/type/tso/provider/redis/pom.xml 中 jedis 的 `provided` 标签 +2. 删除 kernel/global-clock/type/tso/core/pom.xml 中 shardingsphere-global-clock-tso-provider-redis 依赖的 `provided` 标签和 kernel/global-clock/type/tso/provider/redis/pom.xml 中 jedis + 的 `provided` 标签 3. 编译 ShardingSphere-Proxy,具体编译步骤请参考 [ShardingSphere 编译手册](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere)。 #### 2. 
直接引入 GLT 依赖 @@ -47,13 +48,15 @@ ShardingSphere CDC 分为两个部分,一个是 CDC Server,另一个是 CDC 配置示例: +1. 在 `server.yaml` 中开启 CDC 功能。 + ```yaml mode: type: Cluster repository: type: ZooKeeper props: - namespace: open_cdc + namespace: cdc_demo server-lists: localhost:2181 retryIntervalMilliseconds: 500 timeToLiveSeconds: 60 @@ -62,11 +65,8 @@ mode: authority: users: - # 这里的用户名和密码在 CDC Client 的认证中也会用到 - user: root@% password: root - - user: proxy - password: Proxy@123 privilege: type: ALL_PERMITTED @@ -74,8 +74,7 @@ authority: #transaction: # defaultType: XA # providerType: Atomikos - -# GLT 模块配置,如果不需要 GLT 模块,可以不配置 +# #globalClock: # enabled: true # type: TSO @@ -84,12 +83,12 @@ authority: # host: 127.0.0.1 # port: 6379 - props: + system-log-level: INFO + check-table-metadata-enabled: false + proxy-default-port: 3307 # Proxy default port. cdc-server-port: 33071 # CDC Server 端口,必须配置 - proxy-frontend-database-protocol-type: openGauss - # 省略其他配置 - ...... + #proxy-frontend-database-protocol-type: openGauss # 和后端数据库的类型一致 ``` 2. 
引入 JDBC 驱动。 @@ -98,10 +97,10 @@ proxy 已包含 PostgreSQL JDBC 驱动。 如果后端连接以下数据库,请下载相应 JDBC 驱动 jar 包,并将其放入 `${shardingsphere-proxy}/ext-lib` 目录。 -| 数据库 | JDBC 驱动 | 参考 | -|-----------|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| -| MySQL | [mysql-connector-java-8.0.11.jar]( https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.49/mysql-connector-java-5.1.49.jar ) | [Connector/J Versions]( https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.11/mysql-connector-java-8.0.11.jar ) | -| openGauss | [opengauss-jdbc-3.1.1-og.jar]( https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar ) | | +| 数据库 | JDBC 驱动 | +|-----------|---------------------------------------------------------------------------------------------------------------------------------| +| MySQL | [mysql-connector-java-8.0.31.jar](https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.31/) | +| openGauss | [opengauss-jdbc-3.1.1-og.jar](https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar) | 4. 启动 ShardingSphere-Proxy: @@ -117,44 +116,31 @@ sh bin/start.sh 确认启动成功。 -## CDC Client 使用手册 - -用户可以通过 CDC Client 和服务端进行交互,CDC Client 的依赖很轻,只包含 netty 以及 CDC 协议相关的依赖。 - -有两种方式可以引入 CDC Client - -1. 源码编译,CDC Client 在编译 Proxy 的时候会一同编译,在 kernel/data-pipeline/scenario/cdc/client/target 目录下可以找到编译后的 jar 文件 -2. 
从 maven 仓库获取,[Shardingsphere Data Pipeline CDC Client](https://mvnrepository.com/artifact/io.github.greycode/shardingsphere-data-pipeline-cdc-client) - -### CDC Client 介绍 - -`org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` 是 CDC Client 的入口类,用户可以通过该类和 CDC Server 进行交互。主要的和新方法如下。 - -| 方法名 | 返回值 | 说明 | -|----------------------------------------------------| --- | --- | -| await() | void | 阻塞 CDC 线程,await channel 关闭 | -| close() | void | 关闭 channel | -| connect() | void | 和服务端进行连接 | -| login (CDCLoginParameter parameter) | void | 登陆验证 | -| startStreaming (StartStreamingParameter parameter) | java.lang.String (CDC 任务唯一标识) | 开启 CDC 订阅 | -| restartStreaming (java.lang.String streamingId) | void | 重启订阅 | -| stopStreaming (java.lang.String streamingId) | void | 停止订阅 | - +## CDC Client 手册 -### CDC Client 使用示例 +CDC Client 不需要额外部署,只需要通过 maven 引入 CDC Client 的依赖就可以在项目中使用。用户可以通过 CDC Client 和服务端进行交互。 -目前 CDC Client 只提供了 Java API,用户需要自行实现数据的消费逻辑。 - -1. 引入 CDC Client +如果有需要,用户也可以自行实现一个 CDC Client,进行数据的消费和 ACK。 ```xml - io.github.greycode + org.apache.shardingsphere shardingsphere-data-pipeline-cdc-client ${version} ``` -2. 启动 CDC Client +### CDC Client 介绍 + +`org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` 是 CDC Client 的入口类,用户可以通过该类和 CDC Server 进行交互。主要的和新方法如下。 -参考 [Example](https://github.com/apache/shardingsphere/blob/master/kernel/data-pipeline/scenario/cdc/client/src/test/java/org/apache/shardingsphere/data/pipeline/cdc/client/example/Bootstrap.java) 启动 CDC Client。 +| 方法名 | 返回值 | 说明 | +|-----------------------------------------------------------------------------------------------------------------------------|--------------------------------------|---------------------------------------------------------------------------------| +| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | 和服务端进行连接,连接的时候需要指定 1. 数据的消费处理逻辑 2. 消费时候的异常处理逻辑 3. 
服务端错误的异常处理逻辑 | +| login(CDCLoginParameter parameter) | void | CDC 登陆 CDCLoginParameter 参数 - username:用户名 - password:密码 | +| startStreaming(StartStreamingParameter parameter) | java.lang.String (CDC 任务唯一标识,用于后续操作) | 开启 CDC 订阅 StartStreamingParameter 参数 - database:逻辑库名称 - schemaTables:订阅的表名 - full:是否订阅全量数据 | +| restartStreaming(String streamingId) | void | 重启订阅 | +| stopStreaming(String streamingId) | void | 停止订阅 | +| dropStreaming(String streamingId) | void | 删除订阅 | +| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | +| close() | void | 关闭 channel,流程结束。 | diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md index a6520253b10b7..aaf95864d2ce2 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -11,29 +11,29 @@ CDC 协议使用 Protobuf,对应的 Protobuf 类型是根据 Java 中的类型 这里以 openGauss 为例,CDC 协议的数据类型和数据库类型的映射关系如下 -| openGauss 类型 | Java 数据类型 | CDC 对应的 protobuf 类型 | 备注 | -|------------------------------------------|--------------------|---------------------|----------------| -| INT1、INT2、INT4 | Integer | int32 | | -| INT8 | Long | int64 | | -| NUMERIC | BigDecimal | string | | -| FLOAT4 | Float | float | | -| FLOAT8 | Double | double | | -| BOOLEAN | Boolean | bool | | -| CHAR、VARCHAR、TEXT、CLOB | String | string | | -| BLOB、RAW、BYTEA | byte[] | bytes | | -| DATE、TIMESTAMP,TIMESTAMPTZ、SMALLDATETIME | java.sql.Timestamp | Timestamp | 不带时区信息 | -| TIME,TIMETZ | java.sql.Time | int64 | 代表当天的纳秒数(时区无关) | -| INTERVAL、reltime、abstime | String | string | | -| point、lseg、box、path、polygon、circle | String | string | | -| cidr、inet、macaddr | String | string | | -| tsvector | String | string | | -| UUID | String | string | | -| JSON、JSONB | String | string | | -| HLL | String | string | | -| 范围类型(int4range等) | String | string | | -| HASH16、HASH32 | String | string | | - -> 
需要注意对时间类型的处理,为了屏蔽时区的差异,CDC 返回的数据都是时区无关的 +| openGauss 类型 | Java 数据类型 | CDC 对应的 protobuf 类型 | 备注 | +|------------------------------------------|--------------------|---------------------|----------------------------------------| +| tinyint、smallint、integer | Integer | int32 | | +| bigint | Long | int64 | | +| numeric | BigDecimal | string | | +| real、float4 | Float | float | | +| binary_double、double precision | Double | double | | +| boolean | Boolean | bool | | +| char、varchar、text、clob | String | string | | +| blob、bytea、raw | byte[] | bytes | | +| date、timestamp、timestamptz、smalldatetime | java.sql.Timestamp | Timestamp | protobuf 的 Timestamp 类型只包含秒和纳秒,所以和时区无关 | +| time、timetz | java.sql.Time | int64 | 代表当天的纳秒数,和时区无关 | +| interval、reltime、abstime | String | string | | +| point、lseg、box、path、polygon、circle | String | string | | +| cidr、inet、macaddr | String | string | | +| tsvector | String | string | | +| tsquery | String | string | | +| uuid | String | string | | +| json、jsonb | String | string | | +| hll | String | string | | +| int4range、daterange、tsrange、tstzrange | String | string | | +| hash16、hash32 | String | string | | +| bit、bit varying | String | string | bit(1) 的时候返回 Boolean 类型 | ## openGauss 使用手册 @@ -109,61 +109,16 @@ DROP DATABASE IF EXISTS ds_1; CREATE DATABASE ds_1; ``` -#### 操作步骤 - -1. 在 `server.yaml` 中开启 CDC 功能。 - -```yaml -mode: - type: Cluster - repository: - type: ZooKeeper - props: - namespace: cdc - server-lists: localhost:2181 - retryIntervalMilliseconds: 500 - timeToLiveSeconds: 60 - maxRetries: 3 - operationTimeoutMilliseconds: 500 - -authority: - users: - - user: root@% - password: root - privilege: - type: ALL_PERMITTED - -#开启 GLT 功能参考 CDC 部署手册 -#transaction: -# defaultType: XA -# providerType: Atomikos -# -#globalClock: -# enabled: true -# type: TSO -# provider: redis -# props: -# host: 127.0.0.1 -# port: 6379 - -props: - system-log-level: INFO - check-table-metadata-enabled: false - proxy-default-port: 3307 # Proxy default port.
- cdc-server-port: 33071 # CDC server port - proxy-frontend-database-protocol-type: openGauss -``` - -2. 在 proxy 新建逻辑数据库并配置好存储单元和规则。 +#### 配置 CDC Server -2.1. 创建逻辑库。 +1. 创建逻辑库。 ```sql CREATE DATABASE sharding_db; \c sharding_db ``` -2.2. 注册存储单元。 +2. 注册存储单元。 ```sql REGISTER STORAGE UNIT ds_0 ( @@ -179,7 +134,7 @@ REGISTER STORAGE UNIT ds_0 ( ); ``` -2.3. 创建分片规则。 +3. 创建分片规则。 ```sql CREATE SHARDING TABLE RULE t_order( @@ -190,7 +145,7 @@ KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) ); ``` -2.4. 创建表和初始化数据 +4. 创建表和初始化数据 在 proxy 执行建表语句。 @@ -200,33 +155,9 @@ CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); ``` -3. 启动 CDC Client +#### 启动 CDC Client -先引入 CDC Client 依赖,在代码 - -这里先介绍下 `CDCClientConfiguration` 参数,构造 CDCClient 的时候需要传入该参数,该参数包含了 CDC Server 的地址,端口,以及 CDC 数据的消费逻辑。 - -```java -@RequiredArgsConstructor -@Getter -public final class CDCClientConfiguration { - - // CDC 的地址,和Proxy一致 - private final String address; - - // CDC 端口,和 server.yaml 的一致 - private final int port; - - // 数据消费的逻辑, 需要用户自行实现 - private final Consumer> dataConsumer; - - // 异常处理 handler,有个默认的实现 org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler,也可以自行实现相应的处理逻辑,比如出现错误后重连,或者停止 - private final ExceptionHandler exceptionHandler; - - // 超时时间,超过这个时间没收到服务器的响应,会认为请求失败。 - private final int timeoutMills; -} -``` +目前 CDC Client 只提供了 Java API,用户需要自行实现数据的消费逻辑。 下面是一个简单的启动 CDC Client 的示例。 @@ -235,7 +166,7 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient; import org.apache.shardingsphere.data.pipeline.cdc.client.config.CDCClientConfiguration; -import org.apache.shardingsphere.data.pipeline.cdc.client.handler.LoggerExceptionHandler; +import org.apache.shardingsphere.data.pipeline.cdc.client.handler.RetryStreamingExceptionHandler; import 
org.apache.shardingsphere.data.pipeline.cdc.client.parameter.CDCLoginParameter; import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.StartStreamingParameter; import org.apache.shardingsphere.data.pipeline.cdc.protocol.request.StreamDataRequestBody.SchemaTable; @@ -247,20 +178,17 @@ public final class Bootstrap { @SneakyThrows(InterruptedException.class) public static void main(final String[] args) { - // TODO records 的消费逻辑需要用户自行实现,这里只是简单打印下 - CDCClientConfiguration clientConfig = new CDCClientConfiguration("127.0.0.1", 33071, records -> log.info("records: {}", records), new LoggerExceptionHandler()); - try (CDCClient cdcClient = new CDCClient(clientConfig)) { - // 1. 先调用 connect 连接到 CDC Server - cdcClient.connect(); - // 2. 调用登陆的逻辑,用户名密码和 server.yaml 配置文件中的一致 + String address = "127.0.0.1"; + // 构造 CDCClient,传入 CDCClientConfiguration,CDCClientConfiguration 中包含了 CDC Server 的地址和端口,以及超时时间 + try (CDCClient cdcClient = new CDCClient(new CDCClientConfiguration(address, 33071, 10000))) { + // 先调用 connect 连接到 CDC Server,需要传入 1. 数据的消费处理逻辑 2. 消费时候的异常处理逻辑 3. 服务端错误的异常处理逻辑 + cdcClient.connect(records -> log.info("records: {}", records), new RetryStreamingExceptionHandler(cdcClient, 5, 5000), + (ctx, result) -> log.error("Server error: {}", result.getErrorMessage())); cdcClient.login(new CDCLoginParameter("root", "root")); - // 3. 开启 CDC 数据订阅,用户只需要传入逻辑库和逻辑表,不需要关注底层数据分片情况,CDC Server 会将数据聚合后推送 + // 开始 CDC 数据同步,返回的 streamingId 是这次 CDC 任务的唯一标识,CDC Server 生成唯一标识的依据是 订阅的数据库名称 + 订阅的表 + 是否是全量同步 String streamingId = cdcClient.startStreaming(new StartStreamingParameter("sharding_db", Collections.singleton(SchemaTable.newBuilder().setTable("t_order").build()), true)); log.info("Streaming id={}", streamingId); - // stopStreaming 和 restartStreaming 非必需的操作,分别表示停止订阅和重启订阅 - // cdcClient.stopStreaming(streamingId); - // cdcClient.restartStreaming(streamingId); - // 4. 
这里是阻塞线程,确保 CDC Client 一直运行。 + // 防止 main 主线程退出 cdcClient.await(); } } @@ -277,9 +205,9 @@ public final class Bootstrap { 如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) -4. 通过 DistSQL 查看 CDC 任务状态 +#### 查看 CDC 任务运行情况 -CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以在 proxy 中执行对应的 DistSQL 查看 CDC 任务的运行情况 +CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以通过在 proxy 中执行 DistSQL 查看 CDC 任务状态 1. 查看 CDC 任务列表 @@ -314,9 +242,9 @@ sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; -此时也会删除 openGauss 物理库上的 replication slots +只有当 CDC 任务没有订阅的时候才可以删除,此时也会删除 openGauss 物理库上的 replication slots ``` sharding_db=> DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; SUCCESS -``` \ No newline at end of file +``` From 237bc269bb30787bc55c5ceacb57673c247cff69 Mon Sep 17 00:00:00 2001 From: Xinze Guo <101622833+azexcy@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:09:32 +0800 Subject: [PATCH 4/9] Add alter streaming rule doc --- .../shardingsphere-proxy/cdc/build.cn.md | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index 394366a07ea26..388bc62ecaef4 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -116,6 +116,81 @@ sh bin/start.sh 确认启动成功。 +6. 按需配置迁移 + +6.1. 
查询配置。 + +```sql +SHOW STREAMING RULE; +``` + +默认配置如下: + +``` ++--------------------------------------------------------------+--------------------------------------+-------------------------------------------------------+ +| read | write | stream_channel | ++--------------------------------------------------------------+--------------------------------------+-------------------------------------------------------+ +| {"workerThread":20,"batchSize":1000,"shardingSize":10000000} | {"workerThread":20,"batchSize":1000} | {"type":"MEMORY","props":{"block-queue-size":"2000"}} | ++--------------------------------------------------------------+--------------------------------------+-------------------------------------------------------+ +``` + +6.2. 修改配置(可选)。 + +因 streaming rule 具有默认值,无需创建,仅提供 ALTER 语句。 + +完整配置 DistSQL 示例: + +```sql +ALTER STREAMING RULE ( +READ( + WORKER_THREAD=20, + BATCH_SIZE=1000, + SHARDING_SIZE=10000000, + RATE_LIMITER (TYPE(NAME='QPS',PROPERTIES('qps'='500'))) +), +WRITE( + WORKER_THREAD=20, + BATCH_SIZE=1000, + RATE_LIMITER (TYPE(NAME='TPS',PROPERTIES('tps'='2000'))) +), +STREAM_CHANNEL (TYPE(NAME='MEMORY',PROPERTIES('block-queue-size'='2000'))) +); +``` + +配置项说明: + +```sql +ALTER STREAMING RULE ( +READ( -- 数据读取配置。如果不配置则部分参数默认生效。 + WORKER_THREAD=20, -- 从源端摄取全量数据的线程池大小。如果不配置则使用默认值。 + BATCH_SIZE=1000, -- 一次查询操作返回的最大记录数。如果不配置则使用默认值。 + SHARDING_SIZE=10000000, -- 存量数据分片大小。如果不配置则使用默认值。 + RATE_LIMITER ( -- 限流算法。如果不配置则不限流。 + TYPE( -- 算法类型。可选项:QPS + NAME='QPS', + PROPERTIES( -- 算法属性 + 'qps'='500' + ))) +), +WRITE( -- 数据写入配置。如果不配置则部分参数默认生效。 + WORKER_THREAD=20, -- 数据写入到目标端的线程池大小。如果不配置则使用默认值。 + BATCH_SIZE=1000, -- 存量任务一次批量写入操作的最大记录数。如果不配置则使用默认值。 + RATE_LIMITER ( -- 限流算法。如果不配置则不限流。 + TYPE( -- 算法类型。可选项:TPS + NAME='TPS', + PROPERTIES( -- 算法属性 + 'tps'='2000' + ))) +), +STREAM_CHANNEL ( -- 数据通道,连接生产者和消费者,用于 read 和 write 环节。如果不配置则默认使用 MEMORY 类型。 +TYPE( -- 算法类型。可选项:MEMORY +NAME='MEMORY', +PROPERTIES( -- 算法属性 +'block-queue-size'='2000' -- 属性:阻塞队列大小,堆内存比较小的时候需要调小该值。 
+))) +); +``` + ## CDC Client 手册 CDC Client 不需要额外部署,只需要通过 maven 引入 CDC Client 的依赖就可以在项目中使用。用户可以通过 CDC Client 和服务端进行交互。 From 469ee6331df2e6085b6f7d680b6d705993bd9f60 Mon Sep 17 00:00:00 2001 From: Xinze Guo <101622833+azexcy@users.noreply.github.com> Date: Fri, 22 Dec 2023 16:24:54 +0800 Subject: [PATCH 5/9] Update doc --- .../shardingsphere-proxy/cdc/build.cn.md | 24 +- .../shardingsphere-proxy/cdc/build.en.md | 220 ++++++++++++++- .../shardingsphere-proxy/cdc/usage.cn.md | 11 + .../shardingsphere-proxy/cdc/usage.en.md | 257 +++++++++++++++++- 4 files changed, 498 insertions(+), 14 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index 388bc62ecaef4..e57f8fb880f9f 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -70,7 +70,7 @@ authority: privilege: type: ALL_PERMITTED -# 开启 GLT 的时候也需要打开分布式事务 +# 使用 GLT 的时候也需要开启分布式事务 #transaction: # defaultType: XA # providerType: Atomikos @@ -162,7 +162,7 @@ STREAM_CHANNEL (TYPE(NAME='MEMORY',PROPERTIES('block-queue-size'='2000'))) ```sql ALTER STREAMING RULE ( READ( -- 数据读取配置。如果不配置则部分参数默认生效。 - WORKER_THREAD=20, -- 从源端摄取全量数据的线程池大小。如果不配置则使用默认值。 + WORKER_THREAD=20, -- 从源端摄取全量数据的线程池大小。如果不配置则使用默认值。需要确保该值不低于分库的数量 BATCH_SIZE=1000, -- 一次查询操作返回的最大记录数。如果不配置则使用默认值。 SHARDING_SIZE=10000000, -- 存量数据分片大小。如果不配置则使用默认值。 RATE_LIMITER ( -- 限流算法。如果不配置则不限流。 @@ -209,13 +209,13 @@ CDC Client 不需要额外部署,只需要通过 maven 引入 CDC Client 的 `org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` 是 CDC Client 的入口类,用户可以通过该类和 CDC Server 进行交互。主要的和新方法如下。 -| 方法名 | 返回值 | 说明 | -|-----------------------------------------------------------------------------------------------------------------------------|--------------------------------------|---------------------------------------------------------------------------------| -| 
connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | 和服务端进行连接,连接的时候需要指定 1. 数据的消费处理逻辑 2. 消费时候的异常处理逻辑 3. 服务端错误的异常处理逻辑 | -| login(CDCLoginParameter parameter) | void | CDC 登陆 CDCLoginParameter 参数 - username:用户名 - password:密码 | -| startStreaming(StartStreamingParameter parameter) | java.lang.String (CDC 任务唯一标识,用于后续操作) | 开启 CDC 订阅 StartStreamingParameter 参数 - database:逻辑库名称 - schemaTables:订阅的表名 - full:是否订阅全量数据 | -| restartStreaming(String streamingId) | void | 重启订阅 | -| stopStreaming(String streamingId) | void | 停止订阅 | -| dropStreaming(String streamingId) | void | 删除订阅 | -| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | -| close() | void | 关闭 channel,流程结束。 | +| 方法名 | 返回值 | 说明 | +|-----------------------------------------------------------------------------------------------------------------------------|------------------------------------|---------------------------------------------------------------------------------------------------------| +| `connect(Consumer<List<Record>> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler)` | void | 和服务端进行连接,连接的时候需要指定<br/>1. 数据的消费处理逻辑<br/>2. 消费时候的异常处理逻辑<br/>3. 服务端错误的异常处理逻辑 | +| login(CDCLoginParameter parameter) | void | CDC 登陆,参数<br/>username:用户名<br/>password:密码 | +| startStreaming(StartStreamingParameter parameter) | String (CDC 任务唯一标识,用于后续操作) | 开启 CDC 订阅,StartStreamingParameter 参数<br/>database:逻辑库名称<br/>schemaTables:订阅的表名<br/>full:是否订阅全量数据 | +| restartStreaming(String streamingId) | void | 重启订阅 | +| stopStreaming(String streamingId) | void | 停止订阅 | +| dropStreaming(String streamingId) | void | 删除订阅 | +| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | +| close() | void | 关闭 channel,流程结束。 | diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md index eae2a595db983..03cbb6ad0e5ac 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md @@ -3,5 +3,219 @@ title = "Build" weight = 1 +++ -TODO +## Background Information + +ShardingSphere CDC is divided into two parts, one is the CDC Server, and the other is the CDC Client. The CDC Server and ShardingSphere-Proxy are currently deployed together. + +Users can introduce the CDC Client into their own projects to implement data consumption logic. + +## Constraints + +- Pure Java development; JDK 1.8 or above is recommended. +- The CDC Server requires ShardingSphere-Proxy to use cluster mode, and currently supports ZooKeeper as the registry center. +- CDC only synchronizes data, does not synchronize table structure, and currently does not support DDL statement synchronization. +- The CDC incremental phase will output data according to the dimension of the transaction. If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. + +## CDC Server Deployment Steps + +Here, the openGauss database is used as an example to introduce the deployment steps of the CDC Server. + +Since the CDC Server is built into the ShardingSphere-Proxy, you need to get the ShardingSphere-Proxy.
For details, please refer to the [proxy startup manual](/cn/user-manual/shardingsphere-proxy/startup/bin/). + +### Configure the GLT Module (Optional) + +The official binary package does not include the GLT module by default and does not guarantee the integrity of cross-database transactions. If you are using the openGauss database with the GLT function, you can additionally introduce the GLT module to ensure the integrity of cross-database transactions. + +There are currently two ways to introduce the GLT module, and you also need to make corresponding configurations in server.yaml. + +#### 1. Source code compilation and installation + +1. Prepare the code environment, download in advance or use Git clone to download the [ShardingSphere](https://github.com/apache/shardingsphere.git) source code from Github. +2. Delete the `provided` tag of the shardingsphere-global-clock-tso-provider-redis dependency in kernel/global-clock/type/tso/core/pom.xml and the `provided` tag of jedis in kernel/global-clock/type/tso/provider/redis/pom.xml +3. Compile ShardingSphere-Proxy, for specific compilation steps, please refer to the [ShardingSphere Compilation Manual](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere). + +#### 2. Directly introduce GLT dependencies + +Can be introduced from the maven repository + +1. [shardingsphere-global-clock-tso-provider-redis](https://repo1.maven.org/maven2/org/apache/shardingsphere/shardingsphere-global-clock-tso-provider-redis), download the same version as ShardingSphere-Proxy +2. [jedis-4.3.1](https://repo1.maven.org/maven2/redis/clients/jedis/4.3.1/jedis-4.3.1.jar) + +### CDC Server User Manual + +1. Modify the configuration file `conf/server.yaml`, turn on the CDC function. Currently, `mode` must be `Cluster`, and the corresponding registry center needs to be started in advance. If the GLT provider uses Redis, Redis needs to be started in advance. + +Configuration example: + +1. 
Enable the CDC function in `server.yaml`. + +```yaml +mode: + type: Cluster + repository: + type: ZooKeeper + props: + namespace: cdc_demo + server-lists: localhost:2181 + retryIntervalMilliseconds: 500 + timeToLiveSeconds: 60 + maxRetries: 3 + operationTimeoutMilliseconds: 500 + +authority: + users: + - user: root@% + password: root + privilege: + type: ALL_PERMITTED + +# When using GLT, you also need to enable distributed transactions +#transaction: +# defaultType: XA +# providerType: Atomikos +# +#globalClock: +# enabled: true +# type: TSO +# provider: redis +# props: +# host: 127.0.0.1 +# port: 6379 + +props: + system-log-level: INFO + check-table-metadata-enabled: false + proxy-default-port: 3307 # Proxy default port. + cdc-server-port: 33071 # CDC Server port, must be configured + #proxy-frontend-database-protocol-type: openGauss # Consistent with the type of backend database +``` + +2. Introduce JDBC driver. + +The proxy already includes the PostgreSQL JDBC driver. + +If the backend connects to the following databases, please download the corresponding JDBC driver jar package and put it in the `${shardingsphere-proxy}/ext-lib` directory. + +| Database | JDBC Driver | +|-----------|---------------------------------------------------------------------------------------------------------------------------------| +| MySQL | [mysql-connector-java-8.0.31.jar](https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.31/) | +| openGauss | [opengauss-jdbc-3.1.1-og.jar](https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar) | + +4. Start ShardingSphere-Proxy: + +``` +sh bin/start.sh +``` + +5. View the proxy log `logs/stdout.log`, see in the log: + +``` +[INFO ] [main] o.a.s.p.frontend.ShardingSphereProxy - ShardingSphere-Proxy Cluster mode started successfully +``` + +Confirm successful startup. + +6. Configure migration as needed + +6.1. Query configuration. 
+ +```sql +SHOW STREAMING RULE; +``` + +The default configuration is as follows: + +``` ++--------------------------------------------------------------+--------------------------------------+-------------------------------------------------------+ +| read | write | stream_channel | ++--------------------------------------------------------------+--------------------------------------+-------------------------------------------------------+ +| {"workerThread":20,"batchSize":1000,"shardingSize":10000000} | {"workerThread":20,"batchSize":1000} | {"type":"MEMORY","props":{"block-queue-size":"2000"}} | ++--------------------------------------------------------------+--------------------------------------+-------------------------------------------------------+ +``` + +6.2. Modify configuration (optional). + +Because the streaming rule has default values, no creation is required, only the ALTER statement is provided. + +Complete configuration DistSQL example: + +```sql +ALTER STREAMING RULE ( +READ( + WORKER_THREAD=20, + BATCH_SIZE=1000, + SHARDING_SIZE=10000000, + RATE_LIMITER (TYPE(NAME='QPS',PROPERTIES('qps'='500'))) +), +WRITE( + WORKER_THREAD=20, + BATCH_SIZE=1000, + RATE_LIMITER (TYPE(NAME='TPS',PROPERTIES('tps'='2000'))) +), +STREAM_CHANNEL (TYPE(NAME='MEMORY',PROPERTIES('block-queue-size'='2000'))) +); +``` + +Configuration item description: + +```sql +ALTER STREAMING RULE ( +READ( -- Data reading configuration. If not configured, some parameters will take effect by default. + WORKER_THREAD=20, -- The size of the thread pool for fetching full data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of sub-libraries + BATCH_SIZE=1000, -- The maximum number of records returned by a query operation. If not configured, the default value will be used. + SHARDING_SIZE=10000000, -- The size of the stock data partition. If not configured, the default value will be used. 
+ RATE_LIMITER ( -- Rate limiting algorithm. If not configured, no rate limiting. + TYPE( -- Algorithm type. Optional: QPS + NAME='QPS', + PROPERTIES( -- Algorithm properties + 'qps'='500' + ))) +), +WRITE( -- Data writing configuration. If not configured, some parameters will take effect by default. + WORKER_THREAD=20, -- The size of the thread pool for writing data to the target end. If not configured, the default value will be used. + BATCH_SIZE=1000, -- The maximum number of records for a batch write operation of a stock task. If not configured, the default value will be used. + RATE_LIMITER ( -- Rate limiting algorithm. If not configured, no rate limiting. + TYPE( -- Algorithm type. Optional: TPS + NAME='TPS', + PROPERTIES( -- Algorithm properties + 'tps'='2000' + ))) +), +STREAM_CHANNEL ( -- Data channel, connecting producers and consumers, used for read and write links. If not configured, the MEMORY type is used by default. +TYPE( -- Algorithm type. Optional: MEMORY +NAME='MEMORY', +PROPERTIES( -- Algorithm properties +'block-queue-size'='2000' -- Property: Blocking queue size, when the heap memory is relatively small, this value needs to be reduced. +))) +); +``` + +## CDC Client Manual + +The CDC Client does not need to be deployed separately, just introduce the CDC Client's dependency through maven to use it in the project. Users can interact with the server through the CDC Client. + +If necessary, users can also implement a CDC Client themselves to consume data and ACK. + +```xml +<dependency> + <groupId>org.apache.shardingsphere</groupId> + <artifactId>shardingsphere-data-pipeline-cdc-client</artifactId> + <version>${version}</version> +</dependency> +``` + +### CDC Client Introduction + +`org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` is the entry class of the CDC Client, users can interact with the CDC Server through this class. The main new methods are as follows.
+ +| Method Name | Return Value | Description | +|-----------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | Connect to the server, when connecting, you need to specify 1. Data consumption processing logic 2. Exception handling logic during consumption 3. Server error exception handling logic | +| login(CDCLoginParameter parameter) | void | CDC login CDCLoginParameter parameter - username: username - password: password | +| startStreaming(StartStreamingParameter parameter) | java.lang.String (CDC task unique identifier, used for subsequent operations) | Start CDC subscription StartStreamingParameter parameter - database: logical library name - schemaTables: subscribed table name - full: whether to subscribe to full data | +| restartStreaming(String streamingId) | void | Restart subscription | +| stopStreaming(String streamingId) | void | Stop subscription | +| dropStreaming(String streamingId) | void | Delete subscription | +| await() | void | Block the CDC thread, waiting for the channel to close | +| close() | void | Close the channel, the process ends. 
| diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md index aaf95864d2ce2..a2308d3988f64 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -5,6 +5,8 @@ weight = 2 ## CDC 功能介绍 +CDC 只会同步数据,不会同步表结构,目前也不支持 DDL 的语句同步。 + ### CDC 协议介绍 CDC 协议使用 Protobuf,对应的 Protobuf 类型是根据 Java 中的类型来映射的。 @@ -248,3 +250,12 @@ DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; sharding_db=> DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; SUCCESS ``` + +# 注意事项 + +## 关于数据增量推送的说明 + +1. CDC 增量推送目前是按照事务维度的,物理库的事务不会被拆分,所以如果一个事务中有多个表的数据变更,那么这些数据变更会被一起推送。 +如果要支持 XA 事务(目前只支持 openGauss),则 openGauss 和 Proxy 都需要 GLT 模块。 +2. 在存在超大事务的情况下,目前的处理逻辑可能会导致 CDC 所在的进程 OOM,后续可能会强制截断。 +3. 满足推送的条件是满足了一定大小的数据量或者到了一定的时间间隔(目前是 300ms),在处理 XA 事务时,收到的多个分库增量事件超过了 300ms,可能会导致 XA 事务被拆开推送。 \ No newline at end of file diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md index 2e6bde2fefb48..e7b8cb910571e 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md @@ -3,4 +3,259 @@ title = "Manual" weight = 2 +++ -TODO \ No newline at end of file +## CDC 功能介绍 + +CDC will only synchronize data, it will not synchronize table structures, and currently does not support the synchronization of DDL statements. + +### CDC 协议介绍 + +The CDC protocol uses Protobuf, and the corresponding Protobuf types are mapped based on the types in Java. + +Here, taking openGauss as an example, the mapping relationship between the data types of the CDC protocol and the database types is as follows. 
+ +| openGauss type | Java data type | protobuf type corresponding to CDC | Remarks | +|------------------------------------------|--------------------|------------------------------------|---------------------------------------------------------------------------------------------------------------| +| tinyint,smallint,integer | Integer | int32 | | +| bigint | Long | int64 | | +| numeric | BigDecimal | string | | +| real,float4 | Float | float | | +| binary_double,double precision | Double | double | | +| boolean | Boolean | bool | | +| char,varchar,text,clob | String | string | | +| blob,bytea,raw | byte[] | bytes | | +| date,timestamp,timestamptz,smalldatetime | java.sql.Timestamp | Timestamp | The Timestamp type of protobuf only contains seconds and nanoseconds, so it has nothing to do with time zones | +| time,timetz | java.sql.Time | int64 | Represents the number of nanoseconds in the day, regardless of time zone | +| interval,reltime,abstime | String | string | | +| point,lseg,box,path,polygon,circle | String | string | | +| cidr,inet,macaddr | String | string | | +| tsvector | String | string | | +| tsquery | String | String | | +| uuid | String | string | | +| json,jsonb | String | string | | +| hll | String | string | | +| int4range,daterange,tsrange,tstzrange | String | string | | +| hash16,hash32 | String | string | | +| bit,bit varying | String | string | bit(1) returns Boolean type | + +## openGauss User Manual + +### 环境要求 + +支持的 openGauss 版本:2.x ~ 3.x。 + +### 权限要求 + +1. 调整源端 WAL 配置。 + +`postgresql.conf` 示例配置: +``` +wal_level = logical +max_wal_senders = 10 +max_replication_slots = 10 +wal_sender_timeout = 0 +max_connections = 600 +``` + +详情请参见 [Write Ahead Log](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/settings.html) 和 [Replication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/sending-server.html)。 + +2. 
赋予源端 openGauss 账号 replication 权限。 + +`pg_hba.conf` 示例配置: + +``` +host replication repl_acct 0.0.0.0/0 md5 +# 0.0.0.0/0 表示允许任意 IP 地址访问,可以根据实际情况调整成 CDC Server 的 IP 地址 +``` + +详情请参见 [Configuring Client Access Authentication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/configuring-client-access-authentication.html) 和 [Example: Logic Replication Code](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/example-logic-replication-code.html)。 + +3. 赋予 openGauss 账号 DDL DML 权限。 + +如果使用非超级管理员账号,要求该账号在用到的数据库上,具备 CREATE 和 CONNECT 的权限。 + +示例: +```sql +GRANT CREATE, CONNECT ON DATABASE source_ds TO cdc_user; +``` + +还需要账号对迁移的表和 schema 具备访问权限,以 test schema 下的 t_order 表为例。 + +```sql +\c source_ds + +GRANT USAGE ON SCHEMA test TO GROUP cdc_user; +GRANT SELECT ON TABLE test.t_order TO cdc_user; +``` + +openGauss 有 OWNER 的概念,如果是数据库,SCHEMA,表的 OWNER,则可以省略对应的授权步骤。 + +openGauss 不允许普通账户在 public schema 下操作。所以如果迁移的表在 public schema 下,需要额外授权。 + +```sql +GRANT ALL PRIVILEGES TO cdc_user; +``` + +详情请参见 [openGauss GRANT](https://docs.opengauss.org/zh/docs/2.0.1/docs/Developerguide/GRANT.html) + +### 完整流程示例 + +#### 前提条件 + +1. 准备好 CDC 源端的库、表、数据。 + +```sql +DROP DATABASE IF EXISTS ds_0; +CREATE DATABASE ds_0; + +DROP DATABASE IF EXISTS ds_1; +CREATE DATABASE ds_1; +``` + +#### 配置 CDC Server + +1. 创建逻辑库。 + +```sql +CREATE DATABASE sharding_db; + +\c sharding_db +``` +2. 注册存储单元。 + +```sql +REGISTER STORAGE UNIT ds_0 ( + URL="jdbc:opengauss://127.0.0.1:5432/ds_0", + USER="gaussdb", + PASSWORD="Root@123", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +), ds_1 ( + URL="jdbc:opengauss://127.0.0.1:5432/ds_1", + USER="gaussdb", + PASSWORD="Root@123", + PROPERTIES("minPoolSize"="1","maxPoolSize"="20","idleTimeout"="60000") +); +``` + +3. 
创建分片规则。 + +```sql +CREATE SHARDING TABLE RULE t_order( +STORAGE_UNITS(ds_0,ds_1), +SHARDING_COLUMN=order_id, +TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="2")), +KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) +); +``` + +4. 创建表和初始化数据 + +在 proxy 执行建表语句。 + +```sql +CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); + +INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +``` + +#### 启动 CDC Client + +目前 CDC Client 只提供了 Java API,用户需要自行实现数据的消费逻辑。 + +下面是一个简单的启动 CDC Client 的示例。 + +```java +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient; +import org.apache.shardingsphere.data.pipeline.cdc.client.config.CDCClientConfiguration; +import org.apache.shardingsphere.data.pipeline.cdc.client.handler.RetryStreamingExceptionHandler; +import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.CDCLoginParameter; +import org.apache.shardingsphere.data.pipeline.cdc.client.parameter.StartStreamingParameter; +import org.apache.shardingsphere.data.pipeline.cdc.protocol.request.StreamDataRequestBody.SchemaTable; + +import java.util.Collections; + +@Slf4j +public final class Bootstrap { + + @SneakyThrows(InterruptedException.class) + public static void main(final String[] args) { + String address = "127.0.0.1"; + // 构造 CDCClient,传入 CDCClientConfiguration,CDCClientConfiguration 中包含了 CDC Server 的地址和端口,以及超时时间 + try (CDCClient cdcClient = new CDCClient(new CDCClientConfiguration(address, 33071, 10000))) { + // 先调用 connect 连接到 CDC Server,需要传入 1. 数据的消费处理逻辑 2. 消费时候的异常处理逻辑 3. 
服务端错误的异常处理逻辑 + cdcClient.connect(records -> log.info("records: {}", records), new RetryStreamingExceptionHandler(cdcClient, 5, 5000), + (ctx, result) -> log.error("Server error: {}", result.getErrorMessage())); + cdcClient.login(new CDCLoginParameter("root", "root")); + // 开始 CDC 数据同步,返回的 streamingId 是这次 CDC 任务的唯一标识,CDC Server 生成唯一标识的依据是 订阅的数据库名称 + 订阅的表 + 是否是全量同步 + String streamingId = cdcClient.startStreaming(new StartStreamingParameter("sharding_db", Collections.singleton(SchemaTable.newBuilder().setTable("t_order").build()), true)); + log.info("Streaming id={}", streamingId); + // 防止 main 主线程退出 + cdcClient.await(); + } + } +} +``` + +主要有4个步骤 +1. 构造 CDCClient,传入 CDCClientConfiguration +2. 调用 CDCClient.connect,这一步是和 CDC Server 建立连接 +3. 调用 CDCClient.login,使用 server.yaml 中配置好的用户名和密码登录 +4. 调用 CDCClient.startStreaming,开启订阅,需要保证订阅的库和表在 ShardingSphere-Proxy 存在,否则会报错。 + +> CDCClient.await 是阻塞主线程,非必需的步骤,用其他方式也可以,只要保证 CDC 线程一直在工作就行。 + +如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) + +#### 查看 CDC 任务运行情况 + +CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以通过在 proxy 中执行 DistSQL 查看 CDC 任务状态 + +1. 查看 CDC 任务列表 + +SHOW STREAMING LIST; + +运行结果 + +``` +sharding_db=> SHOW STREAMING LIST; + id | database | tables | job_item_count | active | create_time | stop_time +--------------------------------------------+-------------+---------+----------------+--------+---------------------+----------- + j0302p0000702a83116fcee83f70419ca5e2993791 | sharding_db | t_order | 1 | true | 2023-10-27 22:01:27 | +(1 row) +``` + +2. 
查看 CDC 任务详情 + +SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; + +运行结果 + +``` +sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; + item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | error_message +------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------- + 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | true | 1 | 100 | 101 | + 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | true | 2 | 100 | 100 | +(2 rows) +``` + +3. 删除 CDC 任务 + +DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; + +只有当 CDC 任务没有订阅的时候才可以删除,此时也会删除 openGauss 物理库上的 replication slots + +``` +sharding_db=> DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; +SUCCESS +``` + +# 注意事项 + +## 关于数据增量推送的说明 + +1. CDC 增量推送目前是按照事务维度的,物理库的事务不会被拆分,所以如果一个事务中有多个表的数据变更,那么这些数据变更会被一起推送。 +如果要支持 XA 事务(目前只支持 openGauss),则 openGauss 和 Proxy 都需要 GLT 模块。 +2. 在存在超大事务的情况下,目前的处理逻辑可能会导致 CDC 所在的进程 OOM,后续可能会强制截断。 +3. 
满足推送的条件是满足了一定大小的数据量或者到了一定的时间间隔(目前是 300ms),在处理 XA 事务时,收到的多个分库增量事件超过了 300ms,可能会导致 XA 事务被拆开推送。 From c4a4082a59cc8bd8d7a20bef70942103a746909f Mon Sep 17 00:00:00 2001 From: Xinze Guo <101622833+azexcy@users.noreply.github.com> Date: Sun, 24 Dec 2023 22:08:45 +0800 Subject: [PATCH 6/9] Update doc --- .../shardingsphere-proxy/cdc/_index.cn.md | 4 ++- .../shardingsphere-proxy/cdc/build.cn.md | 29 ++++++++++--------- .../shardingsphere-proxy/cdc/usage.cn.md | 14 +++++++-- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md index 2242d67887914..d33beb7884df7 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md @@ -5,4 +5,6 @@ weight = 9 ## 简介 -用户可以通过 ShardingSphere 的 CDC 功能进行数据同步,也可用于 ETL,目前支持 openGauss、MySQL 和 PostgreSQL +CDC(Change Data Capture)增量数据捕捉。CDC 可以监控 ShardingSphere-Proxy 的存储节点中的数据变化,捕捉到数据操作事件,过滤并提取有用信息,最终将这些变化数据发送到指定的目标上. + +CDC可以用于数据同步,数据备份和恢复等方面。通常情况下,目前支持 openGauss、MySQL 和 PostgreSQL. diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index e57f8fb880f9f..978322362338b 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -30,17 +30,20 @@ ShardingSphere CDC 分为两个部分,一个是 CDC Server,另一个是 CDC #### 1. 源码编译安装 -1. 准备代码环境,提前下载或者使用 Git clone,从 Github 下载 [ShardingSphere](https://github.com/apache/shardingsphere.git) 源码。 -2. 
删除 kernel/global-clock/type/tso/core/pom.xml 中 shardingsphere-global-clock-tso-provider-redis 依赖的 `provided` 标签和 kernel/global-clock/type/tso/provider/redis/pom.xml 中 jedis +1.1 准备代码环境,提前下载或者使用 Git clone,从 Github 下载 [ShardingSphere](https://github.com/apache/shardingsphere.git) 源码。 + +1.2 删除 kernel/global-clock/type/tso/core/pom.xml 中 shardingsphere-global-clock-tso-provider-redis 依赖的 `provided` 标签和 kernel/global-clock/type/tso/provider/redis/pom.xml 中 jedis 的 `provided` 标签 -3. 编译 ShardingSphere-Proxy,具体编译步骤请参考 [ShardingSphere 编译手册](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere)。 + +1.3 编译 ShardingSphere-Proxy,具体编译步骤请参考 [ShardingSphere 编译手册](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere)。 #### 2. 直接引入 GLT 依赖 可以从 maven 仓库中引入 -1. [shardingsphere-global-clock-tso-provider-redis](https://repo1.maven.org/maven2/org/apache/shardingsphere/shardingsphere-global-clock-tso-provider-redis),下载和 ShardingSphere-Proxy 同名版本 -2. [jedis-4.3.1](https://repo1.maven.org/maven2/redis/clients/jedis/4.3.1/jedis-4.3.1.jar) +2.1. [shardingsphere-global-clock-tso-provider-redis](https://repo1.maven.org/maven2/org/apache/shardingsphere/shardingsphere-global-clock-tso-provider-redis),下载和 ShardingSphere-Proxy 同名版本 + +2.2. [jedis-4.3.1](https://repo1.maven.org/maven2/redis/clients/jedis/4.3.1/jedis-4.3.1.jar) ### CDC Server 使用手册 @@ -116,7 +119,7 @@ sh bin/start.sh 确认启动成功。 -6. 按需配置迁移 +6. 按需配置 CDC 任务同步配置 6.1. 
查询配置。 @@ -162,10 +165,10 @@ STREAM_CHANNEL (TYPE(NAME='MEMORY',PROPERTIES('block-queue-size'='2000'))) ```sql ALTER STREAMING RULE ( READ( -- 数据读取配置。如果不配置则部分参数默认生效。 - WORKER_THREAD=20, -- 从源端摄取全量数据的线程池大小。如果不配置则使用默认值。需要确保该值不低于分库的数量 - BATCH_SIZE=1000, -- 一次查询操作返回的最大记录数。如果不配置则使用默认值。 - SHARDING_SIZE=10000000, -- 存量数据分片大小。如果不配置则使用默认值。 - RATE_LIMITER ( -- 限流算法。如果不配置则不限流。 + WORKER_THREAD=20, -- 影响全量、增量任务,从源端摄取数据的线程池大小。不配置则使用默认值。需要确保该值不低于分库的数量 + BATCH_SIZE=1000, -- 影响全量、增量任务,一次查询操作返回的最大记录数。如果一个事务中的数据量大于该值,增量情况下可能超过设定的值。 + SHARDING_SIZE=10000000, -- 影响全量任务,存量数据分片大小。如果不配置则使用默认值。 + RATE_LIMITER ( -- 影响全量、增量任务,限流算法。如果不配置则不限流。 TYPE( -- 算法类型。可选项:QPS NAME='QPS', PROPERTIES( -- 算法属性 @@ -173,8 +176,8 @@ READ( -- 数据读取配置。如果不配置则部分参数默认生效。 ))) ), WRITE( -- 数据写入配置。如果不配置则部分参数默认生效。 - WORKER_THREAD=20, -- 数据写入到目标端的线程池大小。如果不配置则使用默认值。 - BATCH_SIZE=1000, -- 存量任务一次批量写入操作的最大记录数。如果不配置则使用默认值。 + WORKER_THREAD=20, -- 影响全量、增量任务,数据写入到目标端的线程池大小。如果不配置则使用默认值。 + BATCH_SIZE=1000, -- 影响全量、增量任务,存量任务一次批量写入操作的最大记录数。如果不配置则使用默认值。如果一个事务中的数据量大于该值,增量情况下可能超过设定的值。 RATE_LIMITER ( -- 限流算法。如果不配置则不限流。 TYPE( -- 算法类型。可选项:TPS NAME='TPS', @@ -186,7 +189,7 @@ STREAM_CHANNEL ( -- 数据通道,连接生产者和消费者,用于 read 和 TYPE( -- 算法类型。可选项:MEMORY NAME='MEMORY', PROPERTIES( -- 算法属性 -'block-queue-size'='2000' -- 属性:阻塞队列大小,堆内存比较小的时候需要调小该值。 +'block-queue-size'='2000' -- 属性:阻塞队列大小 ))) ); ``` diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md index a2308d3988f64..1ed2db9aea6a7 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -253,9 +253,17 @@ SUCCESS # 注意事项 -## 关于数据增量推送的说明 +## 增量数据推送的说明 1. CDC 增量推送目前是按照事务维度的,物理库的事务不会被拆分,所以如果一个事务中有多个表的数据变更,那么这些数据变更会被一起推送。 如果要支持 XA 事务(目前只支持 openGauss),则 openGauss 和 Proxy 都需要 GLT 模块。 -2. 在存在超大事务的情况下,目前的处理逻辑可能会导致 CDC 所在的进程 OOM,后续可能会强制截断。 -3. 
满足推送的条件是满足了一定大小的数据量或者到了一定的时间间隔(目前是 300ms),在处理 XA 事务时,收到的多个分库增量事件超过了 300ms,可能会导致 XA 事务被拆开推送。 \ No newline at end of file +2. 满足推送的条件是满足了一定大小的数据量或者到了一定的时间间隔(目前是 300ms),在处理 XA 事务时,收到的多个分库增量事件超过了 300ms,可能会导致 XA 事务被拆开推送。 + +## 超大事务的处理 + +目前是将大事务完整解析,这样可能会导致 CDC Server 进程 OOM,后续可能会考虑强制截断。 + +## 建议的配置 + +1. 如果有限流的要求,读写推荐只配置一侧,另一侧会自动受到限流的影响。 +2. CDC 的任务配置需要根据实际情况调整,不是线程数越多越好。例如堆内存比较小的时候需要调小阻塞队列的值,以免堆内存不够用。 \ No newline at end of file From 80137d34cbf04475de7d0dd802adfb568d7ed08c Mon Sep 17 00:00:00 2001 From: Xinze Guo <101622833+azexcy@users.noreply.github.com> Date: Thu, 28 Dec 2023 18:22:27 +0800 Subject: [PATCH 7/9] Update english doc --- .../shardingsphere-proxy/cdc/_index.cn.md | 2 - .../shardingsphere-proxy/cdc/_index.en.md | 4 +- .../shardingsphere-proxy/cdc/build.en.md | 88 ++++----- .../shardingsphere-proxy/cdc/usage.en.md | 176 +++++++++--------- 4 files changed, 138 insertions(+), 132 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md index d33beb7884df7..43f34e863629c 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md @@ -3,8 +3,6 @@ title = "CDC" weight = 9 +++ -## 简介 - CDC(Change Data Capture)增量数据捕捉。CDC 可以监控 ShardingSphere-Proxy 的存储节点中的数据变化,捕捉到数据操作事件,过滤并提取有用信息,最终将这些变化数据发送到指定的目标上. CDC可以用于数据同步,数据备份和恢复等方面。通常情况下,目前支持 openGauss、MySQL 和 PostgreSQL. diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md index f27d81dad8214..5052863886e83 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.en.md @@ -3,6 +3,6 @@ title = "CDC" weight = 9 +++ -## 简介 +CDC (Change Data Capture) captures incremental data changes. 
CDC can monitor data changes in the storage nodes of ShardingSphere-Proxy, capture data operation events, filter and extract useful information, and finally send the changed data to a specified target. -Users can synchronize data through ShardingSphere's CDC feature, which can also be used for ETL and currently supports openGauss, MySQL and PostgreSQL. \ No newline at end of file +CDC can be used for data synchronization, data backup and recovery, etc. It currently supports openGauss, MySQL, and PostgreSQL. \ No newline at end of file diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md index 03cbb6ad0e5ac..b7263cb180dc6 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md @@ -3,11 +3,6 @@ title = "Build" weight = 1 +++ -The document is in Chinese and it's about the deployment and usage of ShardingSphere CDC (Change Data Capture). Here's a rough translation: - -Title: "Deployment Operation" -Weight: 1 - ## Background Information ShardingSphere CDC is divided into two parts, one is the CDC Server, and the other is the CDC Client. The CDC Server and ShardingSphere-Proxy are currently deployed together. @@ -17,42 +12,45 @@ Users can introduce the CDC Client into their own projects to implement data con ## Constraints - Pure JAVA development, JDK recommended 1.8 or above. -- The CDC Server requires SharingSphere-Proxy to use cluster mode, currently supports ZooKeeper as the registry center. +- CDC Server requires ShardingSphere-Proxy to use cluster mode, currently supports ZooKeeper as the registry center. - CDC only synchronizes data, does not synchronize table structure, and currently does not support DDL statement synchronization. -- The CDC incremental phase will output data according to the dimension of the transaction.
If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. +- The CDC incremental stage outputs data on a per-transaction basis. If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. ## CDC Server Deployment Steps Here, the openGauss database is used as an example to introduce the deployment steps of the CDC Server. -Since the CDC Server is built into the ShardingSphere-Proxy, you need to get the ShardingSphere-Proxy. For details, please refer to the [proxy startup manual](/cn/user-manual/shardingsphere-proxy/startup/bin/). +Since the CDC Server is built into ShardingSphere-Proxy, you need to get ShardingSphere-Proxy. For details, please refer to the [proxy startup manual](/en/user-manual/shardingsphere-proxy/startup/bin/). -### Configure the GLT Module (Optional) +### Configure GLT Module (Optional) -The official binary package does not include the GLT module by default and does not guarantee the integrity of cross-database transactions. If you are using the openGauss database with the GLT function, you can additionally introduce the GLT module to ensure the integrity of cross-database transactions. +The officially released binary package does not include the GLT module by default and does not guarantee the integrity of cross-database transactions. If you are using the openGauss database with GLT functionality, you can additionally introduce the GLT module to ensure the integrity of cross-database transactions. -There are currently two ways to introduce the GLT module, and you also need to make corresponding configurations in server.yaml. +There are currently two ways to introduce the GLT module, and corresponding configurations need to be made in server.yaml. #### 1. Source code compilation and installation -1.
Prepare the code environment, download in advance or use Git clone to download the [ShardingSphere](https://github.com/apache/shardingsphere.git) source code from Github. -2. Delete the `provided` tag of the shardingsphere-global-clock-tso-provider-redis dependency in kernel/global-clock/type/tso/core/pom.xml and the `provided` tag of jedis in kernel/global-clock/type/tso/provider/redis/pom.xml -3. Compile ShardingSphere-Proxy, for specific compilation steps, please refer to the [ShardingSphere Compilation Manual](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere). +1.1 Prepare the code environment, download in advance or use Git clone to download the [ShardingSphere](https://github.com/apache/shardingsphere.git) source code from Github. + +1.2 Delete the `provided` tag of the shardingsphere-global-clock-tso-provider-redis dependency in kernel/global-clock/type/tso/core/pom.xml and the `provided` tag of jedis in kernel/global-clock/type/tso/provider/redis/pom.xml + +1.3 Compile ShardingSphere-Proxy, for specific compilation steps, please refer to the [ShardingSphere Compilation Manual](https://github.com/apache/shardingsphere/wiki#build-apache-shardingsphere). #### 2. Directly introduce GLT dependencies Can be introduced from the maven repository -1. [shardingsphere-global-clock-tso-provider-redis](https://repo1.maven.org/maven2/org/apache/shardingsphere/shardingsphere-global-clock-tso-provider-redis), download the same version as ShardingSphere-Proxy -2. [jedis-4.3.1](https://repo1.maven.org/maven2/redis/clients/jedis/4.3.1/jedis-4.3.1.jar) +2.1. [shardingsphere-global-clock-tso-provider-redis](https://repo1.maven.org/maven2/org/apache/shardingsphere/shardingsphere-global-clock-tso-provider-redis), download the same version as ShardingSphere-Proxy + +2.2. [jedis-4.3.1](https://repo1.maven.org/maven2/redis/clients/jedis/4.3.1/jedis-4.3.1.jar) ### CDC Server User Manual -1. 
Modify the configuration file `conf/server.yaml`, turn on the CDC function. Currently, `mode` must be `Cluster`, and the corresponding registry center needs to be started in advance. If the GLT provider uses Redis, Redis needs to be started in advance. +1. Modify the configuration file `conf/server.yaml` and turn on the CDC function. Currently, `mode` must be `Cluster`, and the corresponding registry center needs to be started in advance. If the GLT provider uses Redis, Redis needs to be started in advance. Configuration example: -1. Enable the CDC function in `server.yaml`. +1. Enable CDC function in `server.yaml`. ```yaml mode: @@ -97,13 +95,13 @@ props: 2. Introduce JDBC driver. -The proxy already includes the PostgreSQL JDBC driver. +Proxy already includes PostgreSQL JDBC driver. If the backend connects to the following databases, please download the corresponding JDBC driver jar package and put it in the `${shardingsphere-proxy}/ext-lib` directory. -| Database | JDBC Driver | +| Database | JDBC Driver | |-----------|---------------------------------------------------------------------------------------------------------------------------------| -| MySQL | [mysql-connector-java-8.0.31.jar](https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.31/) | +| MySQL | [mysql-connector-java-8.0.31.jar](https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.31/) | | openGauss | [opengauss-jdbc-3.1.1-og.jar](https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar) | 4. Start ShardingSphere-Proxy: @@ -112,7 +110,7 @@ If the backend connects to the following databases, please download the correspo sh bin/start.sh ``` -5. View the proxy log `logs/stdout.log`, see in the log: +5. View the proxy log `logs/stdout.log`, and see in the log: ``` [INFO ] [main] o.a.s.p.frontend.ShardingSphereProxy - ShardingSphere-Proxy Cluster mode started successfully @@ -120,7 +118,7 @@ sh bin/start.sh Confirm successful startup. -6. 
Configure migration as needed +6. Configure CDC task synchronization configuration as needed 6.1. Query configuration. @@ -140,7 +138,7 @@ The default configuration is as follows: 6.2. Modify configuration (optional). -Because the streaming rule has default values, no creation is required, only the ALTER statement is provided. +Because the streaming rule has a default value, no creation is required, only the ALTER statement is provided. Complete configuration DistSQL example: @@ -166,10 +164,10 @@ Configuration item description: ```sql ALTER STREAMING RULE ( READ( -- Data reading configuration. If not configured, some parameters will take effect by default. - WORKER_THREAD=20, -- The size of the thread pool for fetching full data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of sub-libraries - BATCH_SIZE=1000, -- The maximum number of records returned by a query operation. If not configured, the default value will be used. - SHARDING_SIZE=10000000, -- The size of the stock data partition. If not configured, the default value will be used. - RATE_LIMITER ( -- Rate limiting algorithm. If not configured, no rate limiting. + WORKER_THREAD=20, -- Affects full and incremental tasks, the size of the thread pool for fetching data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of sub-libraries + BATCH_SIZE=1000, -- Affects full and incremental tasks, the maximum number of records returned by a query operation. If the amount of data in a transaction is greater than this value, the incremental situation may exceed the set value. + SHARDING_SIZE=10000000, -- Affects full tasks, the size of stock data sharding. If not configured, the default value will be used. + RATE_LIMITER ( -- Affects full and incremental tasks, rate limiting algorithm. If not configured, no rate limiting. TYPE( -- Algorithm type. 
Optional: QPS NAME='QPS', PROPERTIES( -- Algorithm properties @@ -177,8 +175,8 @@ READ( -- Data reading configuration. If not configured, some parameters will tak ))) ), WRITE( -- Data writing configuration. If not configured, some parameters will take effect by default. - WORKER_THREAD=20, -- The size of the thread pool for writing data to the target end. If not configured, the default value will be used. - BATCH_SIZE=1000, -- The maximum number of records for a batch write operation of a stock task. If not configured, the default value will be used. + WORKER_THREAD=20, -- Affects full and incremental tasks, the size of the thread pool for writing data to the target end. If not configured, the default value will be used. + BATCH_SIZE=1000, -- Affects full and incremental tasks, the maximum number of records for a batch write operation in a stock task. If not configured, the default value will be used. If the amount of data in a transaction is greater than this value, the incremental situation may exceed the set value. RATE_LIMITER ( -- Rate limiting algorithm. If not configured, no rate limiting. TYPE( -- Algorithm type. Optional: TPS NAME='TPS', @@ -190,14 +188,14 @@ STREAM_CHANNEL ( -- Data channel, connecting producers and consumers, used for r TYPE( -- Algorithm type. Optional: MEMORY NAME='MEMORY', PROPERTIES( -- Algorithm properties -'block-queue-size'='2000' -- Property: Blocking queue size, when the heap memory is relatively small, this value needs to be reduced. +'block-queue-size'='2000' -- Property: Blocking queue size ))) ); ``` ## CDC Client Manual -The CDC Client does not need to be deployed separately, just introduce the CDC Client's dependency through maven to use it in the project. Users can interact with the server through the CDC Client. +The CDC Client does not need to be deployed separately, just need to introduce the dependency of the CDC Client through maven to use it in the project. Users can interact with the server through the CDC Client. 
If necessary, users can also implement a CDC Client themselves to consume data and ACK. @@ -211,15 +209,17 @@ If necessary, users can also implement a CDC Client themselves to consume data a ### CDC Client Introduction -`org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` is the entry class of the CDC Client, users can interact with the CDC Server through this class. The main new methods are as follows. - -| Method Name | Return Value | Description | -|-----------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | Connect to the server, when connecting, you need to specify 1. Data consumption processing logic 2. Exception handling logic during consumption 3. Server error exception handling logic | -| login(CDCLoginParameter parameter) | void | CDC login CDCLoginParameter parameter - username: username - password: password | -| startStreaming(StartStreamingParameter parameter) | java.lang.String (CDC task unique identifier, used for subsequent operations) | Start CDC subscription StartStreamingParameter parameter - database: logical library name - schemaTables: subscribed table name - full: whether to subscribe to full data | -| restartStreaming(String streamingId) | void | Restart subscription | -| stopStreaming(String streamingId) | void | Stop subscription | -| dropStreaming(String streamingId) | void | Delete subscription | -| await() | void | Block the CDC thread, waiting for the channel to close | -| close() | void | Close the channel, the process ends. 
| +`org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` is the entry class of the CDC Client. Users can interact with the CDC Server through this class. The main methods are as follows. + +| Method Name | Return Value | Description | +|-----------------------------------------------------------------------------------------------------------------------------|------------------------------------|---------------------------------------------------------------------------------------------------------| +| connect(Consumer&lt;List&lt;Record&gt;&gt; dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler) | void | Connect to the server. When connecting, you need to specify
1. Data consumption processing logic
2. Exception handling logic during consumption
3. Server error exception handling logic | +| login(CDCLoginParameter parameter) | void | CDC login, parameters
username: username
password: password | +| startStreaming(StartStreamingParameter parameter) | String (CDC task unique identifier, used for subsequent operations) | Start CDC subscription, StartStreamingParameter parameters
database: logical library name
schemaTables: subscribed table name
full: whether to subscribe to full data | +| restartStreaming(String streamingId) | void | Restart subscription | +| stopStreaming(String streamingId) | void | Stop subscription | +| dropStreaming(String streamingId) | void | Delete subscription | +| await() | void | Block the CDC thread and wait for the channel to close | +| close() | void | Close the channel, the process ends. | + +``` \ No newline at end of file diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md index e7b8cb910571e..546cf6f292fed 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md @@ -1,53 +1,53 @@ +++ -title = "Manual" +title = "User Manual" weight = 2 +++ -## CDC 功能介绍 +## Introduction to CDC Function -CDC will only synchronize data, it will not synchronize table structures, and currently does not support the synchronization of DDL statements. +CDC only synchronizes data, it does not synchronize table structures, and currently does not support the synchronization of DDL statements. -### CDC 协议介绍 +### Introduction to CDC Protocol The CDC protocol uses Protobuf, and the corresponding Protobuf types are mapped based on the types in Java. Here, taking openGauss as an example, the mapping relationship between the data types of the CDC protocol and the database types is as follows. 
-| openGauss type | Java data type | protobuf type corresponding to CDC | Remarks | -|------------------------------------------|--------------------|------------------------------------|---------------------------------------------------------------------------------------------------------------| -| tinyint,smallint,integer | Integer | int32 | | -| bigint | Long | int64 | | -| numeric | BigDecimal | string | | -| real,float4 | Float | float | | -| binary_double,double precision | Double | double | | -| boolean | Boolean | bool | | -| char,varchar,text,clob | String | string | | -| blob,bytea,raw | byte[] | bytes | | -| date,timestamp,timestamptz,smalldatetime | java.sql.Timestamp | Timestamp | The Timestamp type of protobuf only contains seconds and nanoseconds, so it has nothing to do with time zones | -| time,timetz | java.sql.Time | int64 | Represents the number of nanoseconds in the day, regardless of time zone | -| interval,reltime,abstime | String | string | | -| point,lseg,box,path,polygon,circle | String | string | | -| cidr,inet,macaddr | String | string | | -| tsvector | String | string | | -| tsquery | String | String | | -| uuid | String | string | | -| json,jsonb | String | string | | -| hll | String | string | | -| int4range,daterange,tsrange,tstzrange | String | string | | -| hash16,hash32 | String | string | | -| bit,bit varying | String | string | bit(1) returns Boolean type | +| openGauss type | Java data type | CDC corresponding protobuf type | Remarks | +|------------------------------------------|--------------------|---------------------------------|---------------------------------------------------------------------------------------------------------------| +| tinyint, smallint, integer | Integer | int32 | | +| bigint | Long | int64 | | +| numeric | BigDecimal | string | | +| real, float4 | Float | float | | +| binary_double, double precision | Double | double | | +| boolean | Boolean | bool | | +| char, varchar, text, clob | String | 
string | | +| blob, bytea, raw | byte[] | bytes | | +| date, timestamp, timestamptz, smalldatetime | java.sql.Timestamp | Timestamp | The Timestamp type of protobuf only contains seconds and nanoseconds, so it is irrelevant to the time zone | +| time, timetz | java.sql.Time | int64 | Represents the number of nanoseconds of the day, irrelevant to the time zone | +| interval, reltime, abstime | String | string | | +| point, lseg, box, path, polygon, circle | String | string | | +| cidr, inet, macaddr | String | string | | +| tsvector | String | string | | +| tsquery | String | String | | +| uuid | String | string | | +| json, jsonb | String | string | | +| hll | String | string | | +| int4range, daterange, tsrange, tstzrange | String | string | | +| hash16, hash32 | String | string | | +| bit, bit varying | String | string | Returns Boolean type when bit(1) | ## openGauss User Manual -### 环境要求 +### Environmental Requirements -支持的 openGauss 版本:2.x ~ 3.x。 +Supported openGauss versions: 2.x ~ 3.x. -### 权限要求 +### Permission Requirements -1. 调整源端 WAL 配置。 +1. Adjust the source end WAL configuration. -`postgresql.conf` 示例配置: +Example configuration for `postgresql.conf`: ``` wal_level = logical max_wal_senders = 10 @@ -56,29 +56,29 @@ wal_sender_timeout = 0 max_connections = 600 ``` -详情请参见 [Write Ahead Log](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/settings.html) 和 [Replication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/sending-server.html)。 +For details, please refer to [Write Ahead Log](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/settings.html) and [Replication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/sending-server.html). -2. 赋予源端 openGauss 账号 replication 权限。 +2. Grant replication permission to the source end openGauss account. 
-`pg_hba.conf` 示例配置: +Example configuration for `pg_hba.conf`: ``` host replication repl_acct 0.0.0.0/0 md5 -# 0.0.0.0/0 表示允许任意 IP 地址访问,可以根据实际情况调整成 CDC Server 的 IP 地址 +# 0.0.0.0/0 means allowing access from any IP address, which can be adjusted to the IP address of the CDC Server according to the actual situation ``` -详情请参见 [Configuring Client Access Authentication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/configuring-client-access-authentication.html) 和 [Example: Logic Replication Code](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/example-logic-replication-code.html)。 +For details, please refer to [Configuring Client Access Authentication](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/configuring-client-access-authentication.html) and [Example: Logic Replication Code](https://docs.opengauss.org/en/docs/2.0.1/docs/Developerguide/example-logic-replication-code.html). -3. 赋予 openGauss 账号 DDL DML 权限。 +3. Grant DDL DML permissions to the openGauss account. -如果使用非超级管理员账号,要求该账号在用到的数据库上,具备 CREATE 和 CONNECT 的权限。 +If a non-super administrator account is used, it is required that this account has CREATE and CONNECT permissions on the database used. -示例: +Example: ```sql GRANT CREATE, CONNECT ON DATABASE source_ds TO cdc_user; ``` -还需要账号对迁移的表和 schema 具备访问权限,以 test schema 下的 t_order 表为例。 +The account also needs to have access permissions to the table and schema to be migrated, taking the t_order table under the test schema as an example. ```sql \c source_ds @@ -87,21 +87,21 @@ GRANT USAGE ON SCHEMA test TO GROUP cdc_user; GRANT SELECT ON TABLE test.t_order TO cdc_user; ``` -openGauss 有 OWNER 的概念,如果是数据库,SCHEMA,表的 OWNER,则可以省略对应的授权步骤。 +openGauss has the concept of OWNER. If it is the OWNER of the database, SCHEMA, or table, the corresponding authorization steps can be omitted. -openGauss 不允许普通账户在 public schema 下操作。所以如果迁移的表在 public schema 下,需要额外授权。 +openGauss does not allow ordinary accounts to operate under the public schema. 
So if the table to be migrated is under the public schema, additional authorization is needed. ```sql GRANT ALL PRIVILEGES TO cdc_user; ``` -详情请参见 [openGauss GRANT](https://docs.opengauss.org/zh/docs/2.0.1/docs/Developerguide/GRANT.html) +For details, please refer to [openGauss GRANT](https://docs.opengauss.org/zh/docs/2.0.1/docs/Developerguide/GRANT.html) -### 完整流程示例 +### Complete Process Example -#### 前提条件 +#### Prerequisites -1. 准备好 CDC 源端的库、表、数据。 +1. Prepare the databases, tables, and data on the CDC source end. ```sql DROP DATABASE IF EXISTS ds_0; @@ -111,16 +111,16 @@ DROP DATABASE IF EXISTS ds_1; CREATE DATABASE ds_1; ``` -#### 配置 CDC Server +#### Configure CDC Server -1. 创建逻辑库。 +1. Create a logical database. ```sql CREATE DATABASE sharding_db; \c sharding_db ``` -2. 注册存储单元。 +2. Register storage units. ```sql REGISTER STORAGE UNIT ds_0 ( @@ -136,7 +136,7 @@ REGISTER STORAGE UNIT ds_0 ( ); ``` -3. 创建分片规则。 +3. Create sharding rules. ```sql CREATE SHARDING TABLE RULE t_order( @@ -147,9 +147,9 @@ KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) ); ``` -4. 创建表和初始化数据 +4. Create tables and initialize data -在 proxy 执行建表语句。 +Execute the create table statement in the proxy. ```sql CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); @@ -157,11 +157,11 @@ CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); ``` -#### 启动 CDC Client +#### Start CDC Client -目前 CDC Client 只提供了 Java API,用户需要自行实现数据的消费逻辑。 +Currently, the CDC Client only provides a Java API, and users need to implement the data consumption logic themselves. -下面是一个简单的启动 CDC Client 的示例。 +Below is a simple example of starting the CDC Client.
```java import lombok.SneakyThrows; @@ -177,59 +177,59 @@ import java.util.Collections; @Slf4j public final class Bootstrap { - + @SneakyThrows(InterruptedException.class) public static void main(final String[] args) { String address = "127.0.0.1"; - // 构造 CDCClient,传入 CDCClientConfiguration,CDCClientConfiguration 中包含了 CDC Server 的地址和端口,以及超时时间 + // Construct CDCClient, pass in CDCClientConfiguration, CDCClientConfiguration contains the address and port of the CDC Server, as well as the timeout time try (CDCClient cdcClient = new CDCClient(new CDCClientConfiguration(address, 33071, 10000))) { - // 先调用 connect 连接到 CDC Server,需要传入 1. 数据的消费处理逻辑 2. 消费时候的异常处理逻辑 3. 服务端错误的异常处理逻辑 + // First call connect to the CDC Server, you need to pass in 1. Data consumption processing logic 2. Exception handling logic during consumption 3. Server error exception handling logic cdcClient.connect(records -> log.info("records: {}", records), new RetryStreamingExceptionHandler(cdcClient, 5, 5000), (ctx, result) -> log.error("Server error: {}", result.getErrorMessage())); cdcClient.login(new CDCLoginParameter("root", "root")); - // 开始 CDC 数据同步,返回的 streamingId 是这次 CDC 任务的唯一标识,CDC Server 生成唯一标识的依据是 订阅的数据库名称 + 订阅的表 + 是否是全量同步 + // Start CDC data synchronization, the returned streamingId is the unique identifier of this CDC task, the basis for the CDC Server to generate a unique identifier is the name of the subscribed database + the subscribed table + whether it is full synchronization String streamingId = cdcClient.startStreaming(new StartStreamingParameter("sharding_db", Collections.singleton(SchemaTable.newBuilder().setTable("t_order").build()), true)); log.info("Streaming id={}", streamingId); - // 防止 main 主线程退出 + // Prevent the main thread from exiting cdcClient.await(); } } } ``` -主要有4个步骤 -1. 构造 CDCClient,传入 CDCClientConfiguration -2. 调用 CDCClient.connect,这一步是和 CDC Server 建立连接 -3. 调用 CDCClient.login,使用 server.yaml 中配置好的用户名和密码登录 -4. 
调用 CDCClient.startStreaming,开启订阅,需要保证订阅的库和表在 ShardingSphere-Proxy 存在,否则会报错。 +There are mainly 4 steps +1. Construct CDCClient, pass in CDCClientConfiguration +2. Call CDCClient.connect(), this step is to establish a connection with the CDC Server +3. Call CDCClient.login(), log in with the username and password configured in server.yaml +4. Call CDCClient.startStreaming(), start subscribing, you need to ensure that the subscribed library and table exist in ShardingSphere-Proxy, otherwise an error will be reported. -> CDCClient.await 是阻塞主线程,非必需的步骤,用其他方式也可以,只要保证 CDC 线程一直在工作就行。 +> CDCClient.await is to block the main thread, it is not a necessary step, other methods can also be used, as long as the CDC thread is always working. -如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) +If you need more complex data consumption implementation, such as writing to the database, you can refer to [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) -#### 查看 CDC 任务运行情况 +#### View the Running Status of the CDC Task -CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以通过在 proxy 中执行 DistSQL 查看 CDC 任务状态 +The start and stop of the CDC task can only be controlled by the CDC Client. You can view the status of the CDC task by executing DistSQL in the proxy -1. 查看 CDC 任务列表 +1. 
View the CDC task list SHOW STREAMING LIST; -运行结果 +Running result ``` sharding_db=> SHOW STREAMING LIST; - id | database | tables | job_item_count | active | create_time | stop_time + id | database | tables | job_item_count | active | create_time | stop_time --------------------------------------------+-------------+---------+----------------+--------+---------------------+----------- - j0302p0000702a83116fcee83f70419ca5e2993791 | sharding_db | t_order | 1 | true | 2023-10-27 22:01:27 | + j0302p0000702a83116fcee83f70419ca5e2993791 | sharding_db | t_order | 1 | true | 2023-10-27 22:01:27 | (1 row) ``` -2. 查看 CDC 任务详情 +2. View the details of the CDC task SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; -运行结果 +Running result ``` sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; @@ -240,22 +240,30 @@ sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; (2 rows) ``` -3. 删除 CDC 任务 +3. Drop CDC task DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; -只有当 CDC 任务没有订阅的时候才可以删除,此时也会删除 openGauss 物理库上的 replication slots +The CDC task can only be deleted when there are no subscriptions. At this time, the replication slots on the openGauss physical library will also be deleted. ``` sharding_db=> DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; SUCCESS ``` -# 注意事项 +# Precautions + +## Explanation of incremental data push + +1. The CDC incremental push is currently transactional, and the transactions of the physical library will not be split. Therefore, if there are data changes in multiple tables in a transaction, these data changes will be pushed together. +If you want to support XA transactions (currently only supports openGauss), both openGauss and Proxy need the GLT module. +2. The conditions for push are met when a certain amount of data is met or a certain time interval is reached (currently 300ms). 
When processing XA transactions, if the incremental events from multiple physical databases take longer than 300ms to arrive, the XA transaction may be split across separate pushes. + +## Handling of large transactions + +Currently, large transactions are fully parsed, which may cause the CDC Server process to run out of memory (OOM). In the future, forced truncation may be considered. -## 关于数据增量推送的说明 +## Recommended configuration -1. CDC 增量推送目前是按照事务维度的,物理库的事务不会被拆分,所以如果一个事务中有多个表的数据变更,那么这些数据变更会被一起推送。 -如果要支持 XA 事务(目前只支持 openGauss),则 openGauss 和 Proxy 都需要 GLT 模块。 -2. 在存在超大事务的情况下,目前的处理逻辑可能会导致 CDC 所在的进程 OOM,后续可能会强制截断。 -3. 满足推送的条件是满足了一定大小的数据量或者到了一定的时间间隔(目前是 300ms),在处理 XA 事务时,收到的多个分库增量事件超过了 300ms,可能会导致 XA 事务被拆开推送。 +1. If flow control is required, it is recommended to configure it on only one side, either read or write; the other side will be throttled automatically as a result. +2. The configuration of the CDC task needs to be adjusted according to the actual situation; more threads are not always better. For example, when the heap memory is relatively small, you need to reduce the blocking queue size to avoid running out of heap memory.
\ No newline at end of file From a0f64f9698314a222ee8a756fcdce9ba6caf7dc3 Mon Sep 17 00:00:00 2001 From: Xinze Guo <101622833+azexcy@users.noreply.github.com> Date: Fri, 29 Dec 2023 10:46:30 +0800 Subject: [PATCH 8/9] Improve CDC doc --- .../shardingsphere-proxy/cdc/_index.cn.md | 2 +- .../shardingsphere-proxy/cdc/build.cn.md | 31 ++++++------ .../shardingsphere-proxy/cdc/build.en.md | 32 ++++++------ .../shardingsphere-proxy/cdc/usage.cn.md | 40 ++++++++++----- .../shardingsphere-proxy/cdc/usage.en.md | 50 +++++++++++++------ 5 files changed, 93 insertions(+), 62 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md index 43f34e863629c..a79e6d988f2fe 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md @@ -5,4 +5,4 @@ weight = 9 CDC(Change Data Capture)增量数据捕捉。CDC 可以监控 ShardingSphere-Proxy 的存储节点中的数据变化,捕捉到数据操作事件,过滤并提取有用信息,最终将这些变化数据发送到指定的目标上. -CDC可以用于数据同步,数据备份和恢复等方面。通常情况下,目前支持 openGauss、MySQL 和 PostgreSQL. 
+CDC 可以用于数据同步,数据备份和恢复等方面。通常情况下,目前支持 openGauss、MySQL 和 PostgreSQL。 diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index 978322362338b..3385da606725f 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -24,7 +24,7 @@ ShardingSphere CDC 分为两个部分,一个是 CDC Server,另一个是 CDC ### 配置 GLT 模块(可选) -官网发布的二进制包默认不包含 GLT 模块,不保证跨库事务完整性,如果使用的是包含 GLT 功能的 openGauss 数据库,则可以额外引入 GLT 模块,保证跨库事务的完整性。 +官网发布的二进制包默认不包含 GLT 模块,如果使用的是包含 GLT 功能的 openGauss 数据库,则可以额外引入 GLT 模块,保证 XA 事务的完整性。 目前有两种方式引入 GLT 模块,并且需要在 server.yaml 中也进行相应的配置。 @@ -73,7 +73,7 @@ authority: privilege: type: ALL_PERMITTED -# 使用 GLT 的时候也需要开启分布式事务 +# 使用 GLT 的时候也需要开启分布式事务,目前 GLT 只有 openGauss 数据库支持 #transaction: # defaultType: XA # providerType: Atomikos @@ -91,19 +91,18 @@ props: check-table-metadata-enabled: false proxy-default-port: 3307 # Proxy default port. cdc-server-port: 33071 # CDC Server 端口,必须配置 - #proxy-frontend-database-protocol-type: openGauss # 和后端数据库的类型一致 + proxy-frontend-database-protocol-type: openGauss # 和后端数据库的类型一致 ``` 2. 引入 JDBC 驱动。 -proxy 已包含 PostgreSQL JDBC 驱动。 +proxy 已包含 PostgreSQL、openGauss JDBC 驱动。 如果后端连接以下数据库,请下载相应 JDBC 驱动 jar 包,并将其放入 `${shardingsphere-proxy}/ext-lib` 目录。 | 数据库 | JDBC 驱动 | |-----------|---------------------------------------------------------------------------------------------------------------------------------| | MySQL | [mysql-connector-java-8.0.31.jar](https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.31/) | -| openGauss | [opengauss-jdbc-3.1.1-og.jar](https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar) | 4. 
启动 ShardingSphere-Proxy: @@ -165,7 +164,7 @@ STREAM_CHANNEL (TYPE(NAME='MEMORY',PROPERTIES('block-queue-size'='2000'))) ```sql ALTER STREAMING RULE ( READ( -- 数据读取配置。如果不配置则部分参数默认生效。 - WORKER_THREAD=20, -- 影响全量、增量任务,从源端摄取数据的线程池大小。不配置则使用默认值。需要确保该值不低于分库的数量 + WORKER_THREAD=20, -- 影响全量、增量任务,从源端摄取数据的线程池大小。不配置则使用默认值。需要确保该值不低于物理库的数量 BATCH_SIZE=1000, -- 影响全量、增量任务,一次查询操作返回的最大记录数。如果一个事务中的数据量大于该值,增量情况下可能超过设定的值。 SHARDING_SIZE=10000000, -- 影响全量任务,存量数据分片大小。如果不配置则使用默认值。 RATE_LIMITER ( -- 影响全量、增量任务,限流算法。如果不配置则不限流。 @@ -212,13 +211,13 @@ CDC Client 不需要额外部署,只需要通过 maven 引入 CDC Client 的 `org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` 是 CDC Client 的入口类,用户可以通过该类和 CDC Server 进行交互。主要的和新方法如下。 -| 方法名 | 返回值 | 说明 | -|-----------------------------------------------------------------------------------------------------------------------------|------------------------------------|---------------------------------------------------------------------------------------------------------| -| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | 和服务端进行连接,连接的时候需要指定
1. 数据的消费处理逻辑
2. 消费时候的异常处理逻辑
3. 服务端错误的异常处理逻辑 | -| login(CDCLoginParameter parameter) | void | CDC登陆,参数
username:用户名
password:密码 | -| startStreaming(StartStreamingParameter parameter) | String (CDC 任务唯一标识,用于后续操作) | 开启 CDC 订阅, StartStreamingParameter 参数
database:逻辑库名称
schemaTables:订阅的表名
full:是否订阅全量数据 | -| restartStreaming(String streamingId) | void | 重启订阅 | -| stopStreaming(String streamingId) | void | 停止订阅 | -| dropStreaming(String streamingId) | void | 删除订阅 | -| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | -| close() | void | 关闭 channel,流程结束。 | +| 方法名 | 返回值 | 说明 | +|-----------------------------------------------------------------------------------------------------------------------------|--------------------------|--------------------------------------------------------------------------------------------------------| +| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | 和服务端进行连接,连接的时候需要指定
1. 数据的消费处理逻辑
2. 消费时候的异常处理逻辑
3. 服务端错误的异常处理逻辑 | +| login(CDCLoginParameter parameter) | void | CDC 登录,参数
username:用户名
password:密码 | +| startStreaming(StartStreamingParameter parameter) | streamingId (CDC 任务唯一标识) | 开启 CDC 订阅, StartStreamingParameter 参数
database:逻辑库名称
schemaTables:订阅的表名
full:是否订阅全量数据 | +| restartStreaming(String streamingId) | void | 重启订阅 | +| stopStreaming(String streamingId) | void | 停止订阅 | +| dropStreaming(String streamingId) | void | 删除订阅 | +| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | +| close() | void | 关闭 channel,流程结束 | diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md index b7263cb180dc6..59d427dbe8b10 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md @@ -14,7 +14,7 @@ Users can introduce the CDC Client into their own projects to implement data con - Pure JAVA development, JDK recommended 1.8 or above. - CDC Server requires SharingSphere-Proxy to use cluster mode, currently supports ZooKeeper as the registry center. - CDC only synchronizes data, does not synchronize table structure, and currently does not support DDL statement synchronization. -- CDC incremental stage will output data according to the dimension of the transaction. If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. +- CDC incremental task will not split transaction data of the physical database. If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. ## CDC Server Deployment Steps @@ -24,7 +24,7 @@ Since the CDC Server is built into ShardingSphere-Proxy, you need to get Shardin ### Configure GLT Module (Optional) -The official website's released binary package does not include the GLT module by default and does not guarantee the integrity of cross-library transactions. If you are using the openGauss database with GLT functionality, you can additionally introduce the GLT module to ensure the integrity of cross-library transactions. 
+The official website's released binary package does not include the GLT module by default, if you are using the openGauss database with GLT functionality, you can additionally introduce the GLT module to ensure the integrity of XA transactions. There are currently two ways to introduce the GLT module, and corresponding configurations need to be made in server.yaml. @@ -72,7 +72,7 @@ authority: privilege: type: ALL_PERMITTED -# When using GLT, you also need to enable distributed transactions +# When using GLT, you also need to enable distributed transactions, GLT is only supported by the openGauss database currently. #transaction: # defaultType: XA # providerType: Atomikos @@ -90,7 +90,7 @@ props: check-table-metadata-enabled: false proxy-default-port: 3307 # Proxy default port. cdc-server-port: 33071 # CDC Server port, must be configured - #proxy-frontend-database-protocol-type: openGauss # Consistent with the type of backend database + proxy-frontend-database-protocol-type: openGauss # Consistent with the type of backend database ``` 2. Introduce JDBC driver. @@ -164,7 +164,7 @@ Configuration item description: ```sql ALTER STREAMING RULE ( READ( -- Data reading configuration. If not configured, some parameters will take effect by default. - WORKER_THREAD=20, -- Affects full and incremental tasks, the size of the thread pool for fetching data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of sub-libraries + WORKER_THREAD=20, -- Affects full and incremental tasks, the size of the thread pool for fetching data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of physical database BATCH_SIZE=1000, -- Affects full and incremental tasks, the maximum number of records returned by a query operation. 
If the amount of data in a transaction is greater than this value, the incremental situation may exceed the set value. SHARDING_SIZE=10000000, -- Affects full tasks, the size of stock data sharding. If not configured, the default value will be used. RATE_LIMITER ( -- Affects full and incremental tasks, rate limiting algorithm. If not configured, no rate limiting. @@ -211,15 +211,13 @@ If necessary, users can also implement a CDC Client themselves to consume data a `org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` is the entry class of the CDC Client. Users can interact with the CDC Server through this class. The main new methods are as follows. -| Method Name | Return Value | Description | -|-----------------------------------------------------------------------------------------------------------------------------|------------------------------------|---------------------------------------------------------------------------------------------------------| -| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | Connect with the server, when connecting, you need to specify
1. Data consumption processing logic
2. Exception handling logic during consumption
3. Server error exception handling logic | -| login(CDCLoginParameter parameter) | void | CDC login, parameters
username: username
password: password | -| startStreaming(StartStreamingParameter parameter) | String (CDC task unique identifier, used for subsequent operations) | Start CDC subscription, StartStreamingParameter parameters
database: logical library name
schemaTables: subscribed table name
full: whether to subscribe to full data | -| restartStreaming(String streamingId) | void | Restart subscription | -| stopStreaming(String streamingId) | void | Stop subscription | -| dropStreaming(String streamingId) | void | Delete subscription | -| await() | void | Block the CDC thread and wait for the channel to close | -| close() | void | Close the channel, the process ends. | - -``` \ No newline at end of file +| Method Name | Return Value | Description | +|-----------------------------------------------------------------------------------------------------------------------------|---------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | Connect with the server, when connecting, you need to specify
1. Data consumption processing function
2. Exception handling logic during consumption
3. Server error exception handling function | +| login(CDCLoginParameter parameter) | void | CDC login, parameters
username: username
password: password | +| startStreaming(StartStreamingParameter parameter) | streamId (CDC task unique identifier) | Start CDC subscription, StartStreamingParameter parameters
database: logical library name
schemaTables: subscribed table name
full: whether to subscribe to full data | +| restartStreaming(String streamingId) | void | Restart subscription | +| stopStreaming(String streamingId) | void | Stop subscription | +| dropStreaming(String streamingId) | void | Delete subscription | +| await() | void | Block the CDC thread and wait for the channel to close | +| close() | void | Close the channel, the process ends | diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md index 1ed2db9aea6a7..4bddeabed698c 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -78,7 +78,7 @@ host replication repl_acct 0.0.0.0/0 md5 GRANT CREATE, CONNECT ON DATABASE source_ds TO cdc_user; ``` -还需要账号对迁移的表和 schema 具备访问权限,以 test schema 下的 t_order 表为例。 +还需要账号对订阅的表和 schema 具备访问权限,以 test schema 下的 t_order 表为例。 ```sql \c source_ds @@ -141,20 +141,18 @@ REGISTER STORAGE UNIT ds_0 ( ```sql CREATE SHARDING TABLE RULE t_order( STORAGE_UNITS(ds_0,ds_1), -SHARDING_COLUMN=order_id, +SHARDING_COLUMN=id, TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="2")), -KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) +KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) ); ``` -4. 创建表和初始化数据 +4. 
创建表 在 proxy 执行建表语句。 ```sql CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); - -INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); ``` #### 启动 CDC Client @@ -207,6 +205,25 @@ public final class Bootstrap { 如果需要更复杂数据消费的实现,例如写入到数据库,可以参考 [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) +#### 写入数据 + +通过 proxy 写入数据,此时 CDC Client 会收到数据变更的通知。 + +``` +INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +``` + +Bootstrap 会输出类似的日志 + +``` + records: [before { + name: "id" + value { + type_url: "type.googleapis.com/google.protobuf.Empty" + } + ...... +``` + #### 查看 CDC 任务运行情况 CDC 任务的启动和停止目前只能通过 CDC Client 控制,可以通过在 proxy 中执行 DistSQL 查看 CDC 任务状态 @@ -233,10 +250,10 @@ SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; ``` sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; - item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | error_message -------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------- - 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | true | 1 | 100 | 101 | - 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | true | 2 | 100 | 100 | + item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | confirmed_position | current_position | error_message +------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------------+------------------+--------------- + 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | false | 1 | 100 | 115 | 5/597E43D0 | 5/597E4810 | + 1 | 
ds_1 | EXECUTE_INCREMENTAL_TASK | false | 2 | 100 | 115 | 5/597E4450 | 5/597E4810 | (2 rows) ``` @@ -265,5 +282,4 @@ SUCCESS ## 建议的配置 -1. 如果有限流的要求,读写推荐只配置一侧,另一侧会自动受到限流的影响。 -2. CDC 的任务配置需要根据实际情况调整,不是线程数越多越好。例如堆内存比较小的时候需要调小阻塞队列的值,以免堆内存不够用。 \ No newline at end of file +CDC 的性能目前没有一个固定的值,可以关注配置中读/写的 batchSize,以及内存队列的大小,根据实际情况进行调优。 diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md index 546cf6f292fed..4bf4b1462eaab 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md @@ -78,7 +78,7 @@ Example: GRANT CREATE, CONNECT ON DATABASE source_ds TO cdc_user; ``` -The account also needs to have access permissions to the table and schema to be migrated, taking the t_order table under the test schema as an example. +The account also needs to have access permissions to the table and schema to be subscribed, taking the t_order table under the test schema as an example. ```sql \c source_ds @@ -141,15 +141,15 @@ REGISTER STORAGE UNIT ds_0 ( ```sql CREATE SHARDING TABLE RULE t_order( STORAGE_UNITS(ds_0,ds_1), -SHARDING_COLUMN=order_id, +SHARDING_COLUMN=id, TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="2")), -KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) +KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) ); ``` -4. Create tables and initialize data +4. Create tables -Execute the create table statement in the proxy. +Execute the creation table statement in the proxy. ```sql CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); @@ -159,7 +159,7 @@ INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'o #### Start CDC Client -Currently, the CDC Client only provides a Java API, and users need to implement the data consumption logic themselves. 
+Currently, the CDC Client only provides a Java API, and users need to implement the data consumption themselves. Below is a simple example of starting the CDC Client. @@ -201,12 +201,31 @@ There are mainly 4 steps 1. Construct CDCClient, pass in CDCClientConfiguration 2. Call CDCClient.connect(), this step is to establish a connection with the CDC Server 3. Call CDCClient.login(), log in with the username and password configured in server.yaml -4. Call CDCClient.startStreaming(), start subscribing, you need to ensure that the subscribed library and table exist in ShardingSphere-Proxy, otherwise an error will be reported. +4. Call CDCClient.startStreaming(), start subscribing, you need to ensure that the subscribed library and table exist in ShardingSphere-Proxy, otherwise an error will be reported > CDCClient.await is to block the main thread, it is not a necessary step, other methods can also be used, as long as the CDC thread is always working. If you need more complex data consumption implementation, such as writing to the database, you can refer to [DataSourceRecordConsumer](https://github.com/apache/shardingsphere/blob/master/test/e2e/operation/pipeline/src/test/java/org/apache/shardingsphere/test/e2e/data/pipeline/cases/cdc/DataSourceRecordConsumer.java) +#### Write Data + +When write data through a proxy, the CDC Client is notified of the data change. + +``` +INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +``` + +Bootstrap will output a similar log. + +``` + records: [before { + name: "id" + value { + type_url: "type.googleapis.com/google.protobuf.Empty" + } + ...... +``` + #### View the Running Status of the CDC Task The start and stop of the CDC task can only be controlled by the CDC Client. 
You can view the status of the CDC task by executing DistSQL in the proxy @@ -233,10 +252,10 @@ Running result ``` sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; - item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | error_message -------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------- - 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | true | 1 | 100 | 101 | - 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | true | 2 | 100 | 100 | + item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | confirmed_position | current_position | error_message +------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------------+------------------+--------------- + 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | false | 1 | 100 | 115 | 5/597E43D0 | 5/597E4810 | + 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | false | 2 | 100 | 115 | 5/597E4450 | 5/597E4810 | (2 rows) ``` @@ -244,7 +263,7 @@ sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; -The CDC task can only be deleted when there are no subscriptions. At this time, the replication slots on the openGauss physical library will also be deleted. +The CDC task can only be deleted when there are no subscriptions. At this time, the replication slots on the openGauss physical database will also be deleted. ``` sharding_db=> DROP STREAMING j0302p0000702a83116fcee83f70419ca5e2993791; @@ -255,9 +274,9 @@ SUCCESS ## Explanation of incremental data push -1. The CDC incremental push is currently transactional, and the transactions of the physical library will not be split. 
Therefore, if there are data changes in multiple tables in a transaction, these data changes will be pushed together. +1. The CDC incremental push is currently transactional, and the transactions of the physical database will not be split. Therefore, if there are data changes in multiple tables in a transaction, these data changes will be pushed together. If you want to support XA transactions (currently only supports openGauss), both openGauss and Proxy need the GLT module. -2. The conditions for push are met when a certain amount of data is met or a certain time interval is reached (currently 300ms). When processing XA transactions, if the received multiple sub-library incremental events exceed 300ms, it may cause the XA transaction to be split and pushed. +2. The conditions for push are met when a certain amount of data is met or a certain time interval is reached (currently 300ms). When processing XA transactions, if the received multiple physical database incremental events exceed 300ms, it may cause the XA transaction to be split and pushed. ## Handling of large transactions @@ -265,5 +284,4 @@ Currently, large transactions are fully parsed, which may cause the CDC Server p ## Recommended configuration -1. If there is a requirement for flow control, it is recommended to configure only one side for reading and writing, and the other side will automatically be affected by the flow control. -2. The configuration of the CDC task needs to be adjusted according to the actual situation, and it is not that the more threads, the better. For example, when the heap memory is relatively small, you need to reduce the value of the blocking queue to avoid insufficient heap memory. \ No newline at end of file +There is no fixed value for the performance of CDC, you can focus on the batchSize of read/write in the configuration, and the size of the memory queue, and tune it according to the actual situation. 
From ab5022883561ac9b864be67988b5d735fef722e7 Mon Sep 17 00:00:00 2001 From: Xinze Guo <101622833+azexcy@users.noreply.github.com> Date: Fri, 29 Dec 2023 12:18:47 +0800 Subject: [PATCH 9/9] Improve CDC doc --- .../shardingsphere-proxy/cdc/_index.cn.md | 2 +- .../shardingsphere-proxy/cdc/build.cn.md | 26 +++++++++--------- .../shardingsphere-proxy/cdc/build.en.md | 27 +++++++++---------- .../shardingsphere-proxy/cdc/usage.cn.md | 16 ++++++----- .../shardingsphere-proxy/cdc/usage.en.md | 24 ++++++++--------- 5 files changed, 48 insertions(+), 47 deletions(-) diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md index a79e6d988f2fe..d19f9e57988b8 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/_index.cn.md @@ -3,6 +3,6 @@ title = "CDC" weight = 9 +++ -CDC(Change Data Capture)增量数据捕捉。CDC 可以监控 ShardingSphere-Proxy 的存储节点中的数据变化,捕捉到数据操作事件,过滤并提取有用信息,最终将这些变化数据发送到指定的目标上. 
+CDC(Change Data Capture)增量数据捕捉。CDC 可以监控 ShardingSphere-Proxy 的存储节点中的数据变化,捕捉到数据操作事件,过滤并提取有用信息,最终将这些变化数据发送到指定的目标上。 CDC 可以用于数据同步,数据备份和恢复等方面。通常情况下,目前支持 openGauss、MySQL 和 PostgreSQL。 diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md index 3385da606725f..6ffeb3b4040c8 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.cn.md @@ -14,7 +14,7 @@ ShardingSphere CDC 分为两个部分,一个是 CDC Server,另一个是 CDC - 纯 JAVA 开发,JDK 建议 1.8 或以上版本。 - CDC Server 要求 SharingSphere-Proxy 使用集群模式,目前支持 ZooKeeper 作为注册中心。 - CDC 只同步数据,不会同步表结构,目前也不支持 DDL 的语句同步。 -- CDC 增量阶段会按照事务的维度输出数据, 如果要开启 XA 事务的兼容,则 openGauss 和 ShardingSphere-Proxy 都需要 GLT 模块 +- CDC 增量阶段会按照分库事务的维度输出数据, 如果要开启 XA 事务的兼容,则 openGauss 和 ShardingSphere-Proxy 都需要 GLT 模块 ## CDC Server 部署步骤 @@ -89,7 +89,7 @@ authority: props: system-log-level: INFO check-table-metadata-enabled: false - proxy-default-port: 3307 # Proxy default port. 
+ proxy-default-port: 3307 # Proxy default port cdc-server-port: 33071 # CDC Server 端口,必须配置 proxy-frontend-database-protocol-type: openGauss # 和后端数据库的类型一致 ``` @@ -164,7 +164,7 @@ STREAM_CHANNEL (TYPE(NAME='MEMORY',PROPERTIES('block-queue-size'='2000'))) ```sql ALTER STREAMING RULE ( READ( -- 数据读取配置。如果不配置则部分参数默认生效。 - WORKER_THREAD=20, -- 影响全量、增量任务,从源端摄取数据的线程池大小。不配置则使用默认值。需要确保该值不低于物理库的数量 + WORKER_THREAD=20, -- 影响全量、增量任务,从源端摄取数据的线程池大小。不配置则使用默认值。需要确保该值不低于分库的数量 BATCH_SIZE=1000, -- 影响全量、增量任务,一次查询操作返回的最大记录数。如果一个事务中的数据量大于该值,增量情况下可能超过设定的值。 SHARDING_SIZE=10000000, -- 影响全量任务,存量数据分片大小。如果不配置则使用默认值。 RATE_LIMITER ( -- 影响全量、增量任务,限流算法。如果不配置则不限流。 @@ -211,13 +211,13 @@ CDC Client 不需要额外部署,只需要通过 maven 引入 CDC Client 的 `org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` 是 CDC Client 的入口类,用户可以通过该类和 CDC Server 进行交互。主要的和新方法如下。 -| 方法名 | 返回值 | 说明 | -|-----------------------------------------------------------------------------------------------------------------------------|--------------------------|--------------------------------------------------------------------------------------------------------| -| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | 和服务端进行连接,连接的时候需要指定
1. 数据的消费处理逻辑
2. 消费时候的异常处理逻辑
3. 服务端错误的异常处理逻辑 | -| login(CDCLoginParameter parameter) | void | CDC登陆,参数
username:用户名
password:密码 | -| startStreaming(StartStreamingParameter parameter) | streamingId (CDC 任务唯一标识) | 开启 CDC 订阅, StartStreamingParameter 参数
database:逻辑库名称
schemaTables:订阅的表名
full:是否订阅全量数据 | -| restartStreaming(String streamingId) | void | 重启订阅 | -| stopStreaming(String streamingId) | void | 停止订阅 | -| dropStreaming(String streamingId) | void | 删除订阅 | -| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | -| close() | void | 关闭 channel,流程结束 | +| 方法名 | 返回值 | 说明 | +|-----------------------------------------------------------------------------------------------------------------------------|-------------|---------------------------------------------------------------------------------------------------------| +| connect(Consumer<List<Record>> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler) | void | 和服务端进行连接,连接的时候需要指定
1. 数据的消费处理逻辑
2. 消费时候的异常处理逻辑
3. 服务端错误的异常处理逻辑 | +| login(CDCLoginParameter parameter) | void | CDC 登录,参数
username:用户名
password:密码 | +| startStreaming(StartStreamingParameter parameter) | streamingId | 开启 CDC 订阅, StartStreamingParameter 参数
database:逻辑库名称
schemaTables:订阅的表名
full:是否订阅全量数据 | +| restartStreaming(String streamingId) | void | 重启订阅 | +| stopStreaming(String streamingId) | void | 停止订阅 | +| dropStreaming(String streamingId) | void | 删除订阅 | +| await() | void | 阻塞 CDC 线程,等待 channel 关闭 | +| close() | void | 关闭 channel,流程结束 | diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md index 59d427dbe8b10..7a6e191679196 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/build.en.md @@ -14,7 +14,7 @@ Users can introduce the CDC Client into their own projects to implement data con - Pure JAVA development, JDK recommended 1.8 or above. - CDC Server requires SharingSphere-Proxy to use cluster mode, currently supports ZooKeeper as the registry center. - CDC only synchronizes data, does not synchronize table structure, and currently does not support DDL statement synchronization. -- CDC incremental task will not split transaction data of the physical database. If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. +- CDC incremental task will not split transaction data of the database shards. If you want to enable XA transaction compatibility, both openGauss and ShardingSphere-Proxy need the GLT module. ## CDC Server Deployment Steps @@ -95,14 +95,13 @@ props: 2. Introduce JDBC driver. -Proxy already includes PostgreSQL JDBC driver. +Proxy already includes the PostgreSQL and openGauss JDBC drivers. If the backend connects to the following databases, please download the corresponding JDBC driver jar package and put it in the `${shardingsphere-proxy}/ext-lib` directory.
| Database | JDBC Driver | |-----------|---------------------------------------------------------------------------------------------------------------------------------| | MySQL | [mysql-connector-java-8.0.31.jar](https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.31/) | -| openGauss | [opengauss-jdbc-3.1.1-og.jar](https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/3.1.1-og/opengauss-jdbc-3.1.1-og.jar) | 4. Start ShardingSphere-Proxy: @@ -164,7 +163,7 @@ Configuration item description: ```sql ALTER STREAMING RULE ( READ( -- Data reading configuration. If not configured, some parameters will take effect by default. - WORKER_THREAD=20, -- Affects full and incremental tasks, the size of the thread pool for fetching data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of physical database + WORKER_THREAD=20, -- Affects full and incremental tasks, the size of the thread pool for fetching data from the source end. If not configured, the default value will be used. It needs to ensure that this value is not lower than the number of database shards BATCH_SIZE=1000, -- Affects full and incremental tasks, the maximum number of records returned by a query operation. If the amount of data in a transaction is greater than this value, the incremental situation may exceed the set value. SHARDING_SIZE=10000000, -- Affects full tasks, the size of stock data sharding. If not configured, the default value will be used. RATE_LIMITER ( -- Affects full and incremental tasks, rate limiting algorithm. If not configured, no rate limiting. @@ -211,13 +210,13 @@ If necessary, users can also implement a CDC Client themselves to consume data a `org.apache.shardingsphere.data.pipeline.cdc.client.CDCClient` is the entry class of the CDC Client. Users can interact with the CDC Server through this class. The main new methods are as follows. 
-| Method Name | Return Value | Description | -|-----------------------------------------------------------------------------------------------------------------------------|---------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| connect(Consumer> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler | void | Connect with the server, when connecting, you need to specify
1. Data consumption processing function
2. Exception handling logic during consumption
3. Server error exception handling function | -| login(CDCLoginParameter parameter) | void | CDC login, parameters
username: username
password: password | -| startStreaming(StartStreamingParameter parameter) | streamId (CDC task unique identifier) | Start CDC subscription, StartStreamingParameter parameters
database: logical library name
schemaTables: subscribed table name
full: whether to subscribe to full data | -| restartStreaming(String streamingId) | void | Restart subscription | -| stopStreaming(String streamingId) | void | Stop subscription | -| dropStreaming(String streamingId) | void | Delete subscription | -| await() | void | Block the CDC thread and wait for the channel to close | -| close() | void | Close the channel, the process ends | +| Method Name | Return Value | Description | +|-----------------------------------------------------------------------------------------------------------------------------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| connect(Consumer<List<Record>> dataConsumer, ExceptionHandler exceptionHandler, ServerErrorResultHandler errorResultHandler) | void | Connect with the server, when connecting, you need to specify
1. Data consumption processing function
2. Exception handling logic during consumption
3. Server error exception handling function | +| login(CDCLoginParameter parameter) | void | CDC login, parameters
username: username
password: password | +| startStreaming(StartStreamingParameter parameter) | streamingId | Start CDC subscription, StartStreamingParameter parameters
database: logical database name
schemaTables: subscribed table name
full: whether to subscribe to full data | +| restartStreaming(String streamingId) | void | Restart subscription | +| stopStreaming(String streamingId) | void | Stop subscription | +| dropStreaming(String streamingId) | void | Delete subscription | +| await() | void | Block the CDC thread and wait for the channel to close | +| close() | void | Close the channel, the process ends | diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md index 4bddeabed698c..fdb667869516d 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.cn.md @@ -141,9 +141,9 @@ REGISTER STORAGE UNIT ds_0 ( ```sql CREATE SHARDING TABLE RULE t_order( STORAGE_UNITS(ds_0,ds_1), -SHARDING_COLUMN=id, +SHARDING_COLUMN=order_id, TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="2")), -KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) +KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) ); ``` @@ -152,7 +152,7 @@ KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) 在 proxy 执行建表语句。 ```sql -CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); +CREATE TABLE t_order (order_id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (order_id)); ``` #### 启动 CDC Client @@ -210,14 +210,16 @@ public final class Bootstrap { 通过 proxy 写入数据,此时 CDC Client 会收到数据变更的通知。 ``` -INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +INSERT INTO t_order (order_id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +UPDATE t_order SET status='updated' WHERE order_id = 1; +DELETE FROM t_order WHERE order_id = 2; ``` Bootstrap 会输出类似的日志 ``` records: [before { - name: "id" + name: "order_id" value { type_url: "type.googleapis.com/google.protobuf.Empty" } @@ -252,8 +254,8 @@ SHOW STREAMING STATUS 
j0302p0000702a83116fcee83f70419ca5e2993791; sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | confirmed_position | current_position | error_message ------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------------+------------------+--------------- - 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | false | 1 | 100 | 115 | 5/597E43D0 | 5/597E4810 | - 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | false | 2 | 100 | 115 | 5/597E4450 | 5/597E4810 | + 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | false | 2 | 100 | 115 | 5/597E43D0 | 5/597E4810 | + 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | false | 3 | 100 | 115 | 5/597E4450 | 5/597E4810 | (2 rows) ``` diff --git a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md index 4bf4b1462eaab..1cb1bc8961866 100644 --- a/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md +++ b/docs/document/content/user-manual/shardingsphere-proxy/cdc/usage.en.md @@ -101,7 +101,7 @@ For details, please refer to [openGauss GRANT](https://docs.opengauss.org/zh/doc #### Prerequisites -1. Prepare the library, table, and data of the CDC source end. +1. Prepare the database, table, and data of the CDC source end. ```sql DROP DATABASE IF EXISTS ds_0; @@ -113,7 +113,7 @@ CREATE DATABASE ds_1; #### Configure CDC Server -1. Create a logical library. +1. Create a logical database. 
```sql CREATE DATABASE sharding_db; @@ -141,9 +141,9 @@ REGISTER STORAGE UNIT ds_0 ( ```sql CREATE SHARDING TABLE RULE t_order( STORAGE_UNITS(ds_0,ds_1), -SHARDING_COLUMN=id, +SHARDING_COLUMN=order_id, TYPE(NAME="hash_mod",PROPERTIES("sharding-count"="2")), -KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) +KEY_GENERATE_STRATEGY(COLUMN=order_id,TYPE(NAME="snowflake")) ); ``` @@ -152,9 +152,7 @@ KEY_GENERATE_STRATEGY(COLUMN=id,TYPE(NAME="snowflake")) Execute the creation table statement in the proxy. ```sql -CREATE TABLE t_order (id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (id)); - -INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +CREATE TABLE t_order (order_id INT NOT NULL, user_id INT NOT NULL, status VARCHAR(45) NULL, PRIMARY KEY (order_id)); ``` #### Start CDC Client @@ -201,7 +199,7 @@ There are mainly 4 steps 1. Construct CDCClient, pass in CDCClientConfiguration 2. Call CDCClient.connect(), this step is to establish a connection with the CDC Server 3. Call CDCClient.login(), log in with the username and password configured in server.yaml -4. Call CDCClient.startStreaming(), start subscribing, you need to ensure that the subscribed library and table exist in ShardingSphere-Proxy, otherwise an error will be reported +4. Call CDCClient.startStreaming(), start subscribing, you need to ensure that the subscribed database and table exist in ShardingSphere-Proxy, otherwise an error will be reported > CDCClient.await is to block the main thread, it is not a necessary step, other methods can also be used, as long as the CDC thread is always working. @@ -212,14 +210,16 @@ If you need more complex data consumption implementation, such as writing to the When write data through a proxy, the CDC Client is notified of the data change. 
``` -INSERT INTO t_order (id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +INSERT INTO t_order (order_id, user_id, status) VALUES (1,1,'ok1'),(2,2,'ok2'),(3,3,'ok3'); +UPDATE t_order SET status='updated' WHERE order_id = 1; +DELETE FROM t_order WHERE order_id = 2; ``` Bootstrap will output a similar log. ``` records: [before { - name: "id" + name: "order_id" value { type_url: "type.googleapis.com/google.protobuf.Empty" } @@ -254,8 +254,8 @@ Running result sharding_db=> SHOW STREAMING STATUS j0302p0000702a83116fcee83f70419ca5e2993791; item | data_source | status | active | processed_records_count | inventory_finished_percentage | incremental_idle_seconds | confirmed_position | current_position | error_message ------+-------------+--------------------------+--------+-------------------------+-------------------------------+--------------------------+--------------------+------------------+--------------- - 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | false | 1 | 100 | 115 | 5/597E43D0 | 5/597E4810 | - 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | false | 2 | 100 | 115 | 5/597E4450 | 5/597E4810 | + 0 | ds_0 | EXECUTE_INCREMENTAL_TASK | false | 2 | 100 | 115 | 5/597E43D0 | 5/597E4810 | + 1 | ds_1 | EXECUTE_INCREMENTAL_TASK | false | 3 | 100 | 115 | 5/597E4450 | 5/597E4810 | (2 rows) ```