From 342807f4061aa03a480682aee1b9133b9c5f01b1 Mon Sep 17 00:00:00 2001 From: RickyHuo Date: Wed, 21 Mar 2018 11:13:27 +0800 Subject: [PATCH] Update to 0.0.6 --- README.md | 23 +++++++++++++++--- docs/jdbc_format.md | 57 +++++++++++++++++++++++++++++++++++++++++++++ pom.xml | 2 +- 3 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 docs/jdbc_format.md diff --git a/README.md b/README.md index 7bf49e6..beff10c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ * Author: rickyHuo * Homepage: https://github.com/RickyHuo/hangout-output-clickhouse -* Version: 0.0.5 +* Version: 0.0.6 ### Description @@ -39,7 +39,9 @@ table fields, 必须和Hangout清洗后的字段保持一致 数据插入格式[Format Introduction](https://clickhouse.yandex/docs/en/formats/) -当前支持`Values`以及`JSONEachRow` +当前支持`Values`、`JSONEachRow`以及`TabSeparated` + +[Format Performance TEST](./docs/jdbc_format.md) ##### host [string] @@ -93,4 +95,19 @@ outputs: table: apm_netdiagno bulk_size: 500 ``` -> 使用`JSONEachRow`将数据写入ClickHouse,使用时务必保证清洗后的数据没有多余的字段 \ No newline at end of file +> 使用`JSONEachRow`将数据写入ClickHouse,使用时务必保证清洗后的数据没有多余的字段 + +``` +outputs: + - com.sina.bip.hangout.outputs.Clickhouse: + host: clickhouse.bip.sina.com.cn:8123 + format: TabSeparated + username: user + password: passwd + database: apm + table: apm_netdiagno + fields: ['_device_id', '_ping_small', '_domain', '_traceroute', '_ping_big', 'date', 'ts', '_snet'] + bulk_size: 500 +``` + +> 将fields中对应的字段写入ClickHouse \ No newline at end of file diff --git a/docs/jdbc_format.md b/docs/jdbc_format.md new file mode 100644 index 0000000..88ee3ba --- /dev/null +++ b/docs/jdbc_format.md @@ -0,0 +1,57 @@ +# ClickHouse Format Performance TEST of JDBC + +## 前言 + 
+[Hangout-output-Clickhouse](https://github.com/RickyHuo/hangout-output-clickhouse)目前支持3种形式的数据插入,[Values](https://clickhouse.yandex/docs/en/formats/values/)、[JSONEachRow](https://clickhouse.yandex/docs/en/formats/jsoneachrow/)以及[TabSeparated](https://clickhouse.yandex/docs/en/formats/tabseparated/)。这三种方式最终插入SQL如下 + +1. Values +``` +insert into db.table (date, datetime, domain, uri, http_code) values ('2018-03-18', '2018-03-19 10:44:57', 'sina.com.cn', '/sports', 200), ('2018-03-18', '2018-03-19 10:44:57', 'sina.com.cn', '/finance', 403) +``` + +2. JSONEachRow +``` +insert into db.table format JSONEachRow {"date":"2018-03-18", "datetime": "2018-03-19 10:44:57", "domain":"sina.com.cn", "uri": "/sports", "http_code":200}{"date":"2018-03-18", "datetime": "2018-03-19 10:44:57", "domain":"sina.com.cn", "uri": "/finance", "http_code":403} +``` + +3. TabSeparated +``` +insert into db.table (date, datetime, domain, uri, http_code) FORMAT TabSeparated +``` + +#### 测试环境准备 + +**为了比较三种插入方式的性能,模拟测试场景:** + +- ClickHouse + - 单点(Standalone) + - Intel(R) Xeon(R) CPU E5-2620 v2 @ 2.10GHz + - 12 Core + - HDD + +- 数据处理服务器 + - Intel(R) Xeon(R) CPU E5620 @ 2.40GHz + - 8 Core + +- 数据处理信息 + - 2W(Bulk Size) + - 100(Bulk Number) + - 0.85KB(Single line) + - 并发4 + - 测试原始数据条数:2W\*100*4 + + +## 测试结果 + +![](http://oupfz5jq2.bkt.clouddn.com/18-3-21/66751746.jpg) + +## 总结 + +- Values +性能中等,不需要严格把控字段类型,容易产生插入报错,不推荐使用 + +- JSONEachRow +性能较差,操作方便,不会产生插入报错 + +- TabSeparated +性能较好,不产生插入报错,但是需要在配置里严格把控各字段的数据类型,推荐使用 diff --git a/pom.xml b/pom.xml index e45d4e7..2537318 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ sina hangout-output-plugins-clickhouse - 0.0.5 + 0.0.6 jar hangout-output-plugins-clickhouse