diff --git a/.github/actions/deploy-greptimedb/action.yml b/.github/actions/deploy-greptimedb/action.yml new file mode 100644 index 000000000000..b18830ef8d7f --- /dev/null +++ b/.github/actions/deploy-greptimedb/action.yml @@ -0,0 +1,31 @@ +name: Deploy GreptimeDB cluster +description: Deploy GreptimeDB cluster on Kubernetes +inputs: + aws-ci-test-bucket: + description: 'AWS S3 bucket name for testing' + required: true + aws-region: + description: 'AWS region for testing' + required: true + data-root: + description: 'Data root for testing' + required: true + aws-access-key-id: + description: 'AWS access key id for testing' + required: true + aws-secret-access-key: + description: 'AWS secret access key for testing' + required: true +runs: + using: composite + steps: + - name: Deploy GreptimeDB by Helm + shell: bash + env: + DATA_ROOT: ${{ inputs.data-root }} + AWS_CI_TEST_BUCKET: ${{ inputs.aws-ci-test-bucket }} + AWS_REGION: ${{ inputs.aws-region }} + AWS_ACCESS_KEY_ID: ${{ inputs.aws-access-key-id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key }} + run: | + ./.github/scripts/deploy-greptimedb.sh diff --git a/.github/actions/sqlness-test/action.yml b/.github/actions/sqlness-test/action.yml new file mode 100644 index 000000000000..28d58902a6ba --- /dev/null +++ b/.github/actions/sqlness-test/action.yml @@ -0,0 +1,59 @@ +name: Run sqlness test +description: Run sqlness test on GreptimeDB + +inputs: + aws-ci-test-bucket: + description: 'AWS S3 bucket name for testing' + required: true + aws-region: + description: 'AWS region for testing' + required: true + data-root: + description: 'Data root for testing' + required: true + aws-access-key-id: + description: 'AWS access key id for testing' + required: true + aws-secret-access-key: + description: 'AWS secret access key for testing' + required: true + +runs: + using: composite + steps: + - name: Deploy GreptimeDB cluster by Helm + uses: ./.github/actions/deploy-greptimedb + with: + data-root: ${{ inputs.data-root }} + aws-ci-test-bucket: ${{ inputs.aws-ci-test-bucket }} + aws-region: ${{ inputs.aws-region }} + aws-access-key-id: ${{ inputs.aws-access-key-id }} + aws-secret-access-key: ${{ inputs.aws-secret-access-key }} + + # TODO(zyy17): The following tests will be replaced by the real sqlness test. 
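+      # For now they just create a table and run SHOW TABLES over MySQL against each deployment, using the locally forwarded ports: 14002 (cluster), 24002 (cluster on S3 storage) and 34002 (standalone).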
+ - name: Run tests on greptimedb cluster + shell: bash + run: | + mysql -h 127.0.0.1 -P 14002 -e "CREATE TABLE IF NOT EXISTS system_metrics (host VARCHAR(255), idc VARCHAR(255), cpu_util DOUBLE, memory_util DOUBLE, disk_util DOUBLE, ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(host, idc), TIME INDEX(ts));" && \ + mysql -h 127.0.0.1 -P 14002 -e "SHOW TABLES;" + + - name: Run tests on greptimedb cluster that uses S3 + shell: bash + run: | + mysql -h 127.0.0.1 -P 24002 -e "CREATE TABLE IF NOT EXISTS system_metrics (host VARCHAR(255), idc VARCHAR(255), cpu_util DOUBLE, memory_util DOUBLE, disk_util DOUBLE, ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(host, idc), TIME INDEX(ts));" && \ + mysql -h 127.0.0.1 -P 24002 -e "SHOW TABLES;" + + - name: Run tests on standalone greptimedb + shell: bash + run: | + mysql -h 127.0.0.1 -P 34002 -e "CREATE TABLE IF NOT EXISTS system_metrics (host VARCHAR(255), idc VARCHAR(255), cpu_util DOUBLE, memory_util DOUBLE, disk_util DOUBLE, ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(host, idc), TIME INDEX(ts));" && \ + mysql -h 127.0.0.1 -P 34002 -e "SHOW TABLES;" + + - name: Clean S3 data + shell: bash + env: + AWS_DEFAULT_REGION: ${{ inputs.aws-region }} + AWS_ACCESS_KEY_ID: ${{ inputs.aws-access-key-id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key }} + run: | + aws s3 rm s3://${{ inputs.aws-ci-test-bucket }}/${{ inputs.data-root }} --recursive diff --git a/.github/scripts/deploy-greptimedb.sh b/.github/scripts/deploy-greptimedb.sh new file mode 100755 index 000000000000..2d063e9e9e78 --- /dev/null +++ b/.github/scripts/deploy-greptimedb.sh @@ -0,0 +1,172 @@ +#!/usr/bin/env bash + +set -e +set -o pipefail + +KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.24.0}" +ENABLE_STANDALONE_MODE="${ENABLE_STANDALONE_MODE:-true}" +DEFAULT_INSTALL_NAMESPACE=${DEFAULT_INSTALL_NAMESPACE:-default} +GREPTIMEDB_IMAGE_TAG=${GREPTIMEDB_IMAGE_TAG:-latest} +ETCD_CHART="oci://registry-1.docker.io/bitnamicharts/etcd" +GREPTIME_CHART="https://greptimeteam.github.io/helm-charts/" + +# Ceate a cluster with 1 control-plane node and 5 workers. +function create_kind_cluster() { + cat < /tmp/connections.out & +} + +# Deploy greptimedb cluster by using S3. +# It will expose cluster service ports as '24000', '24001', '24002', '24003' to local access. +function deploy_greptimedb_cluster_with_s3_storage() { + local cluster_name=$1 + local install_namespace=$2 + + kubectl create ns "$install_namespace" + + deploy_etcd_cluster "$install_namespace" + + helm install "$cluster_name" greptime/greptimedb-cluster -n "$install_namespace" \ + --set image.tag="$GREPTIMEDB_IMAGE_TAG" \ + --set meta.etcdEndpoints="etcd.$install_namespace:2379" \ + --set storage.s3.bucket="$AWS_CI_TEST_BUCKET" \ + --set storage.s3.region="$AWS_REGION" \ + --set storage.s3.root="$DATA_ROOT" \ + --set storage.s3.secretName=s3-credentials \ + --set storage.credentials.secretName=s3-credentials \ + --set storage.credentials.secretCreation.enabled=true \ + --set storage.credentials.secretCreation.enableEncryption=false \ + --set storage.credentials.secretCreation.data.access-key-id="$AWS_ACCESS_KEY_ID" \ + --set storage.credentials.secretCreation.data.secret-access-key="$AWS_SECRET_ACCESS_KEY" + + # Wait for greptimedb cluster to be ready. 
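+  # The loop below polls the cluster resource ('gtc') until .status.clusterPhase reports 'Running'.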
+ while true; do + PHASE=$(kubectl -n "$install_namespace" get gtc "$cluster_name" -o jsonpath='{.status.clusterPhase}') + if [ "$PHASE" == "Running" ]; then + echo "Cluster is ready" + break + else + echo "Cluster is not ready yet: Current phase: $PHASE" + sleep 5 # wait for 5 seconds before check again. + fi + done + + # Expose greptimedb cluster to local access. + kubectl -n "$install_namespace" port-forward svc/"$cluster_name"-frontend \ + 24000:4000 \ + 24001:4001 \ + 24002:4002 \ + 24003:4003 > /tmp/connections.out & +} + +# Deploy standalone greptimedb. +# It will expose cluster service ports as '34000', '34001', '34002', '34003' to local access. +function deploy_standalone_greptimedb() { + helm install greptimedb-standalone greptime/greptimedb-standalone \ + --set image.tag="$GREPTIMEDB_IMAGE_TAG" \ + -n "$DEFAULT_INSTALL_NAMESPACE" + + # Wait for etcd cluster to be ready. + kubectl rollout status statefulset/greptimedb-standalone -n "$DEFAULT_INSTALL_NAMESPACE" + + # Expose greptimedb to local access. + kubectl -n "$DEFAULT_INSTALL_NAMESPACE" port-forward svc/greptimedb-standalone \ + 34000:4000 \ + 34001:4001 \ + 34002:4002 \ + 34003:4003 > /tmp/connections.out & +} + +# Entrypoint of the script. +function main() { + create_kind_cluster + add_greptime_chart + + # Deploy standalone greptimedb in the same K8s. + if [ "$ENABLE_STANDALONE_MODE" == "true" ]; then + deploy_standalone_greptimedb + fi + + deploy_greptimedb_operator + deploy_greptimedb_cluster testcluster testcluster + deploy_greptimedb_cluster_with_s3_storage testcluster-s3 testcluster-s3 +} + +# Usages: +# - Deploy greptimedb cluster: ./deploy-greptimedb.sh +main diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml index 7d2b6be636b7..2f4c587ce645 100644 --- a/.github/workflows/nightly-ci.yml +++ b/.github/workflows/nightly-ci.yml @@ -34,6 +34,14 @@ jobs: uses: Swatinem/rust-cache@v2 - name: Run sqlness run: cargo sqlness + - name: Notify slack if failed + if: failure() + uses: slackapi/slack-github-action@v1.23.0 + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }} + with: + payload: | + {"text": "Nightly CI failed for sqlness tests"} - name: Upload sqlness logs if: always() uses: actions/upload-artifact@v3 @@ -80,3 +88,11 @@ jobs: GT_S3_ACCESS_KEY: ${{ secrets.S3_ACCESS_KEY }} GT_S3_REGION: ${{ secrets.S3_REGION }} UNITTEST_LOG_DIR: "__unittest_logs" + - name: Notify slack if failed + if: failure() + uses: slackapi/slack-github-action@v1.23.0 + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }} + with: + payload: | + {"text": "Nightly CI failed for cargo test"} diff --git a/.github/workflows/nightly-funtional-tests.yml b/.github/workflows/nightly-funtional-tests.yml new file mode 100644 index 000000000000..5dbd04d30c28 --- /dev/null +++ b/.github/workflows/nightly-funtional-tests.yml @@ -0,0 +1,26 @@ +name: Nightly functional tests + +on: + schedule: + # At 00:00 on Tuesday. 
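+    # (GitHub Actions evaluates cron schedules in UTC.)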
+ - cron: '0 0 * * 2' + workflow_dispatch: + +jobs: + sqlness-test: + name: Run sqlness test + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Run sqlness test + uses: ./.github/actions/sqlness-test + with: + data-root: sqlness-test + aws-ci-test-bucket: ${{ vars.AWS_CI_TEST_BUCKET }} + aws-region: ${{ vars.AWS_CI_TEST_BUCKET_REGION }} + aws-access-key-id: ${{ secrets.AWS_CI_TEST_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_CI_TEST_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7a4a4bb9651b..9c3f1be562d5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -302,8 +302,12 @@ jobs: release-cn-artifacts: name: Release artifacts to CN region if: ${{ inputs.release_images || github.event_name == 'push' || github.event_name == 'schedule' }} - needs: [ + needs: [ # The job have to wait for all the artifacts are built. allocate-runners, + build-linux-amd64-artifacts, + build-linux-arm64-artifacts, + build-macos-artifacts, + build-windows-artifacts, release-images-to-dockerhub, ] runs-on: ubuntu-20.04 @@ -338,11 +342,12 @@ jobs: publish-github-release: name: Create GitHub release and upload artifacts if: ${{ inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule' }} - needs: [ + needs: [ # The job have to wait for all the artifacts are built. allocate-runners, build-linux-amd64-artifacts, build-linux-arm64-artifacts, build-macos-artifacts, + build-windows-artifacts, release-images-to-dockerhub, ] runs-on: ubuntu-20.04 diff --git a/.github/workflows/size-label.yml b/.github/workflows/size-label.yml new file mode 100644 index 000000000000..2b504f32f6a2 --- /dev/null +++ b/.github/workflows/size-label.yml @@ -0,0 +1,26 @@ +name: size-labeler + +on: [pull_request] + +jobs: + labeler: + runs-on: ubuntu-latest + name: Label the PR size + steps: + - uses: codelytv/pr-size-labeler@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + s_label: 'Size: S' + s_max_size: '100' + m_label: 'Size: M' + m_max_size: '500' + l_label: 'Size: L' + l_max_size: '1000' + xl_label: 'Size: XL' + fail_if_xl: 'false' + message_if_xl: > + This PR exceeds the recommended size of 1000 lines. + Please make sure you are NOT addressing multiple issues with one PR. + Note this PR might be rejected due to its size. + github_api_url: 'api.github.com' + files_to_ignore: 'Cargo.lock' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2b2f1685bc1f..48e632f2c2b1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Thanks a lot for considering contributing to GreptimeDB. We believe people like you would make GreptimeDB a great product. We intend to build a community where individuals can have open talks, show respect for one another, and speak with true ❤️. Meanwhile, we are to keep transparency and make your effort count here. -Read the guidelines, and they can help you get started. Communicate with respect to developers maintaining and developing the project. In return, they should reciprocate that respect by addressing your issue, reviewing changes, as well as helping finalize and merge your pull requests. +Please read the guidelines, and they can help you get started. Communicate with respect to developers maintaining and developing the project. In return, they should reciprocate that respect by addressing your issue, reviewing changes, as well as helping finalize and merge your pull requests. 
Follow our [README](https://github.com/GreptimeTeam/greptimedb#readme) to get the whole picture of the project. To learn about the design of GreptimeDB, please refer to the [design docs](https://github.com/GrepTimeTeam/docs). @@ -21,7 +21,7 @@ Pull requests are great, but we accept all kinds of other help if you like. Such - Write tutorials or blog posts. Blog, speak about, or create tutorials about one of GreptimeDB's many features. Mention [@greptime](https://twitter.com/greptime) on Twitter and email info@greptime.com so we can give pointers and tips and help you spread the word by promoting your content on Greptime communication channels. - Improve the documentation. [Submit documentation](http://github.com/greptimeTeam/docs/) updates, enhancements, designs, or bug fixes, and fixing any spelling or grammar errors will be very much appreciated. - Present at meetups and conferences about your GreptimeDB projects. Your unique challenges and successes in building things with GreptimeDB can provide great speaking material. We'd love to review your talk abstract, so get in touch with us if you'd like some help! -- Submit bug reports. To report a bug or a security issue, you can [open a new GitHub issue](https://github.com/GrepTimeTeam/greptimedb/issues/new). +- Submitting bug reports. To report a bug or a security issue, you can [open a new GitHub issue](https://github.com/GrepTimeTeam/greptimedb/issues/new). - Speak up feature requests. Send feedback is a great way for us to understand your different use cases of GreptimeDB better. If you want to share your experience with GreptimeDB, or if you want to discuss any ideas, you can start a discussion on [GitHub discussions](https://github.com/GreptimeTeam/greptimedb/discussions), chat with the Greptime team on [Slack](https://greptime.com/slack), or you can tweet [@greptime](https://twitter.com/greptime) on Twitter. ## Code of Conduct @@ -81,7 +81,7 @@ Now, `pre-commit` will run automatically on `git commit`. ### Title The titles of pull requests should be prefixed with category names listed in [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0) -like `feat`/`fix`/`docs`, with a concise summary of code change following. DO NOT use last commit message as pull request title. +like `feat`/`fix`/`docs`, with a concise summary of code change following. AVOID using the last commit message as pull request title. ### Description @@ -100,7 +100,7 @@ of what you were trying to do and what went wrong. You can also reach for help i ## Community -The core team will be thrilled if you participate in any way you like. When you are stuck, try ask for help by filing an issue, with a detailed description of what you were trying to do and what went wrong. If you have any questions or if you would like to get involved in our community, please check out: +The core team will be thrilled if you would like to participate in any way you like. When you are stuck, try to ask for help by filing an issue, with a detailed description of what you were trying to do and what went wrong. 
If you have any questions or if you would like to get involved in our community, please check out: - [GreptimeDB Community Slack](https://greptime.com/slack) - [GreptimeDB Github Discussions](https://github.com/GreptimeTeam/greptimedb/discussions) diff --git a/Cargo.lock b/Cargo.lock index 7a62a79a9467..8e1ad37b6974 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -205,7 +205,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72" [[package]] name = "api" -version = "0.4.1" +version = "0.4.2" dependencies = [ "common-base", "common-error", @@ -669,7 +669,7 @@ dependencies = [ [[package]] name = "auth" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -842,7 +842,7 @@ dependencies = [ [[package]] name = "benchmarks" -version = "0.4.1" +version = "0.4.2" dependencies = [ "arrow", "chrono", @@ -1204,7 +1204,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arc-swap", @@ -1486,7 +1486,7 @@ checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" [[package]] name = "client" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arrow-flight", @@ -1517,7 +1517,7 @@ dependencies = [ "session", "snafu", "substrait 0.17.1", - "substrait 0.4.1", + "substrait 0.4.2", "tokio", "tokio-stream", "tonic 0.10.2", @@ -1547,7 +1547,7 @@ dependencies = [ [[package]] name = "cmd" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anymap", "async-trait", @@ -1595,7 +1595,7 @@ dependencies = [ "servers", "session", "snafu", - "substrait 0.4.1", + "substrait 0.4.2", "table", "temp-env", "tikv-jemallocator", @@ -1628,7 +1628,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anymap", "bitvec", @@ -1643,7 +1643,7 @@ dependencies = [ [[package]] name = "common-catalog" -version = "0.4.1" +version = "0.4.2" dependencies = [ "chrono", "common-error", @@ -1656,7 +1656,7 @@ dependencies = [ [[package]] name = "common-config" -version = "0.4.1" +version = "0.4.2" dependencies = [ "common-base", "humantime-serde", @@ -1665,7 +1665,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "0.4.1" +version = "0.4.2" dependencies = [ "arrow", "arrow-schema", @@ -1694,7 +1694,7 @@ dependencies = [ [[package]] name = "common-error" -version = "0.4.1" +version = "0.4.2" dependencies = [ "snafu", "strum 0.25.0", @@ -1702,7 +1702,7 @@ dependencies = [ [[package]] name = "common-function" -version = "0.4.1" +version = "0.4.2" dependencies = [ "arc-swap", "chrono-tz 0.6.3", @@ -1725,7 +1725,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-trait", "common-error", @@ -1744,7 +1744,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arrow-flight", @@ -1774,7 +1774,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -1793,7 +1793,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "0.4.1" +version = "0.4.2" dependencies = [ "arc-swap", "backtrace", @@ -1810,7 +1810,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "0.4.1" +version = "0.4.2" dependencies = [ "common-error", "common-macro", @@ -1823,7 +1823,7 @@ dependencies = [ 
[[package]] name = "common-meta" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arrow-flight", @@ -1861,7 +1861,7 @@ dependencies = [ [[package]] name = "common-procedure" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-stream", "async-trait", @@ -1885,7 +1885,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-trait", "common-procedure", @@ -1893,7 +1893,7 @@ dependencies = [ [[package]] name = "common-query" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -1916,7 +1916,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "0.4.1" +version = "0.4.2" dependencies = [ "common-error", "common-macro", @@ -1933,7 +1933,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-trait", "common-error", @@ -1950,7 +1950,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "0.4.1" +version = "0.4.2" dependencies = [ "backtrace", "common-error", @@ -1977,7 +1977,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "0.4.1" +version = "0.4.2" dependencies = [ "once_cell", "rand", @@ -1986,7 +1986,7 @@ dependencies = [ [[package]] name = "common-time" -version = "0.4.1" +version = "0.4.2" dependencies = [ "arrow", "chrono", @@ -2001,7 +2001,7 @@ dependencies = [ [[package]] name = "common-version" -version = "0.4.1" +version = "0.4.2" dependencies = [ "build-data", ] @@ -2660,7 +2660,7 @@ dependencies = [ [[package]] name = "datanode" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arrow-flight", @@ -2719,7 +2719,7 @@ dependencies = [ "sql", "storage", "store-api", - "substrait 0.4.1", + "substrait 0.4.2", "table", "tokio", "tokio-stream", @@ -2733,7 +2733,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "0.4.1" +version = "0.4.2" dependencies = [ "arrow", "arrow-array", @@ -3190,7 +3190,7 @@ dependencies = [ [[package]] name = "file-engine" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -3306,7 +3306,7 @@ dependencies = [ [[package]] name = "frontend" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arc-swap", @@ -3370,7 +3370,7 @@ dependencies = [ "storage", "store-api", "strfmt", - "substrait 0.4.1", + "substrait 0.4.2", "table", "tokio", "toml 0.7.8", @@ -4440,7 +4440,7 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "log-store" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-stream", "async-trait", @@ -4711,7 +4711,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -4741,7 +4741,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anymap", "api", @@ -4933,7 +4933,7 @@ dependencies = [ [[package]] name = "mito2" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anymap", "api", @@ -5386,11 +5386,13 @@ dependencies = [ [[package]] name = "object-store" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anyhow", "async-trait", "bytes", + "common-error", + "common-macro", "common-runtime", "common-telemetry", "common-test-util", @@ -5399,6 +5401,7 @@ dependencies = [ "metrics", "moka", "opendal", + "snafu", "tokio", "uuid", ] @@ -5581,6 +5584,7 @@ version = "0.20.0" source = 
"git+https://github.com/waynexia/opentelemetry-rust.git?rev=33841b38dda79b15f2024952be5f32533325ca02#33841b38dda79b15f2024952be5f32533325ca02" dependencies = [ "async-trait", + "crossbeam-channel", "futures-channel", "futures-executor", "futures-util", @@ -5588,12 +5592,14 @@ dependencies = [ "once_cell", "opentelemetry 0.21.0", "ordered-float 4.1.1", + "percent-encoding", + "rand", "thiserror", ] [[package]] name = "operator" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-compat", @@ -5638,7 +5644,7 @@ dependencies = [ "sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=602d7878c9949e48512251c7f18695a50936e51c)", "storage", "store-api", - "substrait 0.4.1", + "substrait 0.4.2", "table", "tokio", "tonic 0.10.2", @@ -5866,7 +5872,7 @@ dependencies = [ [[package]] name = "partition" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -6193,7 +6199,7 @@ dependencies = [ [[package]] name = "plugins" -version = "0.4.1" +version = "0.4.2" dependencies = [ "auth", "common-base", @@ -6437,7 +6443,7 @@ dependencies = [ [[package]] name = "promql" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-recursion", "async-trait", @@ -6446,6 +6452,7 @@ dependencies = [ "common-catalog", "common-error", "common-macro", + "common-recordbatch", "common-telemetry", "datafusion", "datatypes", @@ -6753,7 +6760,7 @@ dependencies = [ [[package]] name = "query" -version = "0.4.1" +version = "0.4.2" dependencies = [ "ahash 0.8.6", "api", @@ -6810,7 +6817,7 @@ dependencies = [ "stats-cli", "store-api", "streaming-stats", - "substrait 0.4.1", + "substrait 0.4.2", "table", "tokio", "tokio-stream", @@ -8038,7 +8045,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "script" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arc-swap", @@ -8307,7 +8314,7 @@ dependencies = [ [[package]] name = "servers" -version = "0.4.1" +version = "0.4.2" dependencies = [ "aide", "api", @@ -8401,7 +8408,7 @@ dependencies = [ [[package]] name = "session" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arc-swap", @@ -8669,7 +8676,7 @@ dependencies = [ [[package]] name = "sql" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "common-base", @@ -8720,7 +8727,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-trait", "clap 4.4.7", @@ -8926,7 +8933,7 @@ dependencies = [ [[package]] name = "storage" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "arc-swap", @@ -8980,7 +8987,7 @@ dependencies = [ [[package]] name = "store-api" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "aquamarine", @@ -9119,7 +9126,7 @@ dependencies = [ [[package]] name = "substrait" -version = "0.4.1" +version = "0.4.2" dependencies = [ "async-recursion", "async-trait", @@ -9255,7 +9262,7 @@ dependencies = [ [[package]] name = "table" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anymap", "async-trait", @@ -9361,7 +9368,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "0.4.1" +version = "0.4.2" dependencies = [ "api", "async-trait", @@ -9414,7 +9421,7 @@ dependencies = [ "sql", "sqlx", "store-api", - "substrait 0.4.1", + "substrait 0.4.2", "table", "tempfile", "tokio", diff --git a/Cargo.toml b/Cargo.toml index b221213fc441..2dcc975d6f31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,7 +55,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.4.1" +version = 
"0.4.2" edition = "2021" license = "Apache-2.0" @@ -87,10 +87,10 @@ meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev = metrics = "0.20" moka = "0.12" once_cell = "1.18" -# opentelemetry-proto = { version = "0.2", features = ["gen-tonic", "metrics"] } opentelemetry-proto = { git = "https://github.com/waynexia/opentelemetry-rust.git", rev = "33841b38dda79b15f2024952be5f32533325ca02", features = [ "gen-tonic", "metrics", + "trace", ] } parquet = "47.0" paste = "1.0" diff --git a/README.md b/README.md index eb42d7d1ce55..94a45dad3b36 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,6 @@ Please refer to [contribution guidelines](CONTRIBUTING.md) for more information. ## Acknowledgement - GreptimeDB uses [Apache Arrow](https://arrow.apache.org/) as the memory model and [Apache Parquet](https://parquet.apache.org/) as the persistent file format. - GreptimeDB's query engine is powered by [Apache Arrow DataFusion](https://github.com/apache/arrow-datafusion). -- [OpenDAL](https://github.com/datafuselabs/opendal) from [Datafuse Labs](https://github.com/datafuselabs) gives GreptimeDB a very general and elegant data access abstraction layer. -- GreptimeDB’s meta service is based on [etcd](https://etcd.io/). +- [Apache OpenDAL (incubating)](https://opendal.apache.org) gives GreptimeDB a very general and elegant data access abstraction layer. +- GreptimeDB's meta service is based on [etcd](https://etcd.io/). - GreptimeDB uses [RustPython](https://github.com/RustPython/RustPython) for experimental embedded python scripting. diff --git a/config/datanode.example.toml b/config/datanode.example.toml index d4058f6e3f83..8376b76f6888 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -101,6 +101,10 @@ auto_flush_interval = "1h" global_write_buffer_size = "1GB" # Global write buffer size threshold to reject write requests (default 2G). global_write_buffer_reject_size = "2GB" +# Cache size for SST metadata (default 128MB). Setting it to 0 to disable the cache. +sst_meta_cache_size = "128MB" +# Cache size for vectors and arrow arrays (default 512MB). Setting it to 0 to disable the cache. +vector_cache_size = "512MB" # Log options # [logging] diff --git a/docker/dev-builder/ubuntu/Dockerfile b/docker/dev-builder/ubuntu/Dockerfile index b117d6f0f409..34855b5476fe 100644 --- a/docker/dev-builder/ubuntu/Dockerfile +++ b/docker/dev-builder/ubuntu/Dockerfile @@ -19,8 +19,13 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ build-essential \ pkg-config \ python3.10 \ - python3.10-dev \ - python3-pip + python3.10-dev + +# Remove Python 3.8 and install pip. 
+RUN apt-get -y purge python3.8 && \ + apt-get -y autoremove && \ + ln -s /usr/bin/python3.10 /usr/bin/python3 && \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 RUN git config --global --add safe.directory /greptimedb diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index f744c03ea589..f665be62a4ee 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -187,8 +187,8 @@ impl StartCommand { Ok(Options::Frontend(Box::new(opts))) } - async fn build(self, mut opts: FrontendOptions) -> Result { - let plugins = plugins::setup_frontend_plugins(&mut opts) + async fn build(self, opts: FrontendOptions) -> Result { + let plugins = plugins::setup_frontend_plugins(&opts) .await .context(StartFrontendSnafu)?; @@ -303,7 +303,7 @@ mod tests { #[tokio::test] async fn test_try_from_start_command_to_anymap() { - let mut fe_opts = FrontendOptions { + let fe_opts = FrontendOptions { http: HttpOptions { disable_dashboard: false, ..Default::default() @@ -312,7 +312,7 @@ mod tests { ..Default::default() }; - let plugins = plugins::setup_frontend_plugins(&mut fe_opts).await.unwrap(); + let plugins = plugins::setup_frontend_plugins(&fe_opts).await.unwrap(); let provider = plugins.get::().unwrap(); let result = provider diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 6b313ffa6ec1..8bc18d4bcae3 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -316,8 +316,8 @@ impl StartCommand { #[allow(unused_variables)] #[allow(clippy::diverging_sub_expression)] async fn build(self, opts: MixOptions) -> Result { - let mut fe_opts = opts.frontend; - let fe_plugins = plugins::setup_frontend_plugins(&mut fe_opts) + let fe_opts = opts.frontend; + let fe_plugins = plugins::setup_frontend_plugins(&fe_opts) .await .context(StartFrontendSnafu)?; @@ -421,12 +421,12 @@ mod tests { #[tokio::test] async fn test_try_from_start_command_to_anymap() { - let mut fe_opts = FrontendOptions { + let fe_opts = FrontendOptions { user_provider: Some("static_user_provider:cmd:test=test".to_string()), ..Default::default() }; - let plugins = plugins::setup_frontend_plugins(&mut fe_opts).await.unwrap(); + let plugins = plugins::setup_frontend_plugins(&fe_opts).await.unwrap(); let provider = plugins.get::().unwrap(); let result = provider diff --git a/src/common/error/src/ext.rs b/src/common/error/src/ext.rs index fd6d04b6778b..690ea23dc3e2 100644 --- a/src/common/error/src/ext.rs +++ b/src/common/error/src/ext.rs @@ -39,17 +39,25 @@ pub trait ErrorExt: StackError { where Self: Sized, { - let error = self.last(); - if let Some(external_error) = error.source() { - let external_root = external_error.sources().last().unwrap(); - - if error.to_string().is_empty() { - format!("{external_root}") - } else { - format!("{error}: {external_root}") + match self.status_code() { + StatusCode::Unknown | StatusCode::Internal => { + // masks internal error from end user + format!("Internal error: {}", self.status_code() as u32) + } + _ => { + let error = self.last(); + if let Some(external_error) = error.source() { + let external_root = external_error.sources().last().unwrap(); + + if error.to_string().is_empty() { + format!("{external_root}") + } else { + format!("{error}: {external_root}") + } + } else { + format!("{error}") + } } - } else { - format!("{error}") } } } diff --git a/src/common/greptimedb-telemetry/src/lib.rs b/src/common/greptimedb-telemetry/src/lib.rs index 37b86c642544..a0b735e55a02 100644 --- a/src/common/greptimedb-telemetry/src/lib.rs +++ 
b/src/common/greptimedb-telemetry/src/lib.rs @@ -57,7 +57,10 @@ impl GreptimeDBTelemetryTask { task_fn: BoxedTaskFunction, should_report: Arc, ) -> Self { - GreptimeDBTelemetryTask::Enable((RepeatedTask::new(interval, task_fn), should_report)) + GreptimeDBTelemetryTask::Enable(( + RepeatedTask::new(interval, task_fn).with_initial_delay(Some(Duration::ZERO)), + should_report, + )) } pub fn disable() -> Self { @@ -207,6 +210,7 @@ pub struct GreptimeDBTelemetry { working_home: Option, telemetry_url: &'static str, should_report: Arc, + report_times: usize, } #[async_trait::async_trait] @@ -239,6 +243,7 @@ impl GreptimeDBTelemetry { client: client.ok(), telemetry_url: TELEMETRY_URL, should_report, + report_times: 0, } } @@ -256,8 +261,11 @@ impl GreptimeDBTelemetry { }; if let Some(client) = self.client.as_ref() { - info!("reporting greptimedb version: {:?}", data); + if self.report_times == 0 { + info!("reporting greptimedb version: {:?}", data); + } let result = client.post(self.telemetry_url).json(&data).send().await; + self.report_times += 1; debug!("report version result: {:?}", result); result.ok() } else { diff --git a/src/common/meta/src/kv_backend.rs b/src/common/meta/src/kv_backend.rs index c0459f68cef4..d924b5b7f757 100644 --- a/src/common/meta/src/kv_backend.rs +++ b/src/common/meta/src/kv_backend.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod memory; +pub mod test; pub mod txn; use std::any::Any; diff --git a/src/common/meta/src/kv_backend/memory.rs b/src/common/meta/src/kv_backend/memory.rs index 347c84a5876e..dd434fe85017 100644 --- a/src/common/meta/src/kv_backend/memory.rs +++ b/src/common/meta/src/kv_backend/memory.rs @@ -17,7 +17,6 @@ use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::fmt::{Display, Formatter}; use std::marker::PhantomData; -use std::ops::Range; use std::sync::RwLock; use async_trait::async_trait; @@ -85,21 +84,25 @@ impl KvBackend for MemoryKvBackend { } async fn range(&self, req: RangeRequest) -> Result { + let range = req.range(); let RangeRequest { - key, - range_end, - limit, - keys_only, + limit, keys_only, .. } = req; let kvs = self.kvs.read().unwrap(); + let values = kvs.range(range); - let iter: Box, &Vec)>> = if range_end.is_empty() { - Box::new(kvs.get_key_value(&key).into_iter()) - } else { - Box::new(kvs.range(key..range_end)) - }; - let mut kvs = iter + let mut more = false; + let mut iter: i64 = 0; + + let kvs = values + .take_while(|_| { + let take = limit == 0 || iter != limit; + iter += 1; + more = limit > 0 && iter > limit; + + take + }) .map(|(k, v)| { let key = k.clone(); let value = if keys_only { vec![] } else { v.clone() }; @@ -107,13 +110,6 @@ impl KvBackend for MemoryKvBackend { }) .collect::>(); - let more = if limit > 0 && kvs.len() > limit as usize { - kvs.truncate(limit as usize); - true - } else { - false - }; - Ok(RangeResponse { kvs, more }) } @@ -215,36 +211,32 @@ impl KvBackend for MemoryKvBackend { &self, req: DeleteRangeRequest, ) -> Result { - let DeleteRangeRequest { - key, - range_end, - prev_kv, - } = req; + let range = req.range(); + let DeleteRangeRequest { prev_kv, .. 
} = req; let mut kvs = self.kvs.write().unwrap(); - let prev_kvs = if range_end.is_empty() { - kvs.remove(&key) - .into_iter() - .map(|value| KeyValue { - key: key.clone(), - value, - }) - .collect::>() + let keys = kvs + .range(range) + .map(|(key, _)| key.clone()) + .collect::>(); + + let mut prev_kvs = if prev_kv { + Vec::with_capacity(keys.len()) } else { - let range = Range { - start: key, - end: range_end, - }; - kvs.extract_if(|key, _| range.contains(key)) - .map(Into::into) - .collect::>() + vec![] }; + let deleted = keys.len() as i64; - Ok(DeleteRangeResponse { - deleted: prev_kvs.len() as i64, - prev_kvs: if prev_kv { prev_kvs } else { vec![] }, - }) + for key in keys { + if let Some(value) = kvs.remove(&key) { + if prev_kv { + prev_kvs.push((key.clone(), value).into()) + } + } + } + + Ok(DeleteRangeResponse { deleted, prev_kvs }) } async fn batch_delete( @@ -358,254 +350,63 @@ impl TxnService for MemoryKvBackend { #[cfg(test)] mod tests { - use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; use super::*; use crate::error::Error; + use crate::kv_backend::test::{ + prepare_kv, test_kv_batch_delete, test_kv_batch_get, test_kv_compare_and_put, + test_kv_delete_range, test_kv_put, test_kv_range, test_kv_range_2, + }; use crate::kv_backend::KvBackend; - use crate::rpc::store::{BatchGetRequest, BatchPutRequest}; - use crate::rpc::KeyValue; - use crate::util; async fn mock_mem_store_with_data() -> MemoryKvBackend { let kv_store = MemoryKvBackend::::new(); - let kvs = mock_kvs(); - - assert!(kv_store - .batch_put(BatchPutRequest { - kvs, - ..Default::default() - }) - .await - .is_ok()); - - assert!(kv_store - .put(PutRequest { - key: b"key11".to_vec(), - value: b"val11".to_vec(), - ..Default::default() - }) - .await - .is_ok()); + prepare_kv(&kv_store).await; kv_store } - fn mock_kvs() -> Vec { - vec![ - KeyValue { - key: b"key1".to_vec(), - value: b"val1".to_vec(), - }, - KeyValue { - key: b"key2".to_vec(), - value: b"val2".to_vec(), - }, - KeyValue { - key: b"key3".to_vec(), - value: b"val3".to_vec(), - }, - ] - } - #[tokio::test] async fn test_put() { let kv_store = mock_mem_store_with_data().await; - let resp = kv_store - .put(PutRequest { - key: b"key11".to_vec(), - value: b"val12".to_vec(), - prev_kv: false, - }) - .await - .unwrap(); - assert!(resp.prev_kv.is_none()); - - let resp = kv_store - .put(PutRequest { - key: b"key11".to_vec(), - value: b"val13".to_vec(), - prev_kv: true, - }) - .await - .unwrap(); - let prev_kv = resp.prev_kv.unwrap(); - assert_eq!(b"key11", prev_kv.key()); - assert_eq!(b"val12", prev_kv.value()); + test_kv_put(kv_store).await; } #[tokio::test] async fn test_range() { let kv_store = mock_mem_store_with_data().await; - let key = b"key1".to_vec(); - let range_end = util::get_prefix_end_key(b"key1"); + test_kv_range(kv_store).await; + } - let resp = kv_store - .range(RangeRequest { - key: key.clone(), - range_end: range_end.clone(), - limit: 0, - keys_only: false, - }) - .await - .unwrap(); - - assert_eq!(2, resp.kvs.len()); - assert_eq!(b"key1", resp.kvs[0].key()); - assert_eq!(b"val1", resp.kvs[0].value()); - assert_eq!(b"key11", resp.kvs[1].key()); - assert_eq!(b"val11", resp.kvs[1].value()); - - let resp = kv_store - .range(RangeRequest { - key: key.clone(), - range_end: range_end.clone(), - limit: 0, - keys_only: true, - }) - .await - .unwrap(); - - assert_eq!(2, resp.kvs.len()); - assert_eq!(b"key1", resp.kvs[0].key()); - assert_eq!(b"", resp.kvs[0].value()); - assert_eq!(b"key11", resp.kvs[1].key()); - assert_eq!(b"", 
resp.kvs[1].value()); - - let resp = kv_store - .range(RangeRequest { - key: key.clone(), - limit: 0, - keys_only: false, - ..Default::default() - }) - .await - .unwrap(); - - assert_eq!(1, resp.kvs.len()); - assert_eq!(b"key1", resp.kvs[0].key()); - assert_eq!(b"val1", resp.kvs[0].value()); - - let resp = kv_store - .range(RangeRequest { - key, - range_end, - limit: 1, - keys_only: false, - }) - .await - .unwrap(); + #[tokio::test] + async fn test_range_2() { + let kv = MemoryKvBackend::::new(); - assert_eq!(1, resp.kvs.len()); - assert_eq!(b"key1", resp.kvs[0].key()); - assert_eq!(b"val1", resp.kvs[0].value()); + test_kv_range_2(kv).await; } #[tokio::test] async fn test_batch_get() { let kv_store = mock_mem_store_with_data().await; - let keys = vec![]; - let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap(); - - assert!(resp.kvs.is_empty()); - - let keys = vec![b"key10".to_vec()]; - let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap(); - - assert!(resp.kvs.is_empty()); - - let keys = vec![b"key1".to_vec(), b"key3".to_vec(), b"key4".to_vec()]; - let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap(); - - assert_eq!(2, resp.kvs.len()); - assert_eq!(b"key1", resp.kvs[0].key()); - assert_eq!(b"val1", resp.kvs[0].value()); - assert_eq!(b"key3", resp.kvs[1].key()); - assert_eq!(b"val3", resp.kvs[1].value()); + test_kv_batch_get(kv_store).await; } #[tokio::test(flavor = "multi_thread")] async fn test_compare_and_put() { let kv_store = Arc::new(MemoryKvBackend::::new()); - let success = Arc::new(AtomicU8::new(0)); - - let mut joins = vec![]; - for _ in 0..20 { - let kv_store_clone = kv_store.clone(); - let success_clone = success.clone(); - let join = tokio::spawn(async move { - let req = CompareAndPutRequest { - key: b"key".to_vec(), - expect: vec![], - value: b"val_new".to_vec(), - }; - let resp = kv_store_clone.compare_and_put(req).await.unwrap(); - if resp.success { - success_clone.fetch_add(1, Ordering::SeqCst); - } - }); - joins.push(join); - } - - for join in joins { - join.await.unwrap(); - } - assert_eq!(1, success.load(Ordering::SeqCst)); + test_kv_compare_and_put(kv_store).await; } #[tokio::test] async fn test_delete_range() { let kv_store = mock_mem_store_with_data().await; - let req = DeleteRangeRequest { - key: b"key3".to_vec(), - range_end: vec![], - prev_kv: true, - }; - - let resp = kv_store.delete_range(req).await.unwrap(); - assert_eq!(1, resp.prev_kvs.len()); - assert_eq!(b"key3", resp.prev_kvs[0].key()); - assert_eq!(b"val3", resp.prev_kvs[0].value()); - - let resp = kv_store.get(b"key3").await.unwrap(); - assert!(resp.is_none()); - - let req = DeleteRangeRequest { - key: b"key2".to_vec(), - range_end: vec![], - prev_kv: false, - }; - - let resp = kv_store.delete_range(req).await.unwrap(); - assert!(resp.prev_kvs.is_empty()); - - let resp = kv_store.get(b"key2").await.unwrap(); - assert!(resp.is_none()); - - let key = b"key1".to_vec(); - let range_end = util::get_prefix_end_key(b"key1"); - - let req = DeleteRangeRequest { - key: key.clone(), - range_end: range_end.clone(), - prev_kv: true, - }; - let resp = kv_store.delete_range(req).await.unwrap(); - assert_eq!(2, resp.prev_kvs.len()); - - let req = RangeRequest { - key, - range_end, - ..Default::default() - }; - let resp = kv_store.range(req).await.unwrap(); - assert!(resp.kvs.is_empty()); + test_kv_delete_range(kv_store).await; } #[tokio::test] @@ -636,35 +437,6 @@ mod tests { async fn test_batch_delete() { let kv_store = mock_mem_store_with_data().await; - 
assert!(kv_store.get(b"key1").await.unwrap().is_some()); - assert!(kv_store.get(b"key100").await.unwrap().is_none()); - - let req = BatchDeleteRequest { - keys: vec![b"key1".to_vec(), b"key100".to_vec()], - prev_kv: true, - }; - let resp = kv_store.batch_delete(req).await.unwrap(); - assert_eq!(1, resp.prev_kvs.len()); - assert_eq!( - vec![KeyValue { - key: b"key1".to_vec(), - value: b"val1".to_vec() - }], - resp.prev_kvs - ); - assert!(kv_store.get(b"key1").await.unwrap().is_none()); - - assert!(kv_store.get(b"key2").await.unwrap().is_some()); - assert!(kv_store.get(b"key3").await.unwrap().is_some()); - - let req = BatchDeleteRequest { - keys: vec![b"key2".to_vec(), b"key3".to_vec()], - prev_kv: false, - }; - let resp = kv_store.batch_delete(req).await.unwrap(); - assert!(resp.prev_kvs.is_empty()); - - assert!(kv_store.get(b"key2").await.unwrap().is_none()); - assert!(kv_store.get(b"key3").await.unwrap().is_none()); + test_kv_batch_delete(kv_store).await; } } diff --git a/src/common/meta/src/kv_backend/test.rs b/src/common/meta/src/kv_backend/test.rs new file mode 100644 index 000000000000..4f8911910072 --- /dev/null +++ b/src/common/meta/src/kv_backend/test.rs @@ -0,0 +1,352 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
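+
+//! Shared test cases for [`KvBackend`] implementations, extracted from the
+//! in-memory backend's unit tests. A backend test typically seeds data with
+//! [`prepare_kv`] and then runs the relevant `test_kv_*` case against the store.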
+ +use std::sync::atomic::{AtomicU8, Ordering}; +use std::sync::Arc; + +use super::{KvBackend, *}; +use crate::error::Error; +use crate::rpc::store::{BatchGetRequest, PutRequest}; +use crate::rpc::KeyValue; +use crate::util; + +pub fn mock_kvs() -> Vec { + vec![ + KeyValue { + key: b"key1".to_vec(), + value: b"val1".to_vec(), + }, + KeyValue { + key: b"key2".to_vec(), + value: b"val2".to_vec(), + }, + KeyValue { + key: b"key3".to_vec(), + value: b"val3".to_vec(), + }, + ] +} + +pub async fn prepare_kv(kv_store: &impl KvBackend) { + let kvs = mock_kvs(); + assert!(kv_store + .batch_put(BatchPutRequest { + kvs, + ..Default::default() + }) + .await + .is_ok()); + + assert!(kv_store + .put(PutRequest { + key: b"key11".to_vec(), + value: b"val11".to_vec(), + ..Default::default() + }) + .await + .is_ok()); +} + +pub async fn test_kv_put(kv_store: impl KvBackend) { + let resp = kv_store + .put(PutRequest { + key: b"key11".to_vec(), + value: b"val12".to_vec(), + prev_kv: false, + }) + .await + .unwrap(); + assert!(resp.prev_kv.is_none()); + + let resp = kv_store + .put(PutRequest { + key: b"key11".to_vec(), + value: b"val13".to_vec(), + prev_kv: true, + }) + .await + .unwrap(); + let prev_kv = resp.prev_kv.unwrap(); + assert_eq!(b"key11", prev_kv.key()); + assert_eq!(b"val12", prev_kv.value()); +} + +pub async fn test_kv_range(kv_store: impl KvBackend) { + let key = b"key1".to_vec(); + let range_end = util::get_prefix_end_key(b"key1"); + + let resp = kv_store + .range(RangeRequest { + key: key.clone(), + range_end: range_end.clone(), + limit: 0, + keys_only: false, + }) + .await + .unwrap(); + + assert_eq!(2, resp.kvs.len()); + assert_eq!(b"key1", resp.kvs[0].key()); + assert_eq!(b"val1", resp.kvs[0].value()); + assert_eq!(b"key11", resp.kvs[1].key()); + assert_eq!(b"val11", resp.kvs[1].value()); + + let resp = kv_store + .range(RangeRequest { + key: key.clone(), + range_end: range_end.clone(), + limit: 0, + keys_only: true, + }) + .await + .unwrap(); + + assert_eq!(2, resp.kvs.len()); + assert_eq!(b"key1", resp.kvs[0].key()); + assert_eq!(b"", resp.kvs[0].value()); + assert_eq!(b"key11", resp.kvs[1].key()); + assert_eq!(b"", resp.kvs[1].value()); + + let resp = kv_store + .range(RangeRequest { + key: key.clone(), + limit: 0, + keys_only: false, + ..Default::default() + }) + .await + .unwrap(); + + assert_eq!(1, resp.kvs.len()); + assert_eq!(b"key1", resp.kvs[0].key()); + assert_eq!(b"val1", resp.kvs[0].value()); + + let resp = kv_store + .range(RangeRequest { + key, + range_end, + limit: 1, + keys_only: false, + }) + .await + .unwrap(); + + assert_eq!(1, resp.kvs.len()); + assert_eq!(b"key1", resp.kvs[0].key()); + assert_eq!(b"val1", resp.kvs[0].value()); +} + +pub async fn test_kv_range_2(kv_store: impl KvBackend) { + kv_store + .put(PutRequest::new().with_key("atest").with_value("value")) + .await + .unwrap(); + + kv_store + .put(PutRequest::new().with_key("test").with_value("value")) + .await + .unwrap(); + + // If both key and range_end are ‘\0’, then range represents all keys. + let result = kv_store + .range(RangeRequest::new().with_range(b"\0".to_vec(), b"\0".to_vec())) + .await + .unwrap(); + + assert_eq!(result.kvs.len(), 2); + assert!(!result.more); + + // If range_end is ‘\0’, the range is all keys greater than or equal to the key argument. 
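+    // (`to_range` in rpc/store.rs implements this mapping for range and delete-range requests.)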
+ let result = kv_store + .range(RangeRequest::new().with_range(b"a".to_vec(), b"\0".to_vec())) + .await + .unwrap(); + + assert_eq!(result.kvs.len(), 2); + + let result = kv_store + .range(RangeRequest::new().with_range(b"b".to_vec(), b"\0".to_vec())) + .await + .unwrap(); + + assert_eq!(result.kvs.len(), 1); + assert_eq!(result.kvs[0].key, b"test"); + + // Fetches the keys >= "a", set limit to 1, the `more` should be true. + let result = kv_store + .range( + RangeRequest::new() + .with_range(b"a".to_vec(), b"\0".to_vec()) + .with_limit(1), + ) + .await + .unwrap(); + assert_eq!(result.kvs.len(), 1); + assert!(result.more); + + // Fetches the keys >= "a", set limit to 2, the `more` should be false. + let result = kv_store + .range( + RangeRequest::new() + .with_range(b"a".to_vec(), b"\0".to_vec()) + .with_limit(2), + ) + .await + .unwrap(); + assert_eq!(result.kvs.len(), 2); + assert!(!result.more); + + // Fetches the keys >= "a", set limit to 3, the `more` should be false. + let result = kv_store + .range( + RangeRequest::new() + .with_range(b"a".to_vec(), b"\0".to_vec()) + .with_limit(3), + ) + .await + .unwrap(); + assert_eq!(result.kvs.len(), 2); + assert!(!result.more); +} + +pub async fn test_kv_batch_get(kv_store: impl KvBackend) { + let keys = vec![]; + let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap(); + + assert!(resp.kvs.is_empty()); + + let keys = vec![b"key10".to_vec()]; + let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap(); + + assert!(resp.kvs.is_empty()); + + let keys = vec![b"key1".to_vec(), b"key3".to_vec(), b"key4".to_vec()]; + let resp = kv_store.batch_get(BatchGetRequest { keys }).await.unwrap(); + + assert_eq!(2, resp.kvs.len()); + assert_eq!(b"key1", resp.kvs[0].key()); + assert_eq!(b"val1", resp.kvs[0].value()); + assert_eq!(b"key3", resp.kvs[1].key()); + assert_eq!(b"val3", resp.kvs[1].value()); +} + +pub async fn test_kv_compare_and_put(kv_store: Arc>) { + let success = Arc::new(AtomicU8::new(0)); + + let mut joins = vec![]; + for _ in 0..20 { + let kv_store_clone = kv_store.clone(); + let success_clone = success.clone(); + let join = tokio::spawn(async move { + let req = CompareAndPutRequest { + key: b"key".to_vec(), + expect: vec![], + value: b"val_new".to_vec(), + }; + let resp = kv_store_clone.compare_and_put(req).await.unwrap(); + if resp.success { + success_clone.fetch_add(1, Ordering::SeqCst); + } + }); + joins.push(join); + } + + for join in joins { + join.await.unwrap(); + } + + assert_eq!(1, success.load(Ordering::SeqCst)); +} + +pub async fn test_kv_delete_range(kv_store: impl KvBackend) { + let req = DeleteRangeRequest { + key: b"key3".to_vec(), + range_end: vec![], + prev_kv: true, + }; + + let resp = kv_store.delete_range(req).await.unwrap(); + assert_eq!(1, resp.prev_kvs.len()); + assert_eq!(1, resp.deleted); + assert_eq!(b"key3", resp.prev_kvs[0].key()); + assert_eq!(b"val3", resp.prev_kvs[0].value()); + + let resp = kv_store.get(b"key3").await.unwrap(); + assert!(resp.is_none()); + + let req = DeleteRangeRequest { + key: b"key2".to_vec(), + range_end: vec![], + prev_kv: false, + }; + + let resp = kv_store.delete_range(req).await.unwrap(); + assert_eq!(1, resp.deleted); + assert!(resp.prev_kvs.is_empty()); + + let resp = kv_store.get(b"key2").await.unwrap(); + assert!(resp.is_none()); + + let key = b"key1".to_vec(); + let range_end = util::get_prefix_end_key(b"key1"); + + let req = DeleteRangeRequest { + key: key.clone(), + range_end: range_end.clone(), + prev_kv: true, + }; + let resp = 
kv_store.delete_range(req).await.unwrap(); + assert_eq!(2, resp.prev_kvs.len()); + + let req = RangeRequest { + key, + range_end, + ..Default::default() + }; + let resp = kv_store.range(req).await.unwrap(); + assert!(resp.kvs.is_empty()); +} + +pub async fn test_kv_batch_delete(kv_store: impl KvBackend) { + assert!(kv_store.get(b"key1").await.unwrap().is_some()); + assert!(kv_store.get(b"key100").await.unwrap().is_none()); + + let req = BatchDeleteRequest { + keys: vec![b"key1".to_vec(), b"key100".to_vec()], + prev_kv: true, + }; + let resp = kv_store.batch_delete(req).await.unwrap(); + assert_eq!(1, resp.prev_kvs.len()); + assert_eq!( + vec![KeyValue { + key: b"key1".to_vec(), + value: b"val1".to_vec() + }], + resp.prev_kvs + ); + assert!(kv_store.get(b"key1").await.unwrap().is_none()); + + assert!(kv_store.get(b"key2").await.unwrap().is_some()); + assert!(kv_store.get(b"key3").await.unwrap().is_some()); + + let req = BatchDeleteRequest { + keys: vec![b"key2".to_vec(), b"key3".to_vec()], + prev_kv: false, + }; + let resp = kv_store.batch_delete(req).await.unwrap(); + assert!(resp.prev_kvs.is_empty()); + + assert!(kv_store.get(b"key2").await.unwrap().is_none()); + assert!(kv_store.get(b"key3").await.unwrap().is_none()); +} diff --git a/src/common/meta/src/rpc/store.rs b/src/common/meta/src/rpc/store.rs index 2426442e3f5d..b307894337f2 100644 --- a/src/common/meta/src/rpc/store.rs +++ b/src/common/meta/src/rpc/store.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::fmt::{Display, Formatter}; +use std::ops::Bound; use api::v1::meta::{ BatchDeleteRequest as PbBatchDeleteRequest, BatchDeleteResponse as PbBatchDeleteResponse, @@ -30,6 +31,17 @@ use crate::error; use crate::error::Result; use crate::rpc::{util, KeyValue}; +pub fn to_range(key: Vec, range_end: Vec) -> (Bound>, Bound>) { + match (&key[..], &range_end[..]) { + (_, []) => (Bound::Included(key.clone()), Bound::Included(key)), + // If both key and range_end are ‘\0’, then range represents all keys. + ([0], [0]) => (Bound::Unbounded, Bound::Unbounded), + // If range_end is ‘\0’, the range is all keys greater than or equal to the key argument. + (_, [0]) => (Bound::Included(key), Bound::Unbounded), + (_, _) => (Bound::Included(key), Bound::Excluded(range_end)), + } +} + #[derive(Debug, Clone, Default)] pub struct RangeRequest { /// key is the first key for the range, If range_end is not given, the @@ -96,6 +108,11 @@ impl RangeRequest { } } + /// Returns the `RangeBounds`. + pub fn range(&self) -> (Bound>, Bound>) { + to_range(self.key.clone(), self.range_end.clone()) + } + /// key is the first key for the range, If range_end is not given, the /// request only looks up key. #[inline] @@ -690,6 +707,11 @@ impl DeleteRangeRequest { } } + /// Returns the `RangeBounds`. + pub fn range(&self) -> (Bound>, Bound>) { + to_range(self.key.clone(), self.range_end.clone()) + } + /// key is the first key to delete in the range. If range_end is not given, /// the range is defined to contain only the key argument. #[inline] diff --git a/src/common/runtime/src/repeated_task.rs b/src/common/runtime/src/repeated_task.rs index b3dcc781f1bd..a4f2bde8b00a 100644 --- a/src/common/runtime/src/repeated_task.rs +++ b/src/common/runtime/src/repeated_task.rs @@ -40,6 +40,7 @@ pub type BoxedTaskFunction = Box + Send + Sync + 'static> struct TaskInner { /// The repeated task handle. This handle is Some if the task is started. task_handle: Option>, + /// The task_fn to run. This is Some if the task is not started. 
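+    /// `start()` takes it out of the inner state and hands it to the background loop.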
task_fn: Option>, } @@ -50,6 +51,7 @@ pub struct RepeatedTask { inner: Mutex>, started: AtomicBool, interval: Duration, + initial_delay: Option, } impl std::fmt::Display for RepeatedTask { @@ -75,6 +77,9 @@ impl Drop for RepeatedTask { } impl RepeatedTask { + /// Creates a new repeated task. The `initial_delay` is the delay before the first execution. + /// `initial_delay` default is None, the initial interval uses the `interval`. + /// You can use `with_initial_delay` to set the `initial_delay`. pub fn new(interval: Duration, task_fn: BoxedTaskFunction) -> Self { Self { name: task_fn.name().to_string(), @@ -85,9 +90,15 @@ impl RepeatedTask { }), started: AtomicBool::new(false), interval, + initial_delay: None, } } + pub fn with_initial_delay(mut self, initial_delay: Option) -> Self { + self.initial_delay = initial_delay; + self + } + pub fn started(&self) -> bool { self.started.load(Ordering::Relaxed) } @@ -99,17 +110,21 @@ impl RepeatedTask { IllegalStateSnafu { name: &self.name } ); - let interval = self.interval; let child = self.cancel_token.child_token(); // Safety: The task is not started. let mut task_fn = inner.task_fn.take().unwrap(); + let interval = self.interval; + let mut initial_delay = self.initial_delay; // TODO(hl): Maybe spawn to a blocking runtime. let handle = runtime.spawn(async move { loop { - tokio::select! { - _ = tokio::time::sleep(interval) => {} - _ = child.cancelled() => { - return; + let sleep_time = initial_delay.take().unwrap_or(interval); + if sleep_time > Duration::ZERO { + tokio::select! { + _ = tokio::time::sleep(sleep_time) => {} + _ = child.cancelled() => { + return; + } } } if let Err(e) = task_fn.call().await { @@ -192,4 +207,21 @@ mod tests { assert_eq!(n.load(Ordering::Relaxed), 5); } + + #[tokio::test] + async fn test_repeated_task_prior_exec() { + common_telemetry::init_default_ut_logging(); + + let n = Arc::new(AtomicI32::new(0)); + let task_fn = TickTask { n: n.clone() }; + + let task = RepeatedTask::new(Duration::from_millis(100), Box::new(task_fn)) + .with_initial_delay(Some(Duration::ZERO)); + + task.start(crate::bg_runtime()).unwrap(); + tokio::time::sleep(Duration::from_millis(550)).await; + task.stop().await.unwrap(); + + assert_eq!(n.load(Ordering::Relaxed), 6); + } } diff --git a/src/frontend/src/instance/opentsdb.rs b/src/frontend/src/instance/opentsdb.rs index 1ac8fe029048..47bb940a1bb2 100644 --- a/src/frontend/src/instance/opentsdb.rs +++ b/src/frontend/src/instance/opentsdb.rs @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use api::v1::InsertRequests; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use common_error::ext::BoxedError; use servers::error as server_error; use servers::error::AuthSnafu; use servers::opentsdb::codec::DataPoint; +use servers::opentsdb::data_point_to_grpc_row_insert_requests; use servers::query_handler::OpentsdbProtocolHandler; use session::context::QueryContextRef; use snafu::prelude::*; @@ -27,23 +27,27 @@ use crate::instance::Instance; #[async_trait] impl OpentsdbProtocolHandler for Instance { - async fn exec(&self, data_point: &DataPoint, ctx: QueryContextRef) -> server_error::Result<()> { + async fn exec( + &self, + data_points: Vec, + ctx: QueryContextRef, + ) -> server_error::Result { self.plugins .get::() .as_ref() .check_permission(ctx.current_user(), PermissionReq::Opentsdb) .context(AuthSnafu)?; - let requests = InsertRequests { - inserts: vec![data_point.as_grpc_insert()], - }; - let _ = self - .handle_inserts(requests, ctx) + let (requests, _) = data_point_to_grpc_row_insert_requests(data_points)?; + let output = self + .handle_row_inserts(requests, ctx) .await .map_err(BoxedError::new) - .with_context(|_| server_error::ExecuteQuerySnafu { - query: format!("{data_point:?}"), - })?; - Ok(()) + .context(servers::error::ExecuteGrpcQuerySnafu)?; + + Ok(match output { + common_query::Output::AffectedRows(rows) => rows, + _ => unreachable!(), + }) } } diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 29e0cfc44657..e5927399385b 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -19,14 +19,18 @@ use metrics::counter; use opentelemetry_proto::tonic::collector::metrics::v1::{ ExportMetricsServiceRequest, ExportMetricsServiceResponse, }; +use opentelemetry_proto::tonic::collector::trace::v1::{ + ExportTraceServiceRequest, ExportTraceServiceResponse, +}; use servers::error::{self, AuthSnafu, Result as ServerResult}; use servers::otlp; +use servers::otlp::plugin::TraceParserRef; use servers::query_handler::OpenTelemetryProtocolHandler; use session::context::QueryContextRef; use snafu::ResultExt; use crate::instance::Instance; -use crate::metrics::OTLP_METRICS_ROWS; +use crate::metrics::{OTLP_METRICS_ROWS, OTLP_TRACES_ROWS}; #[async_trait] impl OpenTelemetryProtocolHandler for Instance { @@ -40,7 +44,7 @@ impl OpenTelemetryProtocolHandler for Instance { .as_ref() .check_permission(ctx.current_user(), PermissionReq::Otlp) .context(AuthSnafu)?; - let (requests, rows) = otlp::to_grpc_insert_requests(request)?; + let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request)?; let _ = self .handle_row_inserts(requests, ctx) .await @@ -55,4 +59,40 @@ impl OpenTelemetryProtocolHandler for Instance { }; Ok(resp) } + + async fn traces( + &self, + request: ExportTraceServiceRequest, + ctx: QueryContextRef, + ) -> ServerResult { + self.plugins + .get::() + .as_ref() + .check_permission(ctx.current_user(), PermissionReq::Otlp) + .context(AuthSnafu)?; + + let (table_name, spans) = match self.plugins.get::() { + Some(parser) => (parser.table_name(), parser.parse(request)), + None => ( + otlp::trace::TRACE_TABLE_NAME.to_string(), + otlp::trace::parse(request), + ), + }; + + let (requests, rows) = otlp::trace::to_grpc_insert_requests(table_name, spans)?; + + let _ = self + .handle_row_inserts(requests, ctx) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + + counter!(OTLP_TRACES_ROWS, rows as u64); + + let resp = 
ExportTraceServiceResponse { + // TODO(fys): add support for partial_success in future patch + partial_success: None, + }; + Ok(resp) + } } diff --git a/src/frontend/src/metrics.rs b/src/frontend/src/metrics.rs index 8a7480f9ba1b..b07bf2df9eb5 100644 --- a/src/frontend/src/metrics.rs +++ b/src/frontend/src/metrics.rs @@ -22,3 +22,4 @@ pub(crate) const METRIC_RUN_SCRIPT_ELAPSED: &str = "frontend.run_script_elapsed" pub const PROM_STORE_REMOTE_WRITE_SAMPLES: &str = "frontend.prometheus.remote_write.samples"; pub const OTLP_METRICS_ROWS: &str = "frontend.otlp.metrics.rows"; +pub const OTLP_TRACES_ROWS: &str = "frontend.otlp.traces.rows"; diff --git a/src/log-store/Cargo.toml b/src/log-store/Cargo.toml index bedf1fc64f65..7e9a275c7e4f 100644 --- a/src/log-store/Cargo.toml +++ b/src/log-store/Cargo.toml @@ -19,7 +19,7 @@ common-base = { workspace = true } common-config = { workspace = true } common-error = { workspace = true } common-macro = { workspace = true } -common-meta = { workspace = true } +common-meta = { workspace = true, features = ["testing"] } common-runtime = { workspace = true } common-telemetry = { workspace = true } futures-util.workspace = true diff --git a/src/log-store/src/raft_engine/backend.rs b/src/log-store/src/raft_engine/backend.rs index 31a002a9529b..8b27b99b1464 100644 --- a/src/log-store/src/raft_engine/backend.rs +++ b/src/log-store/src/raft_engine/backend.rs @@ -15,6 +15,7 @@ //! [KvBackend] implementation based on [raft_engine::Engine]. use std::any::Any; +use std::ops::Bound::{Excluded, Included, Unbounded}; use std::sync::RwLock; use common_error::ext::BoxedError; @@ -28,6 +29,7 @@ use common_meta::rpc::store::{ RangeRequest, RangeResponse, }; use common_meta::rpc::KeyValue; +use common_meta::util::get_next_prefix_key; use raft_engine::{Config, Engine, LogBatch}; use snafu::ResultExt; @@ -137,29 +139,48 @@ impl KvBackend for RaftEngineBackend { async fn range(&self, req: RangeRequest) -> Result { let mut res = vec![]; + let (start, end) = req.range(); + let RangeRequest { + keys_only, limit, .. 
+ } = req; + + let (start_key, end_key) = match (start, end) { + (Included(start), Included(end)) => (Some(start), Some(get_next_prefix_key(&end))), + (Unbounded, Unbounded) => (None, None), + (Included(start), Excluded(end)) => (Some(start), Some(end)), + (Included(start), Unbounded) => (Some(start), None), + _ => unreachable!(), + }; + let mut more = false; + let mut iter = 0; + self.engine .read() .unwrap() .scan_raw_messages( SYSTEM_NAMESPACE, - Some(&req.key), - Some(&req.range_end), + start_key.as_deref(), + end_key.as_deref(), false, |key, value| { - res.push(KeyValue { - key: key.to_vec(), - value: value.to_vec(), - }); - true + let take = limit == 0 || iter != limit; + iter += 1; + more = limit > 0 && iter > limit; + + if take { + res.push(KeyValue { + key: key.to_vec(), + value: if keys_only { vec![] } else { value.to_vec() }, + }); + } + + take }, ) .context(RaftEngineSnafu) .map_err(BoxedError::new) .context(meta_error::ExternalSnafu)?; - Ok(RangeResponse { - kvs: res, - more: false, - }) + Ok(RangeResponse { kvs: res, more }) } async fn put(&self, req: PutRequest) -> Result { @@ -275,7 +296,7 @@ impl KvBackend for RaftEngineBackend { key, range_end, limit: 0, - keys_only: true, + keys_only: false, }; let range_resp = self.range(range).await?; @@ -383,7 +404,12 @@ fn engine_delete(engine: &Engine, key: &[u8]) -> meta_error::Result<()> { #[cfg(test)] mod tests { use std::collections::HashSet; + use std::sync::Arc; + use common_meta::kv_backend::test::{ + prepare_kv, test_kv_batch_delete, test_kv_batch_get, test_kv_compare_and_put, + test_kv_delete_range, test_kv_put, test_kv_range, test_kv_range_2, + }; use common_test_util::temp_dir::create_temp_dir; use raft_engine::{Config, ReadableSize, RecoveryMode}; @@ -615,4 +641,66 @@ mod tests { keys ); } + + #[tokio::test] + async fn test_range() { + let dir = create_temp_dir("range"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + prepare_kv(&backend).await; + + test_kv_range(backend).await; + } + + #[tokio::test] + async fn test_range_2() { + let dir = create_temp_dir("range2"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + + test_kv_range_2(backend).await; + } + + #[tokio::test] + async fn test_put() { + let dir = create_temp_dir("put"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + prepare_kv(&backend).await; + + test_kv_put(backend).await; + } + + #[tokio::test] + async fn test_batch_get() { + let dir = create_temp_dir("batch_get"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + prepare_kv(&backend).await; + + test_kv_batch_get(backend).await; + } + + #[tokio::test] + async fn test_batch_delete() { + let dir = create_temp_dir("batch_delete"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + prepare_kv(&backend).await; + + test_kv_batch_delete(backend).await; + } + + #[tokio::test] + async fn test_delete_range() { + let dir = create_temp_dir("delete_range"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + prepare_kv(&backend).await; + + test_kv_delete_range(backend).await; + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_compare_and_put_2() { + let dir = create_temp_dir("compare_and_put"); + let backend = build_kv_backend(dir.path().to_str().unwrap().to_string()); + prepare_kv(&backend).await; + + test_kv_compare_and_put(Arc::new(backend)).await; + } } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 
7cb8d2967921..09b33a886bde 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -28,6 +28,7 @@ const DEFAULT_MAX_BG_JOB: usize = 4; /// Configuration for [MitoEngine](crate::engine::MitoEngine). #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +#[serde(default)] pub struct MitoConfig { // Worker configs: /// Number of region workers (default 1). diff --git a/src/mito2/src/engine/create_test.rs b/src/mito2/src/engine/create_test.rs index 31cb4fd031cd..bfe9af8cbbdd 100644 --- a/src/mito2/src/engine/create_test.rs +++ b/src/mito2/src/engine/create_test.rs @@ -15,7 +15,7 @@ use std::time::Duration; use store_api::region_engine::RegionEngine; -use store_api::region_request::RegionRequest; +use store_api::region_request::{RegionCloseRequest, RegionRequest}; use store_api::storage::RegionId; use crate::config::MitoConfig; @@ -55,6 +55,37 @@ async fn test_engine_create_existing_region() { .unwrap(); } +#[tokio::test] +async fn test_engine_create_close_create_region() { + // This test will trigger create_or_open function. + let mut env = TestEnv::with_prefix("create-close-create"); + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let builder = CreateRequestBuilder::new(); + // Create a region with id 1. + engine + .handle_request(region_id, RegionRequest::Create(builder.build())) + .await + .unwrap(); + // Close the region. + engine + .handle_request(region_id, RegionRequest::Close(RegionCloseRequest {})) + .await + .unwrap(); + // Create the same region id again. + engine + .handle_request(region_id, RegionRequest::Create(builder.build())) + .await + .unwrap(); + + assert!(engine.is_region_exists(region_id)); + + let region = engine.get_region(region_id).unwrap(); + + assert!(region.is_writable()); +} + #[tokio::test] async fn test_engine_create_with_different_id() { let mut env = TestEnv::new(); diff --git a/src/mito2/src/engine/prune_test.rs b/src/mito2/src/engine/prune_test.rs index 503839c66ce7..77f0d37dcb89 100644 --- a/src/mito2/src/engine/prune_test.rs +++ b/src/mito2/src/engine/prune_test.rs @@ -17,7 +17,7 @@ use common_query::logical_plan::DfExpr; use common_query::prelude::Expr; use common_recordbatch::RecordBatches; use datafusion_common::ScalarValue; -use datafusion_expr::lit; +use datafusion_expr::{col, lit}; use store_api::region_engine::RegionEngine; use store_api::region_request::RegionRequest; use store_api::storage::{RegionId, ScanRequest}; @@ -46,7 +46,7 @@ async fn check_prune_row_groups(expr: DfExpr, expected: &str) { region_id, Rows { schema: column_schemas.clone(), - rows: build_rows(0, 10), + rows: build_rows(0, 15), }, ) .await; @@ -76,6 +76,16 @@ async fn test_read_parquet_stats() { +-------+---------+---------------------+ | tag_0 | field_0 | ts | +-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 10 | 10.0 | 1970-01-01T00:00:10 | +| 11 | 11.0 | 1970-01-01T00:00:11 | +| 12 | 12.0 | 1970-01-01T00:00:12 | +| 13 | 13.0 | 1970-01-01T00:00:13 | +| 14 | 14.0 | 1970-01-01T00:00:14 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | | 5 | 5.0 | 1970-01-01T00:00:05 | | 6 | 6.0 | 1970-01-01T00:00:06 | | 7 | 7.0 | 1970-01-01T00:00:07 | @@ -84,7 +94,11 @@ async fn test_read_parquet_stats() { +-------+---------+---------------------+", ) .await; +} +#[tokio::test] +async fn test_prune_tag() { + // prune result: only row group 1&2 check_prune_row_groups( 
datafusion_expr::col("tag_0").gt(lit(ScalarValue::Utf8(Some("4".to_string())))), "\ @@ -100,3 +114,25 @@ async fn test_read_parquet_stats() { ) .await; } + +#[tokio::test] +async fn test_prune_tag_and_field() { + common_telemetry::init_default_ut_logging(); + // prune result: only row group 1 + check_prune_row_groups( + col("tag_0") + .gt(lit(ScalarValue::Utf8(Some("4".to_string())))) + .and(col("field_0").lt(lit(8.0))), + "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 5 | 5.0 | 1970-01-01T00:00:05 | +| 6 | 6.0 | 1970-01-01T00:00:06 | +| 7 | 7.0 | 1970-01-01T00:00:07 | +| 8 | 8.0 | 1970-01-01T00:00:08 | +| 9 | 9.0 | 1970-01-01T00:00:09 | ++-------+---------+---------------------+", + ) + .await; +} diff --git a/src/mito2/src/manifest/manager.rs b/src/mito2/src/manifest/manager.rs index e9c85eea3958..653cd7511fc8 100644 --- a/src/mito2/src/manifest/manager.rs +++ b/src/mito2/src/manifest/manager.rs @@ -154,6 +154,12 @@ impl RegionManifestManager { let inner = self.inner.read().await; inner.store.clone() } + + /// Returns total manifest size. + pub async fn manifest_size(&self) -> u64 { + let inner = self.inner.read().await; + inner.total_manifest_size() + } } #[cfg(test)] @@ -186,7 +192,7 @@ impl RegionManifestManagerInner { /// Creates a new manifest. async fn new(metadata: RegionMetadataRef, options: RegionManifestOptions) -> Result { // construct storage - let store = ManifestObjectStore::new( + let mut store = ManifestObjectStore::new( &options.manifest_dir, options.object_store.clone(), options.compress_type, @@ -232,7 +238,7 @@ impl RegionManifestManagerInner { /// Returns `Ok(None)` if no such manifest. async fn open(options: RegionManifestOptions) -> Result> { // construct storage - let store = ManifestObjectStore::new( + let mut store = ManifestObjectStore::new( &options.manifest_dir, options.object_store.clone(), options.compress_type, @@ -240,8 +246,9 @@ impl RegionManifestManagerInner { // recover from storage // construct manifest builder + // calculate the manifest size from the latest checkpoint let mut version = MIN_VERSION; - let checkpoint = Self::last_checkpoint(&store).await?; + let checkpoint = Self::last_checkpoint(&mut store).await?; let last_checkpoint_version = checkpoint .as_ref() .map(|checkpoint| checkpoint.last_version) @@ -265,6 +272,8 @@ impl RegionManifestManagerInner { let mut action_iter = store.scan(version, MAX_VERSION).await?; while let Some((manifest_version, raw_action_list)) = action_iter.next_log().await? { let action_list = RegionMetaActionList::decode(&raw_action_list)?; + // set manifest size after last checkpoint + store.set_delta_file_size(manifest_version, raw_action_list.len() as u64); for action in action_list.actions { match action { RegionMetaAction::Change(action) => { @@ -312,6 +321,7 @@ impl RegionManifestManagerInner { Ok(()) } + /// Update the manifest. Return the current manifest version number. async fn update(&mut self, action_list: RegionMetaActionList) -> Result { let version = self.increase_version(); self.store.save(version, &action_list.encode()?).await?; @@ -343,6 +353,11 @@ impl RegionManifestManagerInner { Ok(version) } + + /// Returns total manifest size. + pub(crate) fn total_manifest_size(&self) -> u64 { + self.store.total_manifest_size() + } } impl RegionManifestManagerInner { @@ -369,8 +384,8 @@ impl RegionManifestManagerInner { } /// Make a new checkpoint. Return the fresh one if there are some actions to compact. 
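With the new accessor, callers can observe manifest disk usage without listing the directory themselves. A hedged sketch (the reporting function is illustrative; manifest_size is the API added by this patch):

async fn report_manifest_usage(manager: &RegionManifestManager) -> u64 {
    // Sum of the tracked delta (.json) and checkpoint (.checkpoint) sizes,
    // recovered from the latest checkpoint onward when a region is reopened.
    manager.manifest_size().await
}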
- async fn do_checkpoint(&self) -> Result> { - let last_checkpoint = Self::last_checkpoint(&self.store).await?; + async fn do_checkpoint(&mut self) -> Result> { + let last_checkpoint = Self::last_checkpoint(&mut self.store).await?; let current_version = self.last_version; let (start_version, mut manifest_builder) = if let Some(checkpoint) = last_checkpoint { @@ -441,7 +456,7 @@ impl RegionManifestManagerInner { /// Fetch the last [RegionCheckpoint] from storage. pub(crate) async fn last_checkpoint( - store: &ManifestObjectStore, + store: &mut ManifestObjectStore, ) -> Result> { let last_checkpoint = store.load_last_checkpoint().await?; @@ -456,14 +471,16 @@ impl RegionManifestManagerInner { #[cfg(test)] mod test { + use api::v1::SemanticType; use common_datasource::compression::CompressionType; + use common_test_util::temp_dir::create_temp_dir; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; use super::*; - use crate::manifest::action::RegionChange; + use crate::manifest::action::{RegionChange, RegionEdit}; use crate::manifest::tests::utils::basic_region_metadata; use crate::test_util::TestEnv; @@ -546,4 +563,95 @@ mod test { .unwrap(); manager.validate_manifest(&new_metadata, 1).await; } + + /// Just for test, refer to wal_dir_usage in src/store-api/src/logstore.rs. + async fn manifest_dir_usage(path: &str) -> u64 { + let mut size = 0; + let mut read_dir = tokio::fs::read_dir(path).await.unwrap(); + while let Ok(dir_entry) = read_dir.next_entry().await { + let Some(entry) = dir_entry else { + break; + }; + if entry.file_type().await.unwrap().is_file() { + let file_name = entry.file_name().into_string().unwrap(); + if file_name.contains(".checkpoint") || file_name.contains(".json") { + let file_size = entry.metadata().await.unwrap().len() as usize; + debug!("File: {file_name:?}, size: {file_size}"); + size += file_size; + } + } + } + size as u64 + } + + #[tokio::test] + async fn test_manifest_size() { + let metadata = Arc::new(basic_region_metadata()); + let data_home = create_temp_dir(""); + let data_home_path = data_home.path().to_str().unwrap().to_string(); + let env = TestEnv::with_data_home(data_home); + + let manifest_dir = format!("{}/manifest", data_home_path); + + let manager = env + .create_manifest_manager(CompressionType::Uncompressed, 10, Some(metadata.clone())) + .await + .unwrap() + .unwrap(); + + let mut new_metadata_builder = RegionMetadataBuilder::from_existing((*metadata).clone()); + new_metadata_builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("val2", ConcreteDataType::float64_datatype(), false), + semantic_type: SemanticType::Field, + column_id: 252, + }); + let new_metadata = Arc::new(new_metadata_builder.build().unwrap()); + + let action_list = + RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { + metadata: new_metadata.clone(), + })); + + let current_version = manager.update(action_list).await.unwrap(); + assert_eq!(current_version, 1); + manager.validate_manifest(&new_metadata, 1).await; + + // get manifest size + let manifest_size = manager.manifest_size().await; + assert_eq!(manifest_size, manifest_dir_usage(&manifest_dir).await); + + // update 10 times nop_action to trigger checkpoint + for _ in 0..10 { + manager + .update(RegionMetaActionList::new(vec![RegionMetaAction::Edit( + RegionEdit { + files_to_add: vec![], + files_to_remove: vec![], + compaction_time_window: None, + flushed_entry_id: None, + 
flushed_sequence: None, + }, + )])) + .await + .unwrap(); + } + + // check manifest size again + let manifest_size = manager.manifest_size().await; + assert_eq!(manifest_size, manifest_dir_usage(&manifest_dir).await); + + // Reopen the manager, + // we just calculate the size from the latest checkpoint file + manager.stop().await.unwrap(); + let manager = env + .create_manifest_manager(CompressionType::Uncompressed, 10, None) + .await + .unwrap() + .unwrap(); + manager.validate_manifest(&new_metadata, 11).await; + + // get manifest size again + let manifest_size = manager.manifest_size().await; + assert_eq!(manifest_size, 1312); + } } diff --git a/src/mito2/src/manifest/storage.rs b/src/mito2/src/manifest/storage.rs index a0f7dbf9714e..edd63ac52162 100644 --- a/src/mito2/src/manifest/storage.rs +++ b/src/mito2/src/manifest/storage.rs @@ -129,11 +129,22 @@ impl ObjectStoreLogIterator { } } +/// Key to identify a manifest file. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +enum FileKey { + /// A delta file (`.json`). + Delta(ManifestVersion), + /// A checkpoint file (`.checkpoint`). + Checkpoint(ManifestVersion), +} + #[derive(Clone, Debug)] pub struct ManifestObjectStore { object_store: ObjectStore, compress_type: CompressionType, path: String, + /// Stores the size of each manifest file. + manifest_size_map: HashMap, } impl ManifestObjectStore { @@ -142,6 +153,7 @@ impl ManifestObjectStore { object_store, compress_type, path: util::normalize_dir(path), + manifest_size_map: HashMap::new(), } } @@ -184,6 +196,7 @@ impl ManifestObjectStore { .context(OpenDalSnafu) } + /// Scan the manifest files in the range of [start, end) and return the iterator. pub async fn scan( &self, start: ManifestVersion, @@ -212,8 +225,12 @@ impl ManifestObjectStore { }) } + /// Delete manifest files that version < end. + /// If keep_last_checkpoint is true, the last checkpoint file will be kept. + /// ### Return + /// The number of deleted files. pub async fn delete_until( - &self, + &mut self, end: ManifestVersion, keep_last_checkpoint: bool, ) -> Result { @@ -248,7 +265,7 @@ impl ManifestObjectStore { } else { None }; - let paths: Vec<_> = entries + let del_entries: Vec<_> = entries .iter() .filter(|(_e, is_checkpoint, version)| { if let Some(max_version) = checkpoint_version { @@ -264,12 +281,15 @@ impl ManifestObjectStore { true } }) - .map(|e| e.0.path().to_string()) .collect(); + let paths = del_entries + .iter() + .map(|(e, _, _)| e.path().to_string()) + .collect::>(); let ret = paths.len(); debug!( - "Deleting {} logs from manifest storage path {} until {}, checkpoint: {:?}, paths: {:?}", + "Deleting {} logs from manifest storage path {} until {}, checkpoint_version: {:?}, paths: {:?}", ret, self.path, end, @@ -282,10 +302,21 @@ impl ManifestObjectStore { .await .context(OpenDalSnafu)?; + // delete manifest sizes + for (_, is_checkpoint, version) in &del_entries { + if *is_checkpoint { + self.manifest_size_map + .remove(&FileKey::Checkpoint(*version)); + } else { + self.manifest_size_map.remove(&FileKey::Delta(*version)); + } + } + Ok(ret) } - pub async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> { + /// Save the delta manifest file. 
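The per-file bookkeeping above can be read as a tiny worked example inside the storage module (byte counts taken from the uncompressed-size test later in this file): each on-disk file's size is keyed by its kind and version, so the total is a single sum.

use std::collections::HashMap;

let mut sizes: HashMap<FileKey, u64> = HashMap::new();
sizes.insert(FileKey::Delta(7), 8);       // 00000000000000000007.json, "hello, 7" is 8 bytes
sizes.insert(FileKey::Checkpoint(7), 23); // 00000000000000000007.checkpoint
assert_eq!(sizes.values().sum::<u64>(), 31);
// delete_until() removes the matching FileKey entries, keeping the total in sync.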
+ pub async fn save(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> { let path = self.delta_file_path(version); debug!("Save log to manifest storage, version: {}", version); let data = self @@ -296,13 +327,17 @@ impl ManifestObjectStore { compress_type: self.compress_type, path: &path, })?; + let delta_size = data.len(); self.object_store .write(&path, data) .await - .context(OpenDalSnafu) + .context(OpenDalSnafu)?; + self.set_delta_file_size(version, delta_size as u64); + Ok(()) } - pub async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> { + /// Save the checkpoint manifest file. + pub async fn save_checkpoint(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> { let path = self.checkpoint_file_path(version); let data = self .compress_type @@ -312,10 +347,12 @@ impl ManifestObjectStore { compress_type: self.compress_type, path: &path, })?; + let checkpoint_size = data.len(); self.object_store .write(&path, data) .await .context(OpenDalSnafu)?; + self.set_checkpoint_file_size(version, checkpoint_size as u64); // Because last checkpoint file only contain size and version, which is tiny, so we don't compress it. let last_checkpoint_path = self.last_checkpoint_path(); @@ -342,7 +379,7 @@ impl ManifestObjectStore { } pub async fn load_checkpoint( - &self, + &mut self, version: ManifestVersion, ) -> Result)>> { let path = self.checkpoint_file_path(version); @@ -351,12 +388,15 @@ impl ManifestObjectStore { let checkpoint_data = match self.object_store.read(&path).await { Ok(checkpoint) => { + let checkpoint_size = checkpoint.len(); let decompress_data = self.compress_type.decode(checkpoint).await.context( DecompressObjectSnafu { compress_type: self.compress_type, path, }, )?; + // set the checkpoint size + self.set_checkpoint_file_size(version, checkpoint_size as u64); Ok(Some(decompress_data)) } Err(e) => { @@ -373,6 +413,7 @@ impl ManifestObjectStore { ); match self.object_store.read(&fall_back_path).await { Ok(checkpoint) => { + let checkpoint_size = checkpoint.len(); let decompress_data = FALL_BACK_COMPRESS_TYPE .decode(checkpoint) .await @@ -380,6 +421,7 @@ impl ManifestObjectStore { compress_type: FALL_BACK_COMPRESS_TYPE, path, })?; + self.set_checkpoint_file_size(version, checkpoint_size as u64); Ok(Some(decompress_data)) } Err(e) if e.kind() == ErrorKind::NotFound => Ok(None), @@ -398,7 +440,7 @@ impl ManifestObjectStore { /// Load the latest checkpoint. /// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any - pub async fn load_last_checkpoint(&self) -> Result)>> { + pub async fn load_last_checkpoint(&mut self) -> Result)>> { let last_checkpoint_path = self.last_checkpoint_path(); let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await { Ok(data) => data, @@ -424,6 +466,22 @@ impl ManifestObjectStore { pub async fn read_file(&self, path: &str) -> Result> { self.object_store.read(path).await.context(OpenDalSnafu) } + + /// Compute the size(Byte) in manifest size map. + pub(crate) fn total_manifest_size(&self) -> u64 { + self.manifest_size_map.values().sum() + } + + /// Set the size of the delta file by delta version. + pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) { + self.manifest_size_map.insert(FileKey::Delta(version), size); + } + + /// Set the size of the checkpoint file by checkpoint version. 
+ pub(crate) fn set_checkpoint_file_size(&mut self, version: ManifestVersion, size: u64) { + self.manifest_size_map + .insert(FileKey::Checkpoint(version), size); + } } #[derive(Serialize, Deserialize, Debug)] @@ -489,7 +547,7 @@ mod tests { test_manifest_log_store_case(log_store).await; } - async fn test_manifest_log_store_case(log_store: ManifestObjectStore) { + async fn test_manifest_log_store_case(mut log_store: ManifestObjectStore) { for v in 0..5 { log_store .save(v, format!("hello, {v}").as_bytes()) @@ -600,4 +658,92 @@ mod tests { let mut it = log_store.scan(0, 10).await.unwrap(); assert!(it.next_log().await.unwrap().is_none()); } + + #[tokio::test] + async fn test_file_version() { + let version = file_version("00000000000000000007.checkpoint"); + assert_eq!(version, 7); + + let name = delta_file(version); + assert_eq!(name, "00000000000000000007.json"); + + let name = checkpoint_file(version); + assert_eq!(name, "00000000000000000007.checkpoint"); + } + + #[tokio::test] + async fn test_uncompressed_manifest_files_size() { + let mut log_store = new_test_manifest_store(); + // write 5 manifest files with uncompressed(8B per file) + log_store.compress_type = CompressionType::Uncompressed; + for v in 0..5 { + log_store + .save(v, format!("hello, {v}").as_bytes()) + .await + .unwrap(); + } + // write 1 checkpoint file with uncompressed(23B) + log_store + .save_checkpoint(5, "checkpoint_uncompressed".as_bytes()) + .await + .unwrap(); + + // manifest files size + assert_eq!(log_store.total_manifest_size(), 63); + + // delete 3 manifest files + assert_eq!(log_store.delete_until(3, false).await.unwrap(), 3); + + // manifest files size after delete + assert_eq!(log_store.total_manifest_size(), 39); + + // delete all manifest files + assert_eq!( + log_store + .delete_until(ManifestVersion::MAX, false) + .await + .unwrap(), + 3 + ); + + assert_eq!(log_store.total_manifest_size(), 0); + } + + #[tokio::test] + async fn test_compressed_manifest_files_size() { + let mut log_store = new_test_manifest_store(); + // Test with compressed manifest files + log_store.compress_type = CompressionType::Gzip; + // write 5 manifest files + for v in 0..5 { + log_store + .save(v, format!("hello, {v}").as_bytes()) + .await + .unwrap(); + } + log_store + .save_checkpoint(5, "checkpoint_compressed".as_bytes()) + .await + .unwrap(); + + // manifest files size + assert_eq!(log_store.total_manifest_size(), 181); + + // delete 3 manifest files + assert_eq!(log_store.delete_until(3, false).await.unwrap(), 3); + + // manifest files size after delete + assert_eq!(log_store.total_manifest_size(), 97); + + // delete all manifest files + assert_eq!( + log_store + .delete_until(ManifestVersion::MAX, false) + .await + .unwrap(), + 3 + ); + + assert_eq!(log_store.total_manifest_size(), 0); + } } diff --git a/src/mito2/src/manifest/tests/checkpoint.rs b/src/mito2/src/manifest/tests/checkpoint.rs index 68c7063e1e63..c28f6cd6d598 100644 --- a/src/mito2/src/manifest/tests/checkpoint.rs +++ b/src/mito2/src/manifest/tests/checkpoint.rs @@ -202,7 +202,7 @@ async fn generate_checkpoint_with_compression_types( manager.update(action).await.unwrap(); } - RegionManifestManagerInner::last_checkpoint(&manager.store().await) + RegionManifestManagerInner::last_checkpoint(&mut manager.store().await) .await .unwrap() .unwrap() diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index 6ab6f984ad05..efa6bc9506d6 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs 
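The numbers asserted in the uncompressed-size test above follow from simple arithmetic; the Gzip variant tracks the post-compression sizes instead (181 bytes total, 97 after dropping three deltas).

// Five "hello, N" deltas at 8 bytes each plus a 23-byte checkpoint file:
assert_eq!(5 * 8 + 23, 63);
// delete_until(3, false) removes versions 0, 1 and 2 (three deltas):
assert_eq!(63 - 3 * 8, 39);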
@@ -20,8 +20,11 @@ use std::sync::{Arc, RwLock}; use api::v1::OpType; use common_telemetry::debug; +use datafusion::physical_plan::PhysicalExpr; +use datafusion_common::ScalarValue; +use datafusion_expr::ColumnarValue; use datatypes::arrow; -use datatypes::arrow::array::ArrayRef; +use datatypes::arrow::array::{ArrayRef, BooleanArray}; use datatypes::arrow::record_batch::RecordBatch; use datatypes::data_type::DataType; use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector, VectorRef}; @@ -300,12 +303,16 @@ impl SeriesSet { let (primary_key_builders, primary_key_schema) = primary_key_builders(&self.region_metadata, 1); + let physical_exprs: Vec<_> = predicate + .and_then(|p| p.to_physical_exprs(&primary_key_schema).ok()) + .unwrap_or_default(); + Iter { metadata: self.region_metadata.clone(), series: self.series.clone(), projection, last_key: None, - predicate, + predicate: physical_exprs, pk_schema: primary_key_schema, primary_key_builders, codec: self.codec.clone(), @@ -341,7 +348,7 @@ struct Iter { series: Arc, projection: HashSet, last_key: Option>, - predicate: Option, + predicate: Vec>, pk_schema: arrow::datatypes::SchemaRef, primary_key_builders: Vec>, codec: Arc, @@ -362,18 +369,18 @@ impl Iterator for Iter { // TODO(hl): maybe yield more than one time series to amortize range overhead. for (primary_key, series) in range { let mut series = series.write().unwrap(); - if let Some(predicate) = &self.predicate { - if !prune_primary_key( + if !self.predicate.is_empty() + && !prune_primary_key( &self.codec, primary_key.as_slice(), &mut series, &mut self.primary_key_builders, self.pk_schema.clone(), - predicate, - ) { - // read next series - continue; - } + &self.predicate, + ) + { + // read next series + continue; } self.last_key = Some(primary_key.clone()); @@ -392,7 +399,7 @@ fn prune_primary_key( series: &mut Series, builders: &mut Vec>, pk_schema: arrow::datatypes::SchemaRef, - predicate: &Predicate, + predicate: &[Arc], ) -> bool { // no primary key, we simply return true. 
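For reference, this is roughly how a single lowered filter decides whether a series survives pruning, using the imports added in this hunk (an illustrative sketch, not code from the patch): evaluate it against a one-row RecordBatch of decoded primary-key values and read back a single boolean, treating errors and nulls as "keep the series".

fn series_matches(expr: &Arc<dyn PhysicalExpr>, pk_batch: &RecordBatch) -> bool {
    match expr.evaluate(pk_batch) {
        Ok(ColumnarValue::Array(array)) => array
            .as_any()
            .downcast_ref::<BooleanArray>()
            .and_then(|b| b.iter().next().flatten())
            .unwrap_or(true),
        Ok(ColumnarValue::Scalar(ScalarValue::Boolean(v))) => v.unwrap_or(true),
        _ => true,
    }
}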
if pk_schema.fields().is_empty() { @@ -400,20 +407,52 @@ fn prune_primary_key( } if let Some(rb) = series.pk_cache.as_ref() { - let res = predicate.prune_primary_key(rb).unwrap_or(true); + let res = prune_inner(predicate, rb).unwrap_or(true); debug!("Prune primary key: {:?}, res: {:?}", rb, res); res } else { let Ok(rb) = pk_to_record_batch(codec, pk, builders, pk_schema) else { return true; }; - let res = predicate.prune_primary_key(&rb).unwrap_or(true); + let res = prune_inner(predicate, &rb).unwrap_or(true); debug!("Prune primary key: {:?}, res: {:?}", rb, res); series.update_pk_cache(rb); res } } +fn prune_inner(predicates: &[Arc], primary_key: &RecordBatch) -> Result { + for expr in predicates { + // evaluate every filter against primary key + let Ok(eva) = expr.evaluate(primary_key) else { + continue; + }; + let result = match eva { + ColumnarValue::Array(array) => { + let predicate_array = array.as_any().downcast_ref::().unwrap(); + predicate_array + .into_iter() + .map(|x| x.unwrap_or(true)) + .next() + .unwrap_or(true) + } + // result was a column + ColumnarValue::Scalar(ScalarValue::Boolean(v)) => v.unwrap_or(true), + _ => { + unreachable!("Unexpected primary key record batch evaluation result: {:?}, primary key: {:?}", eva, primary_key); + } + }; + debug!( + "Evaluate primary key {:?} against filter: {:?}, result: {:?}", + primary_key, expr, result + ); + if !result { + return Ok(false); + } + } + Ok(true) +} + fn pk_to_record_batch( codec: &Arc, bytes: &[u8], diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 403c38b7fa81..6c6415b6f2cd 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -17,13 +17,12 @@ use common_recordbatch::SendableRecordBatchStream; use common_telemetry::debug; use common_time::range::TimestampRange; -use snafu::ResultExt; use store_api::storage::ScanRequest; use table::predicate::{Predicate, TimeRangePredicateBuilder}; use crate::access_layer::AccessLayerRef; use crate::cache::CacheManagerRef; -use crate::error::{BuildPredicateSnafu, Result}; +use crate::error::Result; use crate::read::projection::ProjectionMapper; use crate::read::seq_scan::SeqScan; use crate::region::version::VersionRef; @@ -173,11 +172,7 @@ impl ScanRegion { total_ssts ); - let predicate = Predicate::try_new( - self.request.filters.clone(), - self.version.metadata.schema.clone(), - ) - .context(BuildPredicateSnafu)?; + let predicate = Predicate::new(self.request.filters.clone()); let mapper = match &self.request.projection { Some(p) => ProjectionMapper::new(&self.version.metadata, p.iter().copied())?, None => ProjectionMapper::all(&self.version.metadata)?, diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 6a8a3a805f7c..a224ee1e195b 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -119,7 +119,8 @@ impl RegionOpener { &expect.column_metadatas, &expect.primary_key, )?; - + // To keep consistence with Create behavior, set the opened Region writable. 
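Worth noting for the scan path changed above: Predicate no longer needs the table schema at construction time, because filters are lowered where pruning actually happens (prune_with_stats now receives the arrow schema, and the memtable lowers against its primary-key schema). A before/after sketch:

// Before: building the predicate required the region schema and could fail.
// let predicate = Predicate::try_new(filters, schema).context(BuildPredicateSnafu)?;
// After: the predicate only stores the logical filter expressions.
let predicate = Predicate::new(self.request.filters.clone());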
+ region.set_writable(true); return Ok(region); } Ok(None) => { diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 872c0e410408..481f98f1af12 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -16,6 +16,7 @@ mod format; pub mod reader; +pub mod row_group; mod stats; pub mod writer; diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 3eade74a4c62..0e40a909a364 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -188,8 +188,9 @@ impl ParquetReaderBuilder { &read_format, column_ids, ); + let pruned_row_groups = predicate - .prune_with_stats(&stats) + .prune_with_stats(&stats, read_format.metadata().schema.arrow_schema()) .into_iter() .enumerate() .filter_map(|(idx, valid)| if valid { Some(idx) } else { None }) diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs new file mode 100644 index 000000000000..a80f7c874253 --- /dev/null +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -0,0 +1,230 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Ports private structs from [parquet crate](https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/arrow/async_reader/mod.rs#L644-L650). + +use std::sync::Arc; + +use bytes::{Buf, Bytes}; +use parquet::arrow::arrow_reader::{RowGroups, RowSelection}; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::ProjectionMask; +use parquet::column::page::{PageIterator, PageReader}; +use parquet::errors::{ParquetError, Result}; +use parquet::file::metadata::RowGroupMetaData; +use parquet::file::reader::{ChunkReader, Length}; +use parquet::file::serialized_reader::SerializedPageReader; +use parquet::format::PageLocation; + +/// An in-memory collection of column chunks +pub struct InMemoryRowGroup<'a> { + metadata: &'a RowGroupMetaData, + page_locations: Option<&'a [Vec]>, + column_chunks: Vec>>, + row_count: usize, +} + +impl<'a> InMemoryRowGroup<'a> { + /// Fetches the necessary column data into memory + // TODO(yingwen): Fix clippy warnings. + #[allow(clippy::filter_map_bool_then)] + #[allow(clippy::useless_conversion)] + pub async fn fetch( + &mut self, + input: &mut T, + projection: &ProjectionMask, + selection: Option<&RowSelection>, + ) -> Result<()> { + if let Some((selection, page_locations)) = selection.zip(self.page_locations) { + // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the + // `RowSelection` + let mut page_start_offsets: Vec> = vec![]; + + let fetch_ranges = self + .column_chunks + .iter() + .zip(self.metadata.columns()) + .enumerate() + .filter_map(|(idx, (chunk, chunk_meta))| { + (chunk.is_none() && projection.leaf_included(idx)).then(|| { + // If the first page does not start at the beginning of the column, + // then we need to also fetch a dictionary page. 
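A hedged usage sketch of the ported row-group reader (identifiers are placeholders): after row-group pruning, the reader builds one InMemoryRowGroup per selected group and fetches only the projected column chunks, or only the pages a RowSelection needs when the offset index is available.

in_memory_row_group
    .fetch(&mut async_file_reader, &projection_mask, row_selection.as_ref())
    .await?;
// The populated group then serves pages through the RowGroups trait implemented below.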
+ let mut ranges = vec![]; + let (start, _len) = chunk_meta.byte_range(); + match page_locations[idx].first() { + Some(first) if first.offset as u64 != start => { + ranges.push(start as usize..first.offset as usize); + } + _ => (), + } + + ranges.extend(selection.scan_ranges(&page_locations[idx])); + page_start_offsets.push(ranges.iter().map(|range| range.start).collect()); + + ranges + }) + }) + .flatten() + .collect(); + + let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter(); + let mut page_start_offsets = page_start_offsets.into_iter(); + + for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { + if chunk.is_some() || !projection.leaf_included(idx) { + continue; + } + + if let Some(offsets) = page_start_offsets.next() { + let mut chunks = Vec::with_capacity(offsets.len()); + for _ in 0..offsets.len() { + chunks.push(chunk_data.next().unwrap()); + } + + *chunk = Some(Arc::new(ColumnChunkData::Sparse { + length: self.metadata.column(idx).byte_range().1 as usize, + data: offsets.into_iter().zip(chunks.into_iter()).collect(), + })) + } + } + } else { + let fetch_ranges = self + .column_chunks + .iter() + .enumerate() + .filter_map(|(idx, chunk)| { + (chunk.is_none() && projection.leaf_included(idx)).then(|| { + let column = self.metadata.column(idx); + let (start, length) = column.byte_range(); + start as usize..(start + length) as usize + }) + }) + .collect(); + + let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter(); + + for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { + if chunk.is_some() || !projection.leaf_included(idx) { + continue; + } + + if let Some(data) = chunk_data.next() { + *chunk = Some(Arc::new(ColumnChunkData::Dense { + offset: self.metadata.column(idx).byte_range().0 as usize, + data, + })); + } + } + } + + Ok(()) + } +} + +impl<'a> RowGroups for InMemoryRowGroup<'a> { + fn num_rows(&self) -> usize { + self.row_count + } + + fn column_chunks(&self, i: usize) -> Result> { + match &self.column_chunks[i] { + None => Err(ParquetError::General(format!( + "Invalid column index {i}, column was not fetched" + ))), + Some(data) => { + let page_locations = self.page_locations.map(|index| index[i].clone()); + let page_reader: Box = Box::new(SerializedPageReader::new( + data.clone(), + self.metadata.column(i), + self.row_count, + page_locations, + )?); + + Ok(Box::new(ColumnChunkIterator { + reader: Some(Ok(page_reader)), + })) + } + } + } +} + +/// An in-memory column chunk +#[derive(Clone)] +enum ColumnChunkData { + /// Column chunk data representing only a subset of data pages + Sparse { + /// Length of the full column chunk + length: usize, + /// Set of data pages included in this sparse chunk. Each element is a tuple + /// of (page offset, page data) + data: Vec<(usize, Bytes)>, + }, + /// Full column chunk and its offset + Dense { offset: usize, data: Bytes }, +} + +impl ColumnChunkData { + fn get(&self, start: u64) -> Result { + match &self { + ColumnChunkData::Sparse { data, .. } => data + .binary_search_by_key(&start, |(offset, _)| *offset as u64) + .map(|idx| data[idx].1.clone()) + .map_err(|_| { + ParquetError::General(format!( + "Invalid offset in sparse column chunk data: {start}" + )) + }), + ColumnChunkData::Dense { offset, data } => { + let start = start as usize - *offset; + Ok(data.slice(start..)) + } + } + } +} + +impl Length for ColumnChunkData { + fn len(&self) -> u64 { + match &self { + ColumnChunkData::Sparse { length, .. } => *length as u64, + ColumnChunkData::Dense { data, .. 
} => data.len() as u64, + } + } +} + +impl ChunkReader for ColumnChunkData { + type T = bytes::buf::Reader; + + fn get_read(&self, start: u64) -> Result { + Ok(self.get(start)?.reader()) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + Ok(self.get(start)?.slice(..length)) + } +} + +/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`] +struct ColumnChunkIterator { + reader: Option>>, +} + +impl Iterator for ColumnChunkIterator { + type Item = Result>; + + fn next(&mut self) -> Option { + self.reader.take() + } +} + +impl PageIterator for ColumnChunkIterator {} diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index c9621249212c..d7cb13e5121b 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -99,6 +99,15 @@ impl TestEnv { } } + /// Returns a new env with specific `data_home` for test. + pub fn with_data_home(data_home: TempDir) -> TestEnv { + TestEnv { + data_home, + logstore: None, + object_store: None, + } + } + pub fn get_logstore(&self) -> Option> { self.logstore.clone() } diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 49bf01464d4a..9d1d055ef2c5 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -7,6 +7,8 @@ license.workspace = true [dependencies] async-trait = "0.1" bytes = "1.4" +common-error.workspace = true +common-macro.workspace = true common-runtime.workspace = true common-telemetry.workspace = true futures.workspace = true @@ -17,6 +19,7 @@ opendal = { version = "0.40", features = [ "layers-tracing", "layers-metrics", ] } +snafu.workspace = true uuid.workspace = true [dev-dependencies] diff --git a/src/object-store/src/error.rs b/src/object-store/src/error.rs new file mode 100644 index 000000000000..8ea360d11683 --- /dev/null +++ b/src/object-store/src/error.rs @@ -0,0 +1,45 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use snafu::{Location, Snafu}; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display("Default storage not found: {}", default_object_store))] + DefaultStorageNotFound { + location: Location, + default_object_store: String, + }, +} + +pub type Result = std::result::Result; + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + match self { + Error::DefaultStorageNotFound { .. 
} => StatusCode::InvalidArguments, + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/object-store/src/lib.rs b/src/object-store/src/lib.rs index f1bf27846668..9623ef9a4ec6 100644 --- a/src/object-store/src/lib.rs +++ b/src/object-store/src/lib.rs @@ -19,7 +19,9 @@ pub use opendal::{ Operator as ObjectStore, Reader, Result, Writer, }; +pub mod error; pub mod layers; +pub mod manager; mod metrics; pub mod test_util; pub mod util; diff --git a/src/object-store/src/manager.rs b/src/object-store/src/manager.rs new file mode 100644 index 000000000000..d7cb323057cf --- /dev/null +++ b/src/object-store/src/manager.rs @@ -0,0 +1,107 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use snafu::OptionExt; + +use crate::error::{DefaultStorageNotFoundSnafu, Result}; +use crate::ObjectStore; + +/// Manages multiple object stores so that users can configure a storage for each table. +/// This struct certainly have one default object store, and can have zero or more custom object stores. +pub struct ObjectStoreManager { + stores: HashMap, + default_object_store: ObjectStore, +} + +impl ObjectStoreManager { + /// Creates a new manager with specific object stores. Returns an error if `stores` doesn't contain the default object store. + pub fn try_new( + stores: HashMap, + default_object_store: &str, + ) -> Result { + let default_object_store = stores + .get(default_object_store) + .context(DefaultStorageNotFoundSnafu { + default_object_store, + })? + .clone(); + Ok(ObjectStoreManager { + stores, + default_object_store, + }) + } + + pub fn find(&self, name: &str) -> Option<&ObjectStore> { + self.stores.get(name) + } + + pub fn default_object_store(&self) -> &ObjectStore { + &self.default_object_store + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use common_test_util::temp_dir::{create_temp_dir, TempDir}; + + use super::ObjectStoreManager; + use crate::error::Error; + use crate::services::Fs as Builder; + use crate::ObjectStore; + + fn new_object_store(dir: &TempDir) -> ObjectStore { + let store_dir = dir.path().to_str().unwrap(); + let mut builder = Builder::default(); + let _ = builder.root(store_dir); + ObjectStore::new(builder).unwrap().finish() + } + + #[test] + fn test_new_returns_err_when_global_store_not_exist() { + let dir = create_temp_dir("new"); + let object_store = new_object_store(&dir); + let stores: HashMap = vec![ + ("File".to_string(), object_store.clone()), + ("S3".to_string(), object_store.clone()), + ] + .into_iter() + .collect(); + + assert!(matches!( + ObjectStoreManager::try_new(stores, "Gcs"), + Err(Error::DefaultStorageNotFound { .. 
}) + )); + } + + #[test] + fn test_new_returns_ok() { + let dir = create_temp_dir("new"); + let object_store = new_object_store(&dir); + let stores: HashMap = vec![ + ("File".to_string(), object_store.clone()), + ("S3".to_string(), object_store.clone()), + ] + .into_iter() + .collect(); + let object_store_manager = ObjectStoreManager::try_new(stores, "File").unwrap(); + assert_eq!(object_store_manager.stores.len(), 2); + assert!(object_store_manager.find("File").is_some()); + assert!(object_store_manager.find("S3").is_some()); + assert!(object_store_manager.find("Gcs").is_none()); + } +} diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index cb9963ced7f3..07c730887bf9 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -556,11 +556,12 @@ impl ErrorExt for Error { Error::TableNotFound { .. } => StatusCode::TableNotFound, - Error::JoinTask { .. } - | Error::BuildParquetRecordBatchStream { .. } - | Error::ReadDfRecordBatch { .. } + Error::JoinTask { .. } => StatusCode::Internal, + + Error::BuildParquetRecordBatchStream { .. } | Error::BuildFileStream { .. } | Error::WriteStreamToFile { .. } + | Error::ReadDfRecordBatch { .. } | Error::Unexpected { .. } => StatusCode::Unexpected, Error::Catalog { source, .. } => source.status_code(), diff --git a/src/plugins/src/frontend.rs b/src/plugins/src/frontend.rs index 7ed8e96ecd3b..f6849b8e3d9e 100644 --- a/src/plugins/src/frontend.rs +++ b/src/plugins/src/frontend.rs @@ -18,7 +18,7 @@ use frontend::error::{IllegalAuthConfigSnafu, Result}; use frontend::frontend::FrontendOptions; use snafu::ResultExt; -pub async fn setup_frontend_plugins(opts: &mut FrontendOptions) -> Result { +pub async fn setup_frontend_plugins(opts: &FrontendOptions) -> Result { let plugins = Plugins::new(); if let Some(user_provider) = opts.user_provider.as_ref() { diff --git a/src/promql/Cargo.toml b/src/promql/Cargo.toml index 990197a34c4b..a1fe7f4510b7 100644 --- a/src/promql/Cargo.toml +++ b/src/promql/Cargo.toml @@ -12,6 +12,7 @@ catalog = { workspace = true } common-catalog = { workspace = true } common-error = { workspace = true } common-macro = { workspace = true } +common-recordbatch = { workspace = true } common-telemetry = { workspace = true } datafusion.workspace = true datatypes = { workspace = true } diff --git a/src/promql/src/error.rs b/src/promql/src/error.rs index 01f48c7d2478..31c44e5715e8 100644 --- a/src/promql/src/error.rs +++ b/src/promql/src/error.rs @@ -109,6 +109,9 @@ pub enum Error { #[snafu(display("Expect a metric matcher, but not found"))] NoMetricMatcher { location: Location }, + + #[snafu(display("Invalid function argument for {}", fn_name))] + FunctionInvalidArgument { fn_name: String, location: Location }, } impl ErrorExt for Error { @@ -124,7 +127,8 @@ impl ErrorExt for Error { | ExpectRangeSelector { .. } | ZeroRangeSelector { .. } | ColumnNotFound { .. } - | Deserialize { .. } => StatusCode::InvalidArguments, + | Deserialize { .. } + | FunctionInvalidArgument { .. } => StatusCode::InvalidArguments, UnknownTable { .. } | DataFusionPlanning { .. } diff --git a/src/promql/src/extension_plan.rs b/src/promql/src/extension_plan.rs index 1fb6c6b1ee3e..49a9199bf0cc 100644 --- a/src/promql/src/extension_plan.rs +++ b/src/promql/src/extension_plan.rs @@ -13,6 +13,7 @@ // limitations under the License. 
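A hedged usage sketch of ObjectStoreManager mirroring the tests above (store names and the fs/s3 store variables are illustrative):

let stores: HashMap<String, ObjectStore> = [
    ("File".to_string(), fs_object_store),
    ("S3".to_string(), s3_object_store),
]
.into_iter()
.collect();
let manager = ObjectStoreManager::try_new(stores, "File")?; // errors if "File" is missing
let _default = manager.default_object_store();
assert!(manager.find("S3").is_some());
assert!(manager.find("Gcs").is_none());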
mod empty_metric; +mod histogram_fold; mod instant_manipulate; mod normalize; mod planner; @@ -21,6 +22,7 @@ mod series_divide; use datafusion::arrow::datatypes::{ArrowPrimitiveType, TimestampMillisecondType}; pub use empty_metric::{build_special_time_expr, EmptyMetric, EmptyMetricExec, EmptyMetricStream}; +pub use histogram_fold::{HistogramFold, HistogramFoldExec, HistogramFoldStream}; pub use instant_manipulate::{InstantManipulate, InstantManipulateExec, InstantManipulateStream}; pub use normalize::{SeriesNormalize, SeriesNormalizeExec, SeriesNormalizeStream}; pub use planner::PromExtensionPlanner; diff --git a/src/promql/src/extension_plan/histogram_fold.rs b/src/promql/src/extension_plan/histogram_fold.rs new file mode 100644 index 000000000000..a4fb2b315fe9 --- /dev/null +++ b/src/promql/src/extension_plan/histogram_fold.rs @@ -0,0 +1,916 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::task::Poll; +use std::time::Instant; + +use common_recordbatch::RecordBatch as GtRecordBatch; +use common_telemetry::warn; +use datafusion::arrow::array::AsArray; +use datafusion::arrow::compute::{self, concat_batches, SortOptions}; +use datafusion::arrow::datatypes::{DataType, Float64Type, SchemaRef}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::{DFSchema, DFSchemaRef}; +use datafusion::error::{DataFusionError, Result as DataFusionResult}; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}; +use datafusion::physical_plan::expressions::{CastExpr as PhyCast, Column as PhyColumn}; +use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, + RecordBatchStream, SendableRecordBatchStream, Statistics, +}; +use datafusion::prelude::{Column, Expr}; +use datatypes::prelude::{ConcreteDataType, DataType as GtDataType}; +use datatypes::schema::Schema as GtSchema; +use datatypes::value::{OrderedF64, ValueRef}; +use datatypes::vectors::MutableVector; +use futures::{ready, Stream, StreamExt}; + +/// `HistogramFold` will fold the conventional (non-native) histogram ([1]) for later +/// computing. Specifically, it will transform the `le` and `field` column into a complex +/// type, and samples on other tag columns: +/// - `le` will become a [ListArray] of [f64]. With each bucket bound parsed +/// - `field` will become a [ListArray] of [f64] +/// - other columns will be sampled every `bucket_num` element, but their types won't change. +/// +/// Due to the folding or sampling, the output rows number will become `input_rows` / `bucket_num`. +/// +/// # Requirement +/// - Input should be sorted on `, le ASC, ts`. +/// - The value set of `le` should be same. 
I.e., buckets of every series should be same. +/// +/// [1]: https://prometheus.io/docs/concepts/metric_types/#histogram +#[derive(Debug, PartialEq, Hash, Eq)] +pub struct HistogramFold { + /// Name of the `le` column. It's a special column in prometheus + /// for implementing conventional histogram. It's a string column + /// with "literal" float value, like "+Inf", "0.001" etc. + le_column: String, + ts_column: String, + input: LogicalPlan, + field_column: String, + quantile: OrderedF64, + output_schema: DFSchemaRef, +} + +impl UserDefinedLogicalNodeCore for HistogramFold { + fn name(&self) -> &str { + Self::name() + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "HistogramFold: le={}, field={}, quantile={}", + self.le_column, self.field_column, self.quantile + ) + } + + fn from_template(&self, _exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + Self { + le_column: self.le_column.clone(), + ts_column: self.ts_column.clone(), + input: inputs[0].clone(), + field_column: self.field_column.clone(), + quantile: self.quantile, + // This method cannot return error. Otherwise we should re-calculate + // the output schema + output_schema: self.output_schema.clone(), + } + } +} + +impl HistogramFold { + pub fn new( + le_column: String, + field_column: String, + ts_column: String, + quantile: f64, + input: LogicalPlan, + ) -> DataFusionResult { + let input_schema = input.schema(); + Self::check_schema(input_schema, &le_column, &field_column, &ts_column)?; + let output_schema = Self::convert_schema(input_schema, &le_column)?; + Ok(Self { + le_column, + ts_column, + input, + field_column, + quantile: quantile.into(), + output_schema, + }) + } + + pub const fn name() -> &'static str { + "HistogramFold" + } + + fn check_schema( + input_schema: &DFSchemaRef, + le_column: &str, + field_column: &str, + ts_column: &str, + ) -> DataFusionResult<()> { + let check_column = |col| { + if !input_schema.has_column_with_unqualified_name(col) { + return Err(DataFusionError::SchemaError( + datafusion::common::SchemaError::FieldNotFound { + field: Box::new(Column::new(None::, col)), + valid_fields: input_schema + .fields() + .iter() + .map(|f| f.qualified_column()) + .collect(), + }, + )); + } else { + Ok(()) + } + }; + + check_column(le_column)?; + check_column(ts_column)?; + check_column(field_column) + } + + pub fn to_execution_plan(&self, exec_input: Arc) -> Arc { + let input_schema = self.input.schema(); + // safety: those fields are checked in `check_schema()` + let le_column_index = input_schema + .index_of_column_by_name(None, &self.le_column) + .unwrap() + .unwrap(); + let field_column_index = input_schema + .index_of_column_by_name(None, &self.field_column) + .unwrap() + .unwrap(); + let ts_column_index = input_schema + .index_of_column_by_name(None, &self.ts_column) + .unwrap() + .unwrap(); + + Arc::new(HistogramFoldExec { + le_column_index, + field_column_index, + ts_column_index, + input: exec_input, + quantile: self.quantile.into(), + output_schema: Arc::new(self.output_schema.as_ref().into()), + metric: ExecutionPlanMetricsSet::new(), + }) + } + + /// Transform the schema + /// + /// - `le` will be removed + fn convert_schema( + input_schema: &DFSchemaRef, + le_column: &str, + ) -> DataFusionResult { + let mut fields = input_schema.fields().clone(); + // safety: those 
fields are checked in `check_schema()` + let le_column_idx = input_schema + .index_of_column_by_name(None, le_column)? + .unwrap(); + fields.remove(le_column_idx); + + Ok(Arc::new(DFSchema::new_with_metadata( + fields, + HashMap::new(), + )?)) + } +} + +#[derive(Debug)] +pub struct HistogramFoldExec { + /// Index for `le` column in the schema of input. + le_column_index: usize, + input: Arc, + output_schema: SchemaRef, + /// Index for field column in the schema of input. + field_column_index: usize, + ts_column_index: usize, + quantile: f64, + metric: ExecutionPlanMetricsSet, +} + +impl ExecutionPlan for HistogramFoldExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.output_schema.clone() + } + + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + self.input.output_ordering() + } + + fn required_input_ordering(&self) -> Vec>> { + let mut cols = self + .tag_col_exprs() + .into_iter() + .map(|expr| PhysicalSortRequirement { + expr, + options: None, + }) + .collect::>(); + // add le ASC + cols.push(PhysicalSortRequirement { + expr: Arc::new(PhyCast::new( + Arc::new(PhyColumn::new( + self.input.schema().field(self.le_column_index).name(), + self.le_column_index, + )), + DataType::Float64, + None, + )), + options: Some(SortOptions { + descending: false, // +INF in the last + nulls_first: false, // not nullable + }), + }); + // add ts + cols.push(PhysicalSortRequirement { + expr: Arc::new(PhyColumn::new( + self.input.schema().field(self.ts_column_index).name(), + self.ts_column_index, + )), + options: None, + }); + + vec![Some(cols)] + } + + fn required_input_distribution(&self) -> Vec { + // partition on all tag columns, i.e., non-le, non-ts and non-field columns + vec![Distribution::HashPartitioned(self.tag_col_exprs())] + } + + fn maintains_input_order(&self) -> Vec { + vec![true; self.children().len()] + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + // cannot change schema with this method + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DataFusionResult> { + assert!(!children.is_empty()); + Ok(Arc::new(Self { + input: children[0].clone(), + metric: self.metric.clone(), + le_column_index: self.le_column_index, + ts_column_index: self.ts_column_index, + quantile: self.quantile, + output_schema: self.output_schema.clone(), + field_column_index: self.field_column_index, + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DataFusionResult { + let baseline_metric = BaselineMetrics::new(&self.metric, partition); + + let batch_size = context.session_config().batch_size(); + let input = self.input.execute(partition, context)?; + let output_schema = self.output_schema.clone(); + + let mut normal_indices = (0..input.schema().fields().len()).collect::>(); + normal_indices.remove(&self.field_column_index); + normal_indices.remove(&self.le_column_index); + Ok(Box::pin(HistogramFoldStream { + le_column_index: self.le_column_index, + field_column_index: self.field_column_index, + quantile: self.quantile, + normal_indices: normal_indices.into_iter().collect(), + bucket_size: None, + input_buffer: vec![], + input, + output_schema, + metric: baseline_metric, + batch_size, + input_buffered_rows: 0, + output_buffer: HistogramFoldStream::empty_output_buffer( + &self.output_schema, + self.le_column_index, + )?, + output_buffered_rows: 0, + })) + } + + fn metrics(&self) -> Option { + 
Some(self.metric.clone_inner()) + } + + fn statistics(&self) -> Statistics { + Statistics { + num_rows: None, + total_byte_size: None, + column_statistics: None, + is_exact: false, + } + } +} + +impl HistogramFoldExec { + /// Return all the [PhysicalExpr] of tag columns in order. + /// + /// Tag columns are all columns except `le`, `field` and `ts` columns. + pub fn tag_col_exprs(&self) -> Vec> { + self.input + .schema() + .fields() + .iter() + .enumerate() + .filter_map(|(idx, field)| { + if idx == self.le_column_index + || idx == self.field_column_index + || idx == self.ts_column_index + { + None + } else { + Some(Arc::new(PhyColumn::new(field.name(), idx)) as _) + } + }) + .collect() + } +} + +impl DisplayAs for HistogramFoldExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!( + f, + "HistogramFoldExec: le=@{}, field=@{}, quantile={}", + self.le_column_index, self.field_column_index, self.quantile + ) + } + } + } +} + +pub struct HistogramFoldStream { + // internal states + le_column_index: usize, + field_column_index: usize, + quantile: f64, + /// Columns need not folding. This indices is based on input schema + normal_indices: Vec, + bucket_size: Option, + /// Expected output batch size + batch_size: usize, + output_schema: SchemaRef, + + // buffers + input_buffer: Vec, + input_buffered_rows: usize, + output_buffer: Vec>, + output_buffered_rows: usize, + + // runtime things + input: SendableRecordBatchStream, + metric: BaselineMetrics, +} + +impl RecordBatchStream for HistogramFoldStream { + fn schema(&self) -> SchemaRef { + self.output_schema.clone() + } +} + +impl Stream for HistogramFoldStream { + type Item = DataFusionResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let poll = loop { + match ready!(self.input.poll_next_unpin(cx)) { + Some(batch) => { + let batch = batch?; + let timer = Instant::now(); + let Some(result) = self.fold_input(batch)? else { + self.metric.elapsed_compute().add_elapsed(timer); + continue; + }; + self.metric.elapsed_compute().add_elapsed(timer); + break Poll::Ready(Some(result)); + } + None => break Poll::Ready(self.take_output_buf()?.map(Ok)), + } + }; + self.metric.record_poll(poll) + } +} + +impl HistogramFoldStream { + /// The inner most `Result` is for `poll_next()` + pub fn fold_input( + &mut self, + input: RecordBatch, + ) -> DataFusionResult>> { + let Some(bucket_num) = self.calculate_bucket_num(&input)? else { + return Ok(None); + }; + + if self.input_buffered_rows + input.num_rows() < bucket_num { + // not enough rows to fold + self.push_input_buf(input); + return Ok(None); + } + + self.fold_buf(bucket_num, input)?; + if self.output_buffered_rows >= self.batch_size { + return Ok(self.take_output_buf()?.map(Ok)); + } + + Ok(None) + } + + /// Generate a group of empty [MutableVector]s from the output schema. + /// + /// For simplicity, this method will insert a placeholder for `le`. So that + /// the output buffers has the same schema with input. This placeholder needs + /// to be removed before returning the output batch. 
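// A worked example of the fold performed by `fold_buf()` and `evaluate_row()`
// below, using one series from the `fold_overall` test at the end of this file:
// buckets le = [0.001, 0.1, 10, 1000, +Inf] with cumulative counters
// [0, 20, 60, 70, 100] and quantile = 0.4. The target rank is 100 * 0.4 = 40,
// which falls into the (0.1, 10] bucket, so the result is linearly interpolated:
//     0.1 + (10 - 0.1) / (60 - 20) * (40 - 20) = 5.05
// matching the `5.05` row asserted in that test.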
+ pub fn empty_output_buffer( + schema: &SchemaRef, + le_column_index: usize, + ) -> DataFusionResult>> { + let mut builders = Vec::with_capacity(schema.fields().len() + 1); + for field in schema.fields() { + let concrete_datatype = ConcreteDataType::try_from(field.data_type()).unwrap(); + let mutable_vector = concrete_datatype.create_mutable_vector(0); + builders.push(mutable_vector); + } + builders.insert( + le_column_index, + ConcreteDataType::float64_datatype().create_mutable_vector(0), + ); + + Ok(builders) + } + + fn calculate_bucket_num(&mut self, batch: &RecordBatch) -> DataFusionResult> { + if let Some(size) = self.bucket_size { + return Ok(Some(size)); + } + + let inf_pos = self.find_positive_inf(batch)?; + if inf_pos == batch.num_rows() { + // no positive inf found, append to buffer and wait for next batch + self.push_input_buf(batch.clone()); + return Ok(None); + } + + // else we found the positive inf. + // calculate the bucket size + let bucket_size = inf_pos + self.input_buffered_rows + 1; + Ok(Some(bucket_size)) + } + + /// Fold record batches from input buffer and put to output buffer + fn fold_buf(&mut self, bucket_num: usize, input: RecordBatch) -> DataFusionResult<()> { + self.push_input_buf(input); + // TODO(ruihang): this concat is avoidable. + let batch = concat_batches(&self.input.schema(), self.input_buffer.drain(..).as_ref())?; + let mut remaining_rows = self.input_buffered_rows; + let mut cursor = 0; + + let gt_schema = GtSchema::try_from(self.input.schema()).unwrap(); + let batch = GtRecordBatch::try_from_df_record_batch(Arc::new(gt_schema), batch).unwrap(); + + while remaining_rows >= bucket_num { + // "sample" normal columns + for normal_index in &self.normal_indices { + let val = batch.column(*normal_index).get(cursor); + self.output_buffer[*normal_index].push_value_ref(val.as_value_ref()); + } + // "fold" `le` and field columns + let le_array = batch.column(self.le_column_index); + let field_array = batch.column(self.field_column_index); + let mut bucket = vec![]; + let mut counters = vec![]; + for bias in 0..bucket_num { + let le_str_val = le_array.get(cursor + bias); + let le_str_val_ref = le_str_val.as_value_ref(); + let le_str = le_str_val_ref + .as_string() + .unwrap() + .expect("le column should not be nullable"); + let le = le_str.parse::().unwrap(); + bucket.push(le); + + let counter = field_array + .get(cursor + bias) + .as_value_ref() + .as_f64() + .unwrap() + .expect("field column should not be nullable"); + counters.push(counter); + } + let result = Self::evaluate_row(self.quantile, &bucket, &counters)?; + self.output_buffer[self.field_column_index].push_value_ref(ValueRef::from(result)); + cursor += bucket_num; + remaining_rows -= bucket_num; + self.output_buffered_rows += 1; + } + + let remaining_input_batch = batch.into_df_record_batch().slice(cursor, remaining_rows); + self.input_buffered_rows = remaining_input_batch.num_rows(); + self.input_buffer.push(remaining_input_batch); + + Ok(()) + } + + fn push_input_buf(&mut self, batch: RecordBatch) { + self.input_buffered_rows += batch.num_rows(); + self.input_buffer.push(batch); + } + + /// Compute result from output buffer + fn take_output_buf(&mut self) -> DataFusionResult> { + if self.output_buffered_rows == 0 { + if self.input_buffered_rows != 0 { + warn!( + "input buffer is not empty, {} rows remaining", + self.input_buffered_rows + ); + } + return Ok(None); + } + + let mut output_buf = Self::empty_output_buffer(&self.output_schema, self.le_column_index)?; + std::mem::swap(&mut 
self.output_buffer, &mut output_buf); + let mut columns = Vec::with_capacity(output_buf.len()); + for builder in output_buf.iter_mut() { + columns.push(builder.to_vector().to_arrow_array()); + } + // remove the placeholder column for `le` + columns.remove(self.le_column_index); + + self.output_buffered_rows = 0; + RecordBatch::try_new(self.output_schema.clone(), columns) + .map(Some) + .map_err(DataFusionError::ArrowError) + } + + /// Find the first `+Inf` which indicates the end of the bucket group + /// + /// If the return value equals to batch's num_rows means the it's not found + /// in this batch + fn find_positive_inf(&self, batch: &RecordBatch) -> DataFusionResult { + // fuse this function. It should not be called when the + // bucket size is already know. + if let Some(bucket_size) = self.bucket_size { + return Ok(bucket_size); + } + let string_le_array = batch.column(self.le_column_index); + let float_le_array = compute::cast(&string_le_array, &DataType::Float64).map_err(|e| { + DataFusionError::Execution(format!( + "cannot cast {} array to float64 array: {:?}", + string_le_array.data_type(), + e + )) + })?; + let le_as_f64_array = float_le_array + .as_primitive_opt::() + .ok_or_else(|| { + DataFusionError::Execution(format!( + "expect a float64 array, but found {}", + float_le_array.data_type() + )) + })?; + for (i, v) in le_as_f64_array.iter().enumerate() { + if let Some(v) = v && v == f64::INFINITY { + return Ok(i); + } + } + + Ok(batch.num_rows()) + } + + /// Evaluate the field column and return the result + fn evaluate_row(quantile: f64, bucket: &[f64], counter: &[f64]) -> DataFusionResult { + // check bucket + if bucket.len() <= 1 { + return Ok(f64::NAN); + } + if *bucket.last().unwrap() != f64::INFINITY { + return Err(DataFusionError::Execution( + "last bucket should be +Inf".to_string(), + )); + } + if bucket.len() != counter.len() { + return Err(DataFusionError::Execution( + "bucket and counter should have the same length".to_string(), + )); + } + // check quantile + if quantile < 0.0 { + return Ok(f64::NEG_INFINITY); + } else if quantile > 1.0 { + return Ok(f64::INFINITY); + } else if quantile.is_nan() { + return Ok(f64::NAN); + } + + // check input value + debug_assert!(bucket.windows(2).all(|w| w[0] <= w[1])); + debug_assert!(counter.windows(2).all(|w| w[0] <= w[1])); + + let total = *counter.last().unwrap(); + let expected_pos = total * quantile; + let mut fit_bucket_pos = 0; + while fit_bucket_pos < bucket.len() && counter[fit_bucket_pos] < expected_pos { + fit_bucket_pos += 1; + } + if fit_bucket_pos >= bucket.len() - 1 { + Ok(bucket[bucket.len() - 2]) + } else { + let upper_bound = bucket[fit_bucket_pos]; + let upper_count = counter[fit_bucket_pos]; + let mut lower_bound = bucket[0].min(0.0); + let mut lower_count = 0.0; + if fit_bucket_pos > 0 { + lower_bound = bucket[fit_bucket_pos - 1]; + lower_count = counter[fit_bucket_pos - 1]; + } + Ok(lower_bound + + (upper_bound - lower_bound) / (upper_count - lower_count) + * (expected_pos - lower_count)) + } + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use datafusion::arrow::array::Float64Array; + use datafusion::arrow::datatypes::{Field, Schema}; + use datafusion::common::ToDFSchema; + use datafusion::physical_plan::memory::MemoryExec; + use datafusion::prelude::SessionContext; + use datatypes::arrow_array::StringArray; + + use super::*; + + fn prepare_test_data() -> MemoryExec { + let schema = Arc::new(Schema::new(vec![ + Field::new("host", DataType::Utf8, true), + Field::new("le", DataType::Utf8, 
true), + Field::new("val", DataType::Float64, true), + ])); + + // 12 items + let host_column_1 = Arc::new(StringArray::from(vec![ + "host_1", "host_1", "host_1", "host_1", "host_1", "host_1", "host_1", "host_1", + "host_1", "host_1", "host_1", "host_1", + ])) as _; + let le_column_1 = Arc::new(StringArray::from(vec![ + "0.001", "0.1", "10", "1000", "+Inf", "0.001", "0.1", "10", "1000", "+inf", "0.001", + "0.1", + ])) as _; + let val_column_1 = Arc::new(Float64Array::from(vec![ + 0_0.0, 1.0, 1.0, 5.0, 5.0, 0_0.0, 20.0, 60.0, 70.0, 100.0, 0_1.0, 1.0, + ])) as _; + + // 2 items + let host_column_2 = Arc::new(StringArray::from(vec!["host_1", "host_1"])) as _; + let le_column_2 = Arc::new(StringArray::from(vec!["10", "1000"])) as _; + let val_column_2 = Arc::new(Float64Array::from(vec![1.0, 1.0])) as _; + + // 11 items + let host_column_3 = Arc::new(StringArray::from(vec![ + "host_1", "host_2", "host_2", "host_2", "host_2", "host_2", "host_2", "host_2", + "host_2", "host_2", "host_2", + ])) as _; + let le_column_3 = Arc::new(StringArray::from(vec![ + "+INF", "0.001", "0.1", "10", "1000", "+iNf", "0.001", "0.1", "10", "1000", "+Inf", + ])) as _; + let val_column_3 = Arc::new(Float64Array::from(vec![ + 1.0, 0_0.0, 0.0, 0.0, 0.0, 0.0, 0_0.0, 1.0, 2.0, 3.0, 4.0, + ])) as _; + + let data_1 = RecordBatch::try_new( + schema.clone(), + vec![host_column_1, le_column_1, val_column_1], + ) + .unwrap(); + let data_2 = RecordBatch::try_new( + schema.clone(), + vec![host_column_2, le_column_2, val_column_2], + ) + .unwrap(); + let data_3 = RecordBatch::try_new( + schema.clone(), + vec![host_column_3, le_column_3, val_column_3], + ) + .unwrap(); + + MemoryExec::try_new(&[vec![data_1, data_2, data_3]], schema, None).unwrap() + } + + #[tokio::test] + async fn fold_overall() { + let memory_exec = Arc::new(prepare_test_data()); + let output_schema = Arc::new( + (*HistogramFold::convert_schema( + &Arc::new(memory_exec.schema().to_dfschema().unwrap()), + "le", + ) + .unwrap() + .as_ref()) + .clone() + .into(), + ); + let fold_exec = Arc::new(HistogramFoldExec { + le_column_index: 1, + field_column_index: 2, + quantile: 0.4, + ts_column_index: 9999, // not exist but doesn't matter + input: memory_exec, + output_schema, + metric: ExecutionPlanMetricsSet::new(), + }); + + let session_context = SessionContext::default(); + let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx()) + .await + .unwrap(); + let result_literal = datatypes::arrow::util::pretty::pretty_format_batches(&result) + .unwrap() + .to_string(); + + let expected = String::from( + "+--------+-------------------+ +| host | val | ++--------+-------------------+ +| host_1 | 257.5 | +| host_1 | 5.05 | +| host_1 | 0.0004 | +| host_2 | NaN | +| host_2 | 6.040000000000001 | ++--------+-------------------+", + ); + assert_eq!(result_literal, expected); + } + + #[test] + fn confirm_schema() { + let input_schema = Schema::new(vec![ + Field::new("host", DataType::Utf8, true), + Field::new("le", DataType::Utf8, true), + Field::new("val", DataType::Float64, true), + ]) + .to_dfschema_ref() + .unwrap(); + let expected_output_schema = Schema::new(vec![ + Field::new("host", DataType::Utf8, true), + Field::new("val", DataType::Float64, true), + ]) + .to_dfschema_ref() + .unwrap(); + + let actual = HistogramFold::convert_schema(&input_schema, "le").unwrap(); + assert_eq!(actual, expected_output_schema) + } + + #[test] + fn evaluate_row_normal_case() { + let bucket = [0.0, 1.0, 2.0, 3.0, 4.0, f64::INFINITY]; + + #[derive(Debug)] + struct 
Case { + quantile: f64, + counters: Vec, + expected: f64, + } + + let cases = [ + Case { + quantile: 0.9, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: 4.0, + }, + Case { + quantile: 0.89, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: 4.0, + }, + Case { + quantile: 0.78, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: 3.9, + }, + Case { + quantile: 0.5, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: 2.5, + }, + Case { + quantile: 0.5, + counters: vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + expected: f64::NAN, + }, + Case { + quantile: 1.0, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: 4.0, + }, + Case { + quantile: 0.0, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: f64::NAN, + }, + Case { + quantile: 1.1, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: f64::INFINITY, + }, + Case { + quantile: -1.0, + counters: vec![0.0, 10.0, 20.0, 30.0, 40.0, 50.0], + expected: f64::NEG_INFINITY, + }, + ]; + + for case in cases { + let actual = + HistogramFoldStream::evaluate_row(case.quantile, &bucket, &case.counters).unwrap(); + assert_eq!( + format!("{actual}"), + format!("{}", case.expected), + "{:?}", + case + ); + } + } + + #[test] + #[should_panic] + fn evaluate_out_of_order_input() { + let bucket = [0.0, 1.0, 2.0, 3.0, 4.0, f64::INFINITY]; + let counters = [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]; + HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); + } + + #[test] + fn evaluate_wrong_bucket() { + let bucket = [0.0, 1.0, 2.0, 3.0, 4.0, f64::INFINITY, 5.0]; + let counters = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + let result = HistogramFoldStream::evaluate_row(0.5, &bucket, &counters); + assert!(result.is_err()); + } + + #[test] + fn evaluate_small_fraction() { + let bucket = [0.0, 2.0, 4.0, 6.0, f64::INFINITY]; + let counters = [0.0, 1.0 / 300.0, 2.0 / 300.0, 0.01, 0.01]; + let result = HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); + assert_eq!(3.0, result); + } +} diff --git a/src/promql/src/extension_plan/planner.rs b/src/promql/src/extension_plan/planner.rs index 1198108012c4..7798c9b32193 100644 --- a/src/promql/src/extension_plan/planner.rs +++ b/src/promql/src/extension_plan/planner.rs @@ -21,6 +21,7 @@ use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNode}; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner}; +use super::HistogramFold; use crate::extension_plan::{ EmptyMetric, InstantManipulate, RangeManipulate, SeriesDivide, SeriesNormalize, }; @@ -47,6 +48,8 @@ impl ExtensionPlanner for PromExtensionPlanner { Ok(Some(node.to_execution_plan(physical_inputs[0].clone()))) } else if let Some(node) = node.as_any().downcast_ref::() { Ok(Some(node.to_execution_plan(session_state, planner)?)) + } else if let Some(node) = node.as_any().downcast_ref::() { + Ok(Some(node.to_execution_plan(physical_inputs[0].clone()))) } else { Ok(None) } diff --git a/src/promql/src/planner.rs b/src/promql/src/planner.rs index c7140659a878..dcadf8c4ebaf 100644 --- a/src/promql/src/planner.rs +++ b/src/promql/src/planner.rs @@ -44,14 +44,14 @@ use table::table::adapter::DfTableProviderAdapter; use crate::error::{ CatalogSnafu, ColumnNotFoundSnafu, DataFusionPlanningSnafu, ExpectExprSnafu, - ExpectRangeSelectorSnafu, MultipleMetricMatchersSnafu, MultipleVectorSnafu, - NoMetricMatcherSnafu, Result, TableNameNotFoundSnafu, TimeIndexNotFoundSnafu, - UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, 
UnknownTableSnafu, UnsupportedExprSnafu, - ValueNotFoundSnafu, ZeroRangeSelectorSnafu, + ExpectRangeSelectorSnafu, FunctionInvalidArgumentSnafu, MultipleMetricMatchersSnafu, + MultipleVectorSnafu, NoMetricMatcherSnafu, Result, TableNameNotFoundSnafu, + TimeIndexNotFoundSnafu, UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, UnknownTableSnafu, + UnsupportedExprSnafu, ValueNotFoundSnafu, ZeroRangeSelectorSnafu, }; use crate::extension_plan::{ - build_special_time_expr, EmptyMetric, InstantManipulate, Millisecond, RangeManipulate, - SeriesDivide, SeriesNormalize, + build_special_time_expr, EmptyMetric, HistogramFold, InstantManipulate, Millisecond, + RangeManipulate, SeriesDivide, SeriesNormalize, }; use crate::functions::{ AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters, IDelta, @@ -61,6 +61,10 @@ use crate::functions::{ /// `time()` function in PromQL. const SPECIAL_TIME_FUNCTION: &str = "time"; +/// `histogram_quantile` function in PromQL +const SPECIAL_HISTOGRAM_QUANTILE: &str = "histogram_quantile"; +/// `le` column for conventional histogram. +const LE_COLUMN_NAME: &str = "le"; const DEFAULT_TIME_INDEX_COLUMN: &str = "time"; @@ -108,6 +112,11 @@ impl PromPlannerContext { self.field_column_matcher = None; self.range = None; } + + /// Check if `le` is present in tag columns + fn has_le_tag(&self) -> bool { + self.tag_columns.iter().any(|c| c.eq(&LE_COLUMN_NAME)) + } } pub struct PromPlanner { @@ -440,6 +449,58 @@ impl PromPlanner { })); } + if func.name == SPECIAL_HISTOGRAM_QUANTILE { + if args.args.len() != 2 { + return FunctionInvalidArgumentSnafu { + fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(), + } + .fail(); + } + let phi = Self::try_build_float_literal(&args.args[0]).with_context(|| { + FunctionInvalidArgumentSnafu { + fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(), + } + })?; + let input = args.args[1].as_ref().clone(); + let input_plan = self.prom_expr_to_plan(input).await?; + + if !self.ctx.has_le_tag() { + common_telemetry::info!("[DEBUG] valid tags: {:?}", self.ctx.tag_columns); + return ColumnNotFoundSnafu { + col: LE_COLUMN_NAME.to_string(), + } + .fail(); + } + let time_index_column = + self.ctx.time_index_column.clone().with_context(|| { + TimeIndexNotFoundSnafu { + table: self.ctx.table_name.clone().unwrap_or_default(), + } + })?; + // FIXME(ruihang): support multi fields + let field_column = self + .ctx + .field_columns + .first() + .with_context(|| FunctionInvalidArgumentSnafu { + fn_name: SPECIAL_HISTOGRAM_QUANTILE.to_string(), + })? + .clone(); + + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new( + HistogramFold::new( + LE_COLUMN_NAME.to_string(), + field_column, + time_index_column, + phi, + input_plan, + ) + .context(DataFusionPlanningSnafu)?, + ), + })); + } + let args = self.create_function_args(&args.args)?; let input = self .prom_expr_to_plan(args.input.with_context(|| ExpectExprSnafu { @@ -1183,6 +1244,25 @@ impl PromPlanner { } } + /// Try to build a [f64] from [PromExpr]. + fn try_build_float_literal(expr: &PromExpr) -> Option { + match expr { + PromExpr::NumberLiteral(NumberLiteral { val }) => Some(*val), + PromExpr::Paren(ParenExpr { expr }) => Self::try_build_float_literal(expr), + PromExpr::Unary(UnaryExpr { expr, .. 
}) => { + Self::try_build_float_literal(expr).map(|f| -f) + } + PromExpr::StringLiteral(_) + | PromExpr::Binary(_) + | PromExpr::VectorSelector(_) + | PromExpr::MatrixSelector(_) + | PromExpr::Call(_) + | PromExpr::Extension(_) + | PromExpr::Aggregate(_) + | PromExpr::Subquery(_) => None, + } + } + /// Return a lambda to build binary expression from token. /// Because some binary operator are function in DataFusion like `atan2` or `^`. #[allow(clippy::type_complexity)] diff --git a/src/query/src/error.rs b/src/query/src/error.rs index ff2842f20820..7999b4b49871 100644 --- a/src/query/src/error.rs +++ b/src/query/src/error.rs @@ -316,9 +316,13 @@ impl ErrorExt for Error { ParseSql { source, .. } => source.status_code(), CreateRecordBatch { source, .. } => source.status_code(), QueryExecution { source, .. } | QueryPlan { source, .. } => source.status_code(), - DataFusion { .. } | MissingTimestampColumn { .. } | RoutePartition { .. } => { - StatusCode::Internal - } + DataFusion { error, .. } => match error { + DataFusionError::Internal(_) => StatusCode::Internal, + DataFusionError::NotImplemented(_) => StatusCode::Unsupported, + DataFusionError::Plan(_) => StatusCode::PlanQuery, + _ => StatusCode::EngineExecuteQuery, + }, + MissingTimestampColumn { .. } | RoutePartition { .. } => StatusCode::EngineExecuteQuery, Sql { source, .. } => source.status_code(), PlanSql { .. } => StatusCode::PlanQuery, ConvertSqlType { source, .. } | ConvertSqlValue { source, .. } => source.status_code(), diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 3efc2d3c65fa..db9724aaec51 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -392,7 +392,6 @@ impl ErrorExt for Error { Internal { .. } | InternalIo { .. } | TokioIo { .. } - | CollectRecordbatch { .. } | StartHttp { .. } | StartGrpc { .. } | AlreadyStarted { .. } @@ -403,6 +402,8 @@ impl ErrorExt for Error { | GrpcReflectionService { .. } | BuildHttpResponse { .. } => StatusCode::Internal, + CollectRecordbatch { .. } => StatusCode::EngineExecuteQuery, + InsertScript { source, .. } | ExecuteScript { source, .. } | ExecuteQuery { source, .. } diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index f00d4c07a271..884f79d48d85 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -660,6 +660,7 @@ impl HttpServer { fn route_otlp(&self, otlp_handler: OpenTelemetryProtocolHandlerRef) -> Router { Router::new() .route("/v1/metrics", routing::post(otlp::metrics)) + .route("/v1/traces", routing::post(otlp::traces)) .with_state(otlp_handler) } diff --git a/src/servers/src/http/opentsdb.rs b/src/servers/src/http/opentsdb.rs index c5b90b42a438..054595252ad3 100644 --- a/src/servers/src/http/opentsdb.rs +++ b/src/servers/src/http/opentsdb.rs @@ -84,17 +84,19 @@ pub async fn put( let summary = params.contains_key("summary"); let details = params.contains_key("details"); - let data_points = parse_data_points(body).await?; + let data_point_requests = parse_data_points(body).await?; + let data_points = data_point_requests + .iter() + .map(|point| point.clone().into()) + .collect::>(); let response = if !summary && !details { - for data_point in data_points.into_iter() { - if let Err(e) = opentsdb_handler.exec(&data_point.into(), ctx.clone()).await { - // Not debugging purpose, failed fast. - return error::InternalSnafu { - err_msg: e.to_string(), - } - .fail(); + if let Err(e) = opentsdb_handler.exec(data_points, ctx.clone()).await { + // Not debugging purpose, failed fast. 
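// With this change every parsed data point in the request is submitted through
// a single batched `exec(data_points, ...)` call above, so one failure rejects
// the whole request rather than failing per data point as the previous loop did.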
+ return error::InternalSnafu { + err_msg: e.to_string(), } + .fail(); } (HttpStatusCode::NO_CONTENT, Json(OpentsdbPutResponse::Empty)) } else { @@ -108,15 +110,11 @@ pub async fn put( }, }; - for data_point in data_points.into_iter() { - let result = opentsdb_handler - .exec(&data_point.clone().into(), ctx.clone()) - .await; + for (data_point, request) in data_points.into_iter().zip(data_point_requests) { + let result = opentsdb_handler.exec(vec![data_point], ctx.clone()).await; match result { - Ok(()) => response.on_success(), - Err(e) => { - response.on_failed(data_point, e); - } + Ok(affected_rows) => response.on_success(affected_rows), + Err(e) => response.on_failed(request, e), } } ( @@ -151,8 +149,8 @@ pub struct OpentsdbDebuggingResponse { } impl OpentsdbDebuggingResponse { - fn on_success(&mut self) { - self.success += 1; + fn on_success(&mut self, affected_rows: usize) { + self.success += affected_rows as i32; } fn on_failed(&mut self, datapoint: DataPointRequest, error: impl ErrorExt) { diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index 7d797d440fbf..b4ae4ea85473 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -21,6 +21,9 @@ use hyper::Body; use opentelemetry_proto::tonic::collector::metrics::v1::{ ExportMetricsServiceRequest, ExportMetricsServiceResponse, }; +use opentelemetry_proto::tonic::collector::trace::v1::{ + ExportTraceServiceRequest, ExportTraceServiceResponse, +}; use prost::Message; use session::context::QueryContextRef; use snafu::prelude::*; @@ -33,16 +36,19 @@ pub async fn metrics( State(handler): State, Extension(query_ctx): Extension, RawBody(body): RawBody, -) -> Result { +) -> Result { let _timer = timer!( - crate::metrics::METRIC_HTTP_OPENTELEMETRY_ELAPSED, + crate::metrics::METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED, &[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())] ); - let request = parse_body(body).await?; - handler.metrics(request, query_ctx).await.map(OtlpResponse) + let request = parse_metrics_body(body).await?; + handler + .metrics(request, query_ctx) + .await + .map(OtlpMetricsResponse) } -async fn parse_body(body: Body) -> Result { +async fn parse_metrics_body(body: Body) -> Result { hyper::body::to_bytes(body) .await .context(error::HyperSnafu) @@ -51,9 +57,47 @@ async fn parse_body(body: Body) -> Result { }) } -pub struct OtlpResponse(ExportMetricsServiceResponse); +pub struct OtlpMetricsResponse(ExportMetricsServiceResponse); + +impl IntoResponse for OtlpMetricsResponse { + fn into_response(self) -> axum::response::Response { + ( + [(header::CONTENT_TYPE, "application/x-protobuf")], + self.0.encode_to_vec(), + ) + .into_response() + } +} + +#[axum_macros::debug_handler] +pub async fn traces( + State(handler): State, + Extension(query_ctx): Extension, + RawBody(body): RawBody, +) -> Result { + let _timer = timer!( + crate::metrics::METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED, + &[(crate::metrics::METRIC_DB_LABEL, query_ctx.get_db_string())] + ); + let request = parse_traces_body(body).await?; + handler + .traces(request, query_ctx) + .await + .map(OtlpTracesResponse) +} + +async fn parse_traces_body(body: Body) -> Result { + hyper::body::to_bytes(body) + .await + .context(error::HyperSnafu) + .and_then(|buf| { + ExportTraceServiceRequest::decode(&buf[..]).context(error::DecodeOtlpRequestSnafu) + }) +} + +pub struct OtlpTracesResponse(ExportTraceServiceResponse); -impl IntoResponse for OtlpResponse { +impl IntoResponse for OtlpTracesResponse { fn into_response(self) -> 
axum::response::Response { ( [(header::CONTENT_TYPE, "application/x-protobuf")], diff --git a/src/servers/src/metrics.rs b/src/servers/src/metrics.rs index d9e708cfcf31..6e9aee8ad1e4 100644 --- a/src/servers/src/metrics.rs +++ b/src/servers/src/metrics.rs @@ -37,7 +37,10 @@ pub(crate) const METRIC_HTTP_INFLUXDB_WRITE_ELAPSED: &str = "servers.http_influx pub(crate) const METRIC_HTTP_PROM_STORE_WRITE_ELAPSED: &str = "servers.http_prometheus_write_elapsed"; pub(crate) const METRIC_HTTP_PROM_STORE_READ_ELAPSED: &str = "servers.http_prometheus_read_elapsed"; -pub(crate) const METRIC_HTTP_OPENTELEMETRY_ELAPSED: &str = "servers.http_otlp_elapsed"; +pub(crate) const METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED: &str = + "servers.http_otlp_metrics_elapsed"; +pub(crate) const METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED: &str = + "servers.http_otlp_traces_elapsed"; pub(crate) const METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED: &str = "servers.opentsdb_line_write_elapsed"; pub(crate) const METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED: &str = diff --git a/src/servers/src/opentsdb.rs b/src/servers/src/opentsdb.rs index 61ed84167064..07cde1e14765 100644 --- a/src/servers/src/opentsdb.rs +++ b/src/servers/src/opentsdb.rs @@ -20,16 +20,20 @@ use std::future::Future; use std::net::SocketAddr; use std::sync::Arc; +use api::v1::RowInsertRequests; use async_trait::async_trait; use common_runtime::Runtime; use common_telemetry::logging::error; use futures::StreamExt; use tokio::sync::broadcast; +use self::codec::DataPoint; use crate::error::Result; use crate::opentsdb::connection::Connection; use crate::opentsdb::handler::Handler; +use crate::prom_store::{FIELD_COLUMN_NAME, TIMESTAMP_COLUMN_NAME}; use crate::query_handler::OpentsdbProtocolHandlerRef; +use crate::row_writer::{self, MultiTableData}; use crate::server::{AbortableStream, BaseTcpServer, Server}; use crate::shutdown::Shutdown; @@ -126,3 +130,38 @@ impl Server for OpentsdbServer { OPENTSDB_SERVER } } + +pub fn data_point_to_grpc_row_insert_requests( + data_points: Vec, +) -> Result<(RowInsertRequests, usize)> { + let mut multi_table_data = MultiTableData::new(); + + for mut data_point in data_points { + let tags: Vec<(String, String)> = std::mem::take(data_point.tags_mut()); + let table_name = data_point.metric(); + let value = data_point.value(); + let timestamp = data_point.ts_millis(); + // length of tags + 2 extra columns for greptime_timestamp and the value + let num_columns = tags.len() + 2; + + let table_data = multi_table_data.get_or_default_table_data(table_name, num_columns, 1); + let mut one_row = table_data.alloc_one_row(); + + // tags + row_writer::write_tags(table_data, tags.into_iter(), &mut one_row)?; + + // value + row_writer::write_f64(table_data, FIELD_COLUMN_NAME, value, &mut one_row)?; + // timestamp + row_writer::write_ts_millis( + table_data, + TIMESTAMP_COLUMN_NAME, + Some(timestamp), + &mut one_row, + )?; + + table_data.add_row(one_row); + } + + Ok(multi_table_data.into_row_insert_requests()) +} diff --git a/src/servers/src/opentsdb/codec.rs b/src/servers/src/opentsdb/codec.rs index 163e060adece..55e160460554 100644 --- a/src/servers/src/opentsdb/codec.rs +++ b/src/servers/src/opentsdb/codec.rs @@ -19,7 +19,7 @@ use crate::error::{self, Result}; pub const OPENTSDB_TIMESTAMP_COLUMN_NAME: &str = "greptime_timestamp"; pub const OPENTSDB_FIELD_COLUMN_NAME: &str = "greptime_value"; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DataPoint { metric: String, ts_millis: i64, @@ -115,6 +115,10 @@ impl DataPoint { &self.tags } + pub fn 
tags_mut(&mut self) -> &mut Vec<(String, String)> { + &mut self.tags + } + pub fn ts_millis(&self) -> i64 { self.ts_millis } diff --git a/src/servers/src/opentsdb/handler.rs b/src/servers/src/opentsdb/handler.rs index 4cbe1731fe11..a12d54db614a 100644 --- a/src/servers/src/opentsdb/handler.rs +++ b/src/servers/src/opentsdb/handler.rs @@ -94,7 +94,7 @@ impl Handler { match DataPoint::try_create(&line) { Ok(data_point) => { let _timer = timer!(crate::metrics::METRIC_TCP_OPENTSDB_LINE_WRITE_ELAPSED); - let result = self.query_handler.exec(&data_point, ctx.clone()).await; + let result = self.query_handler.exec(vec![data_point], ctx.clone()).await; if let Err(e) = result { self.connection.write_line(e.output_msg()).await?; } @@ -128,8 +128,8 @@ mod tests { #[async_trait] impl OpentsdbProtocolHandler for DummyQueryHandler { - async fn exec(&self, data_point: &DataPoint, _ctx: QueryContextRef) -> Result<()> { - let metric = data_point.metric(); + async fn exec(&self, data_points: Vec, _ctx: QueryContextRef) -> Result { + let metric = data_points.first().unwrap().metric(); if metric == "should_failed" { return error::InternalSnafu { err_msg: "expected", @@ -137,7 +137,7 @@ mod tests { .fail(); } self.tx.send(metric.to_string()).await.unwrap(); - Ok(()) + Ok(data_points.len()) } } @@ -169,7 +169,7 @@ mod tests { .await .unwrap(); let resp = client.read_line().await.unwrap(); - assert_eq!(resp, Some("Internal error: expected".to_string())); + assert_eq!(resp, Some("Internal error: 1003".to_string())); client.write_line("get".to_string()).await.unwrap(); let resp = client.read_line().await.unwrap(); diff --git a/src/servers/src/otlp.rs b/src/servers/src/otlp.rs index 3acfaf0a881b..0f35a7b39ef0 100644 --- a/src/servers/src/otlp.rs +++ b/src/servers/src/otlp.rs @@ -12,649 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -use api::v1::{RowInsertRequests, Value}; -use common_grpc::writer::Precision; -use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; -use opentelemetry_proto::tonic::common::v1::{any_value, KeyValue}; -use opentelemetry_proto::tonic::metrics::v1::{metric, number_data_point, *}; - -use crate::error::Result; -use crate::row_writer::{self, MultiTableData, TableData}; +pub mod metrics; +pub mod plugin; +pub mod trace; const GREPTIME_TIMESTAMP: &str = "greptime_timestamp"; const GREPTIME_VALUE: &str = "greptime_value"; const GREPTIME_COUNT: &str = "greptime_count"; -/// the default column count for table writer -const APPROXIMATE_COLUMN_COUNT: usize = 8; - -/// Normalize otlp instrumentation, metric and attribute names -/// -/// -/// - since the name are case-insensitive, we transform them to lowercase for -/// better sql usability -/// - replace `.` and `-` with `_` -fn normalize_otlp_name(name: &str) -> String { - name.to_lowercase().replace(|c| c == '.' || c == '-', "_") -} - -/// Convert OpenTelemetry metrics to GreptimeDB insert requests -/// -/// See -/// -/// for data structure of OTLP metrics. 
-/// -/// Returns `InsertRequests` and total number of rows to ingest -pub fn to_grpc_insert_requests( - request: ExportMetricsServiceRequest, -) -> Result<(RowInsertRequests, usize)> { - let mut table_writer = MultiTableData::default(); - - for resource in &request.resource_metrics { - let resource_attrs = resource.resource.as_ref().map(|r| &r.attributes); - for scope in &resource.scope_metrics { - let scope_attrs = scope.scope.as_ref().map(|s| &s.attributes); - for metric in &scope.metrics { - encode_metrics(&mut table_writer, metric, resource_attrs, scope_attrs)?; - } - } - } - - Ok(table_writer.into_row_insert_requests()) -} - -fn encode_metrics( - table_writer: &mut MultiTableData, - metric: &Metric, - resource_attrs: Option<&Vec>, - scope_attrs: Option<&Vec>, -) -> Result<()> { - let name = &metric.name; - // note that we don't store description or unit, we might want to deal with - // these fields in the future. - if let Some(data) = &metric.data { - match data { - metric::Data::Gauge(gauge) => { - encode_gauge(table_writer, name, gauge, resource_attrs, scope_attrs)?; - } - metric::Data::Sum(sum) => { - encode_sum(table_writer, name, sum, resource_attrs, scope_attrs)?; - } - metric::Data::Summary(summary) => { - encode_summary(table_writer, name, summary, resource_attrs, scope_attrs)?; - } - metric::Data::Histogram(hist) => { - encode_histogram(table_writer, name, hist, resource_attrs, scope_attrs)?; - } - // TODO(sunng87) leave ExponentialHistogram for next release - metric::Data::ExponentialHistogram(_hist) => {} - } - } - - Ok(()) -} - -fn write_attributes( - writer: &mut TableData, - row: &mut Vec, - attrs: Option<&Vec>, -) -> Result<()> { - if let Some(attrs) = attrs { - let table_tags = attrs.iter().filter_map(|attr| { - if let Some(val) = attr.value.as_ref().and_then(|v| v.value.as_ref()) { - let key = normalize_otlp_name(&attr.key); - match val { - any_value::Value::StringValue(s) => Some((key, s.to_string())), - any_value::Value::IntValue(v) => Some((key, v.to_string())), - any_value::Value::DoubleValue(v) => Some((key, v.to_string())), - _ => None, // TODO(sunng87): allow different type of values - } - } else { - None - } - }); - - row_writer::write_tags(writer, table_tags, row)?; - } - Ok(()) -} - -fn write_timestamp(table: &mut TableData, row: &mut Vec, time_nano: i64) -> Result<()> { - row_writer::write_ts_precision( - table, - GREPTIME_TIMESTAMP, - Some(time_nano), - Precision::Nanosecond, - row, - ) -} - -fn write_data_point_value( - table: &mut TableData, - row: &mut Vec, - field: &str, - value: &Option, -) -> Result<()> { - match value { - Some(number_data_point::Value::AsInt(val)) => { - // we coerce all values to f64 - row_writer::write_f64(table, field, *val as f64, row)?; - } - Some(number_data_point::Value::AsDouble(val)) => { - row_writer::write_f64(table, field, *val, row)?; - } - _ => {} - } - Ok(()) -} - -fn write_tags_and_timestamp( - table: &mut TableData, - row: &mut Vec, - resource_attrs: Option<&Vec>, - scope_attrs: Option<&Vec>, - data_point_attrs: Option<&Vec>, - timestamp_nanos: i64, -) -> Result<()> { - write_attributes(table, row, resource_attrs)?; - write_attributes(table, row, scope_attrs)?; - write_attributes(table, row, data_point_attrs)?; - - write_timestamp(table, row, timestamp_nanos)?; - - Ok(()) -} - -/// encode this gauge metric -/// -/// note that there can be multiple data points in the request, it's going to be -/// stored as multiple rows -fn encode_gauge( - table_writer: &mut MultiTableData, - name: &str, - gauge: &Gauge, - 
resource_attrs: Option<&Vec>, - scope_attrs: Option<&Vec>, -) -> Result<()> { - let table = table_writer.get_or_default_table_data( - &normalize_otlp_name(name), - APPROXIMATE_COLUMN_COUNT, - gauge.data_points.len(), - ); - - for data_point in &gauge.data_points { - let mut row = table.alloc_one_row(); - write_tags_and_timestamp( - table, - &mut row, - resource_attrs, - scope_attrs, - Some(data_point.attributes.as_ref()), - data_point.time_unix_nano as i64, - )?; - - write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?; - table.add_row(row); - } - - Ok(()) -} - -/// encode this sum metric -/// -/// `aggregation_temporality` and `monotonic` are ignored for now -fn encode_sum( - table_writer: &mut MultiTableData, - name: &str, - sum: &Sum, - resource_attrs: Option<&Vec>, - scope_attrs: Option<&Vec>, -) -> Result<()> { - let table = table_writer.get_or_default_table_data( - &normalize_otlp_name(name), - APPROXIMATE_COLUMN_COUNT, - sum.data_points.len(), - ); - - for data_point in &sum.data_points { - let mut row = table.alloc_one_row(); - write_tags_and_timestamp( - table, - &mut row, - resource_attrs, - scope_attrs, - Some(data_point.attributes.as_ref()), - data_point.time_unix_nano as i64, - )?; - write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?; - table.add_row(row); - } - - Ok(()) -} - -const HISTOGRAM_LE_COLUMN: &str = "le"; - -/// Encode histogram data. This function returns 3 insert requests for 3 tables. -/// -/// The implementation has been following Prometheus histogram table format: -/// -/// - A `%metric%_bucket` table including `greptime_le` tag that stores bucket upper -/// limit, and `greptime_value` for bucket count -/// - A `%metric%_sum` table storing sum of samples -/// - A `%metric%_count` table storing count of samples. -/// -/// By its Prometheus compatibility, we hope to be able to use prometheus -/// quantile functions on this table. 
-fn encode_histogram( - table_writer: &mut MultiTableData, - name: &str, - hist: &Histogram, - resource_attrs: Option<&Vec>, - scope_attrs: Option<&Vec>, -) -> Result<()> { - let normalized_name = normalize_otlp_name(name); - - let bucket_table_name = format!("{}_bucket", normalized_name); - let sum_table_name = format!("{}_sum", normalized_name); - let count_table_name = format!("{}_count", normalized_name); - - let data_points_len = hist.data_points.len(); - // Note that the row and columns number here is approximate - let mut bucket_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len * 3); - let mut sum_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len); - let mut count_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len); - - for data_point in &hist.data_points { - let mut accumulated_count = 0; - for (idx, count) in data_point.bucket_counts.iter().enumerate() { - let mut bucket_row = bucket_table.alloc_one_row(); - write_tags_and_timestamp( - &mut bucket_table, - &mut bucket_row, - resource_attrs, - scope_attrs, - Some(data_point.attributes.as_ref()), - data_point.time_unix_nano as i64, - )?; - - if let Some(upper_bounds) = data_point.explicit_bounds.get(idx) { - row_writer::write_tag( - &mut bucket_table, - HISTOGRAM_LE_COLUMN, - upper_bounds, - &mut bucket_row, - )?; - } else if idx == data_point.explicit_bounds.len() { - // The last bucket - row_writer::write_tag( - &mut bucket_table, - HISTOGRAM_LE_COLUMN, - f64::INFINITY, - &mut bucket_row, - )?; - } - - accumulated_count += count; - row_writer::write_f64( - &mut bucket_table, - GREPTIME_VALUE, - accumulated_count as f64, - &mut bucket_row, - )?; - - bucket_table.add_row(bucket_row); - } - - if let Some(sum) = data_point.sum { - let mut sum_row = sum_table.alloc_one_row(); - write_tags_and_timestamp( - &mut sum_table, - &mut sum_row, - resource_attrs, - scope_attrs, - Some(data_point.attributes.as_ref()), - data_point.time_unix_nano as i64, - )?; - - row_writer::write_f64(&mut sum_table, GREPTIME_VALUE, sum, &mut sum_row)?; - sum_table.add_row(sum_row); - } - - let mut count_row = count_table.alloc_one_row(); - write_tags_and_timestamp( - &mut count_table, - &mut count_row, - resource_attrs, - scope_attrs, - Some(data_point.attributes.as_ref()), - data_point.time_unix_nano as i64, - )?; - - row_writer::write_f64( - &mut count_table, - GREPTIME_VALUE, - data_point.count as f64, - &mut count_row, - )?; - count_table.add_row(count_row); - } - - table_writer.add_table_data(bucket_table_name, bucket_table); - table_writer.add_table_data(sum_table_name, sum_table); - table_writer.add_table_data(count_table_name, count_table); - - Ok(()) -} - -#[allow(dead_code)] -fn encode_exponential_histogram(_name: &str, _hist: &ExponentialHistogram) -> Result<()> { - // TODO(sunng87): implement this using a prometheus compatible way - Ok(()) -} - -fn encode_summary( - table_writer: &mut MultiTableData, - name: &str, - summary: &Summary, - resource_attrs: Option<&Vec>, - scope_attrs: Option<&Vec>, -) -> Result<()> { - let table = table_writer.get_or_default_table_data( - &normalize_otlp_name(name), - APPROXIMATE_COLUMN_COUNT, - summary.data_points.len(), - ); - - for data_point in &summary.data_points { - let mut row = table.alloc_one_row(); - write_tags_and_timestamp( - table, - &mut row, - resource_attrs, - scope_attrs, - Some(data_point.attributes.as_ref()), - data_point.time_unix_nano as i64, - )?; - - for quantile in &data_point.quantile_values { - row_writer::write_f64( - table, - 
&format!("greptime_p{:02}", quantile.quantile * 100f64), - quantile.value, - &mut row, - )?; - } - - row_writer::write_f64(table, GREPTIME_COUNT, data_point.count as f64, &mut row)?; - table.add_row(row); - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use opentelemetry_proto::tonic::common::v1::any_value::Value as Val; - use opentelemetry_proto::tonic::common::v1::{AnyValue, KeyValue}; - use opentelemetry_proto::tonic::metrics::v1::number_data_point::Value; - use opentelemetry_proto::tonic::metrics::v1::summary_data_point::ValueAtQuantile; - use opentelemetry_proto::tonic::metrics::v1::{HistogramDataPoint, NumberDataPoint}; - - use super::*; - - #[test] - fn test_normalize_otlp_name() { - assert_eq!(normalize_otlp_name("jvm.memory.free"), "jvm_memory_free"); - assert_eq!(normalize_otlp_name("jvm-memory-free"), "jvm_memory_free"); - assert_eq!(normalize_otlp_name("jvm_memory_free"), "jvm_memory_free"); - assert_eq!(normalize_otlp_name("JVM_MEMORY_FREE"), "jvm_memory_free"); - assert_eq!(normalize_otlp_name("JVM_memory_FREE"), "jvm_memory_free"); - } - - fn keyvalue(key: &str, value: &str) -> KeyValue { - KeyValue { - key: key.into(), - value: Some(AnyValue { - value: Some(Val::StringValue(value.into())), - }), - } - } - - #[test] - fn test_encode_gauge() { - let mut tables = MultiTableData::default(); - - let data_points = vec![ - NumberDataPoint { - attributes: vec![keyvalue("host", "testsevrer")], - time_unix_nano: 100, - value: Some(Value::AsInt(100)), - ..Default::default() - }, - NumberDataPoint { - attributes: vec![keyvalue("host", "testserver")], - time_unix_nano: 105, - value: Some(Value::AsInt(105)), - ..Default::default() - }, - ]; - let gauge = Gauge { data_points }; - encode_gauge( - &mut tables, - "datamon", - &gauge, - Some(&vec![keyvalue("resource", "app")]), - Some(&vec![keyvalue("scope", "otel")]), - ) - .unwrap(); - - let table = tables.get_or_default_table_data("datamon", 0, 0); - assert_eq!(table.num_rows(), 2); - assert_eq!(table.num_columns(), 5); - assert_eq!( - table - .columns() - .iter() - .map(|c| &c.column_name) - .collect::>(), - vec![ - "resource", - "scope", - "host", - "greptime_timestamp", - "greptime_value" - ] - ); - } - - #[test] - fn test_encode_sum() { - let mut tables = MultiTableData::default(); - - let data_points = vec![ - NumberDataPoint { - attributes: vec![keyvalue("host", "testserver")], - time_unix_nano: 100, - value: Some(Value::AsInt(100)), - ..Default::default() - }, - NumberDataPoint { - attributes: vec![keyvalue("host", "testserver")], - time_unix_nano: 105, - value: Some(Value::AsInt(0)), - ..Default::default() - }, - ]; - let sum = Sum { - data_points, - ..Default::default() - }; - encode_sum( - &mut tables, - "datamon", - &sum, - Some(&vec![keyvalue("resource", "app")]), - Some(&vec![keyvalue("scope", "otel")]), - ) - .unwrap(); - - let table = tables.get_or_default_table_data("datamon", 0, 0); - assert_eq!(table.num_rows(), 2); - assert_eq!(table.num_columns(), 5); - assert_eq!( - table - .columns() - .iter() - .map(|c| &c.column_name) - .collect::>(), - vec![ - "resource", - "scope", - "host", - "greptime_timestamp", - "greptime_value" - ] - ); - } - - #[test] - fn test_encode_summary() { - let mut tables = MultiTableData::default(); - - let data_points = vec![SummaryDataPoint { - attributes: vec![keyvalue("host", "testserver")], - time_unix_nano: 100, - count: 25, - sum: 5400.0, - quantile_values: vec![ - ValueAtQuantile { - quantile: 0.90, - value: 1000.0, - }, - ValueAtQuantile { - quantile: 0.95, - value: 3030.0, - }, - ], - 
..Default::default() - }]; - let summary = Summary { data_points }; - encode_summary( - &mut tables, - "datamon", - &summary, - Some(&vec![keyvalue("resource", "app")]), - Some(&vec![keyvalue("scope", "otel")]), - ) - .unwrap(); - - let table = tables.get_or_default_table_data("datamon", 0, 0); - assert_eq!(table.num_rows(), 1); - assert_eq!(table.num_columns(), 7); - assert_eq!( - table - .columns() - .iter() - .map(|c| &c.column_name) - .collect::>(), - vec![ - "resource", - "scope", - "host", - "greptime_timestamp", - "greptime_p90", - "greptime_p95", - "greptime_count" - ] - ); - } - - #[test] - fn test_encode_histogram() { - let mut tables = MultiTableData::default(); - - let data_points = vec![HistogramDataPoint { - attributes: vec![keyvalue("host", "testserver")], - time_unix_nano: 100, - start_time_unix_nano: 23, - count: 25, - sum: Some(100.), - max: Some(200.), - min: Some(0.03), - bucket_counts: vec![2, 4, 6, 9, 4], - explicit_bounds: vec![0.1, 1., 10., 100.], - ..Default::default() - }]; - - let histogram = Histogram { - data_points, - aggregation_temporality: AggregationTemporality::Delta.into(), - }; - encode_histogram( - &mut tables, - "histo", - &histogram, - Some(&vec![keyvalue("resource", "app")]), - Some(&vec![keyvalue("scope", "otel")]), - ) - .unwrap(); - - assert_eq!(3, tables.num_tables()); - - // bucket table - let bucket_table = tables.get_or_default_table_data("histo_bucket", 0, 0); - assert_eq!(bucket_table.num_rows(), 5); - assert_eq!(bucket_table.num_columns(), 6); - assert_eq!( - bucket_table - .columns() - .iter() - .map(|c| &c.column_name) - .collect::>(), - vec![ - "resource", - "scope", - "host", - "greptime_timestamp", - "le", - "greptime_value", - ] - ); - - let sum_table = tables.get_or_default_table_data("histo_sum", 0, 0); - assert_eq!(sum_table.num_rows(), 1); - assert_eq!(sum_table.num_columns(), 5); - assert_eq!( - sum_table - .columns() - .iter() - .map(|c| &c.column_name) - .collect::>(), - vec![ - "resource", - "scope", - "host", - "greptime_timestamp", - "greptime_value", - ] - ); - - let count_table = tables.get_or_default_table_data("histo_count", 0, 0); - assert_eq!(count_table.num_rows(), 1); - assert_eq!(count_table.num_columns(), 5); - assert_eq!( - count_table - .columns() - .iter() - .map(|c| &c.column_name) - .collect::>(), - vec![ - "resource", - "scope", - "host", - "greptime_timestamp", - "greptime_value", - ] - ); - } -} diff --git a/src/servers/src/otlp/metrics.rs b/src/servers/src/otlp/metrics.rs new file mode 100644 index 000000000000..cd7bbc7db81a --- /dev/null +++ b/src/servers/src/otlp/metrics.rs @@ -0,0 +1,658 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
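//! OTLP metrics support, moved out of `otlp.rs` into this module.
//!
//! Minimal usage sketch, assuming the OTLP HTTP handler ultimately delegates to
//! this module (`export_request` stands for a decoded `ExportMetricsServiceRequest`):
//!
//! ```ignore
//! let (row_inserts, row_count) = to_grpc_insert_requests(export_request)?;
//! ```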
+ +use api::v1::{RowInsertRequests, Value}; +use common_grpc::writer::Precision; +use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; +use opentelemetry_proto::tonic::common::v1::{any_value, KeyValue}; +use opentelemetry_proto::tonic::metrics::v1::{metric, number_data_point, *}; + +use super::{GREPTIME_COUNT, GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use crate::error::Result; +use crate::row_writer::{self, MultiTableData, TableData}; + +/// the default column count for table writer +const APPROXIMATE_COLUMN_COUNT: usize = 8; + +/// Normalize otlp instrumentation, metric and attribute names +/// +/// +/// - since the name are case-insensitive, we transform them to lowercase for +/// better sql usability +/// - replace `.` and `-` with `_` +fn normalize_otlp_name(name: &str) -> String { + name.to_lowercase().replace(|c| c == '.' || c == '-', "_") +} + +/// Convert OpenTelemetry metrics to GreptimeDB insert requests +/// +/// See +/// +/// for data structure of OTLP metrics. +/// +/// Returns `InsertRequests` and total number of rows to ingest +pub fn to_grpc_insert_requests( + request: ExportMetricsServiceRequest, +) -> Result<(RowInsertRequests, usize)> { + let mut table_writer = MultiTableData::default(); + + for resource in &request.resource_metrics { + let resource_attrs = resource.resource.as_ref().map(|r| &r.attributes); + for scope in &resource.scope_metrics { + let scope_attrs = scope.scope.as_ref().map(|s| &s.attributes); + for metric in &scope.metrics { + encode_metrics(&mut table_writer, metric, resource_attrs, scope_attrs)?; + } + } + } + + Ok(table_writer.into_row_insert_requests()) +} + +fn encode_metrics( + table_writer: &mut MultiTableData, + metric: &Metric, + resource_attrs: Option<&Vec>, + scope_attrs: Option<&Vec>, +) -> Result<()> { + let name = &metric.name; + // note that we don't store description or unit, we might want to deal with + // these fields in the future. 
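// How each OTLP metric kind is laid out by the encoders below (exercised by the
// tests at the bottom of this file):
// - Gauge / Sum: one table named after the metric, with tag columns taken from
//   resource/scope/data-point attributes plus `greptime_timestamp` and
//   `greptime_value`.
// - Summary: one table with a `greptime_pXX` column per quantile plus
//   `greptime_count`.
// - Histogram: three tables, `%metric%_bucket` (with an `le` tag holding the
//   bucket upper bound), `%metric%_sum` and `%metric%_count`.
// - ExponentialHistogram: skipped for now.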
+ if let Some(data) = &metric.data { + match data { + metric::Data::Gauge(gauge) => { + encode_gauge(table_writer, name, gauge, resource_attrs, scope_attrs)?; + } + metric::Data::Sum(sum) => { + encode_sum(table_writer, name, sum, resource_attrs, scope_attrs)?; + } + metric::Data::Summary(summary) => { + encode_summary(table_writer, name, summary, resource_attrs, scope_attrs)?; + } + metric::Data::Histogram(hist) => { + encode_histogram(table_writer, name, hist, resource_attrs, scope_attrs)?; + } + // TODO(sunng87) leave ExponentialHistogram for next release + metric::Data::ExponentialHistogram(_hist) => {} + } + } + + Ok(()) +} + +fn write_attributes( + writer: &mut TableData, + row: &mut Vec, + attrs: Option<&Vec>, +) -> Result<()> { + if let Some(attrs) = attrs { + let table_tags = attrs.iter().filter_map(|attr| { + if let Some(val) = attr.value.as_ref().and_then(|v| v.value.as_ref()) { + let key = normalize_otlp_name(&attr.key); + match val { + any_value::Value::StringValue(s) => Some((key, s.to_string())), + any_value::Value::IntValue(v) => Some((key, v.to_string())), + any_value::Value::DoubleValue(v) => Some((key, v.to_string())), + _ => None, // TODO(sunng87): allow different type of values + } + } else { + None + } + }); + + row_writer::write_tags(writer, table_tags, row)?; + } + Ok(()) +} + +fn write_timestamp(table: &mut TableData, row: &mut Vec, time_nano: i64) -> Result<()> { + row_writer::write_ts_precision( + table, + GREPTIME_TIMESTAMP, + Some(time_nano), + Precision::Nanosecond, + row, + ) +} + +fn write_data_point_value( + table: &mut TableData, + row: &mut Vec, + field: &str, + value: &Option, +) -> Result<()> { + match value { + Some(number_data_point::Value::AsInt(val)) => { + // we coerce all values to f64 + row_writer::write_f64(table, field, *val as f64, row)?; + } + Some(number_data_point::Value::AsDouble(val)) => { + row_writer::write_f64(table, field, *val, row)?; + } + _ => {} + } + Ok(()) +} + +fn write_tags_and_timestamp( + table: &mut TableData, + row: &mut Vec, + resource_attrs: Option<&Vec>, + scope_attrs: Option<&Vec>, + data_point_attrs: Option<&Vec>, + timestamp_nanos: i64, +) -> Result<()> { + write_attributes(table, row, resource_attrs)?; + write_attributes(table, row, scope_attrs)?; + write_attributes(table, row, data_point_attrs)?; + + write_timestamp(table, row, timestamp_nanos)?; + + Ok(()) +} + +/// encode this gauge metric +/// +/// note that there can be multiple data points in the request, it's going to be +/// stored as multiple rows +fn encode_gauge( + table_writer: &mut MultiTableData, + name: &str, + gauge: &Gauge, + resource_attrs: Option<&Vec>, + scope_attrs: Option<&Vec>, +) -> Result<()> { + let table = table_writer.get_or_default_table_data( + &normalize_otlp_name(name), + APPROXIMATE_COLUMN_COUNT, + gauge.data_points.len(), + ); + + for data_point in &gauge.data_points { + let mut row = table.alloc_one_row(); + write_tags_and_timestamp( + table, + &mut row, + resource_attrs, + scope_attrs, + Some(data_point.attributes.as_ref()), + data_point.time_unix_nano as i64, + )?; + + write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?; + table.add_row(row); + } + + Ok(()) +} + +/// encode this sum metric +/// +/// `aggregation_temporality` and `monotonic` are ignored for now +fn encode_sum( + table_writer: &mut MultiTableData, + name: &str, + sum: &Sum, + resource_attrs: Option<&Vec>, + scope_attrs: Option<&Vec>, +) -> Result<()> { + let table = table_writer.get_or_default_table_data( + &normalize_otlp_name(name), + 
APPROXIMATE_COLUMN_COUNT, + sum.data_points.len(), + ); + + for data_point in &sum.data_points { + let mut row = table.alloc_one_row(); + write_tags_and_timestamp( + table, + &mut row, + resource_attrs, + scope_attrs, + Some(data_point.attributes.as_ref()), + data_point.time_unix_nano as i64, + )?; + write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?; + table.add_row(row); + } + + Ok(()) +} + +const HISTOGRAM_LE_COLUMN: &str = "le"; + +/// Encode histogram data. This function returns 3 insert requests for 3 tables. +/// +/// The implementation has been following Prometheus histogram table format: +/// +/// - A `%metric%_bucket` table including `greptime_le` tag that stores bucket upper +/// limit, and `greptime_value` for bucket count +/// - A `%metric%_sum` table storing sum of samples +/// - A `%metric%_count` table storing count of samples. +/// +/// By its Prometheus compatibility, we hope to be able to use prometheus +/// quantile functions on this table. +fn encode_histogram( + table_writer: &mut MultiTableData, + name: &str, + hist: &Histogram, + resource_attrs: Option<&Vec>, + scope_attrs: Option<&Vec>, +) -> Result<()> { + let normalized_name = normalize_otlp_name(name); + + let bucket_table_name = format!("{}_bucket", normalized_name); + let sum_table_name = format!("{}_sum", normalized_name); + let count_table_name = format!("{}_count", normalized_name); + + let data_points_len = hist.data_points.len(); + // Note that the row and columns number here is approximate + let mut bucket_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len * 3); + let mut sum_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len); + let mut count_table = TableData::new(APPROXIMATE_COLUMN_COUNT, data_points_len); + + for data_point in &hist.data_points { + let mut accumulated_count = 0; + for (idx, count) in data_point.bucket_counts.iter().enumerate() { + let mut bucket_row = bucket_table.alloc_one_row(); + write_tags_and_timestamp( + &mut bucket_table, + &mut bucket_row, + resource_attrs, + scope_attrs, + Some(data_point.attributes.as_ref()), + data_point.time_unix_nano as i64, + )?; + + if let Some(upper_bounds) = data_point.explicit_bounds.get(idx) { + row_writer::write_tag( + &mut bucket_table, + HISTOGRAM_LE_COLUMN, + upper_bounds, + &mut bucket_row, + )?; + } else if idx == data_point.explicit_bounds.len() { + // The last bucket + row_writer::write_tag( + &mut bucket_table, + HISTOGRAM_LE_COLUMN, + f64::INFINITY, + &mut bucket_row, + )?; + } + + accumulated_count += count; + row_writer::write_f64( + &mut bucket_table, + GREPTIME_VALUE, + accumulated_count as f64, + &mut bucket_row, + )?; + + bucket_table.add_row(bucket_row); + } + + if let Some(sum) = data_point.sum { + let mut sum_row = sum_table.alloc_one_row(); + write_tags_and_timestamp( + &mut sum_table, + &mut sum_row, + resource_attrs, + scope_attrs, + Some(data_point.attributes.as_ref()), + data_point.time_unix_nano as i64, + )?; + + row_writer::write_f64(&mut sum_table, GREPTIME_VALUE, sum, &mut sum_row)?; + sum_table.add_row(sum_row); + } + + let mut count_row = count_table.alloc_one_row(); + write_tags_and_timestamp( + &mut count_table, + &mut count_row, + resource_attrs, + scope_attrs, + Some(data_point.attributes.as_ref()), + data_point.time_unix_nano as i64, + )?; + + row_writer::write_f64( + &mut count_table, + GREPTIME_VALUE, + data_point.count as f64, + &mut count_row, + )?; + count_table.add_row(count_row); + } + + table_writer.add_table_data(bucket_table_name, bucket_table); + 
table_writer.add_table_data(sum_table_name, sum_table); + table_writer.add_table_data(count_table_name, count_table); + + Ok(()) +} + +#[allow(dead_code)] +fn encode_exponential_histogram(_name: &str, _hist: &ExponentialHistogram) -> Result<()> { + // TODO(sunng87): implement this using a prometheus compatible way + Ok(()) +} + +fn encode_summary( + table_writer: &mut MultiTableData, + name: &str, + summary: &Summary, + resource_attrs: Option<&Vec>, + scope_attrs: Option<&Vec>, +) -> Result<()> { + let table = table_writer.get_or_default_table_data( + &normalize_otlp_name(name), + APPROXIMATE_COLUMN_COUNT, + summary.data_points.len(), + ); + + for data_point in &summary.data_points { + let mut row = table.alloc_one_row(); + write_tags_and_timestamp( + table, + &mut row, + resource_attrs, + scope_attrs, + Some(data_point.attributes.as_ref()), + data_point.time_unix_nano as i64, + )?; + + for quantile in &data_point.quantile_values { + row_writer::write_f64( + table, + &format!("greptime_p{:02}", quantile.quantile * 100f64), + quantile.value, + &mut row, + )?; + } + + row_writer::write_f64(table, GREPTIME_COUNT, data_point.count as f64, &mut row)?; + table.add_row(row); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use opentelemetry_proto::tonic::common::v1::any_value::Value as Val; + use opentelemetry_proto::tonic::common::v1::{AnyValue, KeyValue}; + use opentelemetry_proto::tonic::metrics::v1::number_data_point::Value; + use opentelemetry_proto::tonic::metrics::v1::summary_data_point::ValueAtQuantile; + use opentelemetry_proto::tonic::metrics::v1::{HistogramDataPoint, NumberDataPoint}; + + use super::*; + + #[test] + fn test_normalize_otlp_name() { + assert_eq!(normalize_otlp_name("jvm.memory.free"), "jvm_memory_free"); + assert_eq!(normalize_otlp_name("jvm-memory-free"), "jvm_memory_free"); + assert_eq!(normalize_otlp_name("jvm_memory_free"), "jvm_memory_free"); + assert_eq!(normalize_otlp_name("JVM_MEMORY_FREE"), "jvm_memory_free"); + assert_eq!(normalize_otlp_name("JVM_memory_FREE"), "jvm_memory_free"); + } + + fn keyvalue(key: &str, value: &str) -> KeyValue { + KeyValue { + key: key.into(), + value: Some(AnyValue { + value: Some(Val::StringValue(value.into())), + }), + } + } + + #[test] + fn test_encode_gauge() { + let mut tables = MultiTableData::default(); + + let data_points = vec![ + NumberDataPoint { + attributes: vec![keyvalue("host", "testsevrer")], + time_unix_nano: 100, + value: Some(Value::AsInt(100)), + ..Default::default() + }, + NumberDataPoint { + attributes: vec![keyvalue("host", "testserver")], + time_unix_nano: 105, + value: Some(Value::AsInt(105)), + ..Default::default() + }, + ]; + let gauge = Gauge { data_points }; + encode_gauge( + &mut tables, + "datamon", + &gauge, + Some(&vec![keyvalue("resource", "app")]), + Some(&vec![keyvalue("scope", "otel")]), + ) + .unwrap(); + + let table = tables.get_or_default_table_data("datamon", 0, 0); + assert_eq!(table.num_rows(), 2); + assert_eq!(table.num_columns(), 5); + assert_eq!( + table + .columns() + .iter() + .map(|c| &c.column_name) + .collect::>(), + vec![ + "resource", + "scope", + "host", + "greptime_timestamp", + "greptime_value" + ] + ); + } + + #[test] + fn test_encode_sum() { + let mut tables = MultiTableData::default(); + + let data_points = vec![ + NumberDataPoint { + attributes: vec![keyvalue("host", "testserver")], + time_unix_nano: 100, + value: Some(Value::AsInt(100)), + ..Default::default() + }, + NumberDataPoint { + attributes: vec![keyvalue("host", "testserver")], + time_unix_nano: 105, + value: 
Some(Value::AsInt(0)), + ..Default::default() + }, + ]; + let sum = Sum { + data_points, + ..Default::default() + }; + encode_sum( + &mut tables, + "datamon", + &sum, + Some(&vec![keyvalue("resource", "app")]), + Some(&vec![keyvalue("scope", "otel")]), + ) + .unwrap(); + + let table = tables.get_or_default_table_data("datamon", 0, 0); + assert_eq!(table.num_rows(), 2); + assert_eq!(table.num_columns(), 5); + assert_eq!( + table + .columns() + .iter() + .map(|c| &c.column_name) + .collect::>(), + vec![ + "resource", + "scope", + "host", + "greptime_timestamp", + "greptime_value" + ] + ); + } + + #[test] + fn test_encode_summary() { + let mut tables = MultiTableData::default(); + + let data_points = vec![SummaryDataPoint { + attributes: vec![keyvalue("host", "testserver")], + time_unix_nano: 100, + count: 25, + sum: 5400.0, + quantile_values: vec![ + ValueAtQuantile { + quantile: 0.90, + value: 1000.0, + }, + ValueAtQuantile { + quantile: 0.95, + value: 3030.0, + }, + ], + ..Default::default() + }]; + let summary = Summary { data_points }; + encode_summary( + &mut tables, + "datamon", + &summary, + Some(&vec![keyvalue("resource", "app")]), + Some(&vec![keyvalue("scope", "otel")]), + ) + .unwrap(); + + let table = tables.get_or_default_table_data("datamon", 0, 0); + assert_eq!(table.num_rows(), 1); + assert_eq!(table.num_columns(), 7); + assert_eq!( + table + .columns() + .iter() + .map(|c| &c.column_name) + .collect::>(), + vec![ + "resource", + "scope", + "host", + "greptime_timestamp", + "greptime_p90", + "greptime_p95", + "greptime_count" + ] + ); + } + + #[test] + fn test_encode_histogram() { + let mut tables = MultiTableData::default(); + + let data_points = vec![HistogramDataPoint { + attributes: vec![keyvalue("host", "testserver")], + time_unix_nano: 100, + start_time_unix_nano: 23, + count: 25, + sum: Some(100.), + max: Some(200.), + min: Some(0.03), + bucket_counts: vec![2, 4, 6, 9, 4], + explicit_bounds: vec![0.1, 1., 10., 100.], + ..Default::default() + }]; + + let histogram = Histogram { + data_points, + aggregation_temporality: AggregationTemporality::Delta.into(), + }; + encode_histogram( + &mut tables, + "histo", + &histogram, + Some(&vec![keyvalue("resource", "app")]), + Some(&vec![keyvalue("scope", "otel")]), + ) + .unwrap(); + + assert_eq!(3, tables.num_tables()); + + // bucket table + let bucket_table = tables.get_or_default_table_data("histo_bucket", 0, 0); + assert_eq!(bucket_table.num_rows(), 5); + assert_eq!(bucket_table.num_columns(), 6); + assert_eq!( + bucket_table + .columns() + .iter() + .map(|c| &c.column_name) + .collect::>(), + vec![ + "resource", + "scope", + "host", + "greptime_timestamp", + "le", + "greptime_value", + ] + ); + + let sum_table = tables.get_or_default_table_data("histo_sum", 0, 0); + assert_eq!(sum_table.num_rows(), 1); + assert_eq!(sum_table.num_columns(), 5); + assert_eq!( + sum_table + .columns() + .iter() + .map(|c| &c.column_name) + .collect::>(), + vec![ + "resource", + "scope", + "host", + "greptime_timestamp", + "greptime_value", + ] + ); + + let count_table = tables.get_or_default_table_data("histo_count", 0, 0); + assert_eq!(count_table.num_rows(), 1); + assert_eq!(count_table.num_columns(), 5); + assert_eq!( + count_table + .columns() + .iter() + .map(|c| &c.column_name) + .collect::>(), + vec![ + "resource", + "scope", + "host", + "greptime_timestamp", + "greptime_value", + ] + ); + } +} diff --git a/src/servers/src/otlp/plugin.rs b/src/servers/src/otlp/plugin.rs new file mode 100644 index 000000000000..ddcb4375e6d8 --- /dev/null 
+++ b/src/servers/src/otlp/plugin.rs
@@ -0,0 +1,28 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
+
+use super::trace::TraceSpans;
+
+/// A `TraceParser` transforms an `ExportTraceServiceRequest` with custom logic, for example:
+/// - uplifting some fields from the attributes (map type) into columns
+pub trait TraceParser: Send + Sync {
+    fn parse(&self, request: ExportTraceServiceRequest) -> TraceSpans;
+    fn table_name(&self) -> String;
+}
+
+pub type TraceParserRef = Arc<dyn TraceParser>;
diff --git a/src/servers/src/otlp/trace.rs b/src/servers/src/otlp/trace.rs
new file mode 100644
index 000000000000..20ba773f3db1
--- /dev/null
+++ b/src/servers/src/otlp/trace.rs
@@ -0,0 +1,411 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +use std::collections::HashMap; + +use api::v1::value::ValueData; +use api::v1::{ColumnDataType, RowInsertRequests}; +use common_grpc::writer::Precision; +use common_time::time::Time; +use itertools::Itertools; +use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; +use opentelemetry_proto::tonic::common::v1::any_value::Value as OtlpValue; +use opentelemetry_proto::tonic::common::v1::{ + AnyValue, ArrayValue, InstrumentationScope, KeyValue, KeyValueList, +}; +use opentelemetry_proto::tonic::trace::v1::span::{Event, Link}; +use opentelemetry_proto::tonic::trace::v1::{Span, Status}; +use serde_json::json; + +use super::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use crate::error::Result; +use crate::row_writer::{self, MultiTableData, TableData}; + +const APPROXIMATE_COLUMN_COUNT: usize = 24; +pub const TRACE_TABLE_NAME: &str = "traces_preview_v01"; + +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct TraceSpan { + // the following are tags + pub trace_id: String, + pub span_id: String, + pub parent_span_id: String, + + // the following are fields + pub resource_attributes: String, // TODO(yuanbohan): Map in the future + pub scope_name: String, + pub scope_version: String, + pub scope_attributes: String, // TODO(yuanbohan): Map in the future + pub trace_state: String, + pub span_name: String, + pub span_kind: String, + pub span_status_code: String, + pub span_status_message: String, + pub span_attributes: String, // TODO(yuanbohan): Map in the future + pub span_events: String, // TODO(yuanbohan): List in the future + pub span_links: String, // TODO(yuanbohan): List in the future + pub start_in_nanosecond: u64, // this is also the Timestamp Index + pub end_in_nanosecond: u64, + + pub uplifted_fields: Vec<(String, ColumnDataType, ValueData)>, +} + +pub type TraceSpans = Vec; + +/// Convert SpanTraces to GreptimeDB row insert requests. 
+/// Returns `InsertRequests` and total number of rows to ingest +pub fn to_grpc_insert_requests( + table_name: String, + spans: TraceSpans, +) -> Result<(RowInsertRequests, usize)> { + let mut multi_table_writer = MultiTableData::default(); + let one_table_writer = multi_table_writer.get_or_default_table_data( + table_name, + APPROXIMATE_COLUMN_COUNT, + spans.len(), + ); + + for span in spans { + write_span_to_row(one_table_writer, span)?; + } + + Ok(multi_table_writer.into_row_insert_requests()) +} + +pub fn write_span_to_row(writer: &mut TableData, span: TraceSpan) -> Result<()> { + let mut row = writer.alloc_one_row(); + { + // tags + let iter = vec![ + ("trace_id", span.trace_id), + ("span_id", span.span_id), + ("parent_span_id", span.parent_span_id), + ] + .into_iter() + .map(|(col, val)| (col.to_string(), val)); + row_writer::write_tags(writer, iter, &mut row)?; + } + { + // fields + let str_fields_iter = vec![ + ("resource_attributes", span.resource_attributes), + ("scope_name", span.scope_name), + ("scope_version", span.scope_version), + ("scope_attributes", span.scope_attributes), + ("trace_state", span.trace_state), + ("span_name", span.span_name), + ("span_kind", span.span_kind), + ("span_status_code", span.span_status_code), + ("span_status_message", span.span_status_message), + ("span_attributes", span.span_attributes), + ("span_events", span.span_events), + ("span_links", span.span_links), + ] + .into_iter() + .map(|(col, val)| { + ( + col.into(), + ColumnDataType::String, + ValueData::StringValue(val), + ) + }); + + let time_fields_iter = vec![ + ("start", span.start_in_nanosecond), + ("end", span.end_in_nanosecond), + ] + .into_iter() + .map(|(col, val)| { + ( + col.into(), + ColumnDataType::TimestampNanosecond, + ValueData::TimestampNanosecondValue(val as i64), + ) + }); + + row_writer::write_fields(writer, str_fields_iter, &mut row)?; + row_writer::write_fields(writer, time_fields_iter, &mut row)?; + row_writer::write_fields(writer, span.uplifted_fields.into_iter(), &mut row)?; + } + + row_writer::write_f64( + writer, + GREPTIME_VALUE, + (span.end_in_nanosecond - span.start_in_nanosecond) as f64 / 1_000_000.0, // duration in millisecond + &mut row, + )?; + row_writer::write_ts_precision( + writer, + GREPTIME_TIMESTAMP, + Some(span.start_in_nanosecond as i64), + Precision::Nanosecond, + &mut row, + )?; + + writer.add_row(row); + + Ok(()) +} + +pub fn parse_span( + resource_attrs: &[KeyValue], + scope: &InstrumentationScope, + span: Span, +) -> TraceSpan { + let (span_status_code, span_status_message) = status_to_string(&span.status); + let span_kind = span.kind().as_str_name().into(); + TraceSpan { + trace_id: bytes_to_hex_string(&span.trace_id), + span_id: bytes_to_hex_string(&span.span_id), + parent_span_id: bytes_to_hex_string(&span.parent_span_id), + + resource_attributes: vec_kv_to_string(resource_attrs), + trace_state: span.trace_state, + + scope_name: scope.name.clone(), + scope_version: scope.version.clone(), + scope_attributes: vec_kv_to_string(&scope.attributes), + + span_name: span.name, + span_kind, + span_status_code, + span_status_message, + span_attributes: vec_kv_to_string(&span.attributes), + span_events: events_to_string(&span.events), + span_links: links_to_string(&span.links), + + start_in_nanosecond: span.start_time_unix_nano, + end_in_nanosecond: span.end_time_unix_nano, + + uplifted_fields: vec![], + } +} + +/// Convert OpenTelemetry traces to SpanTraces +/// +/// See +/// +/// for data structure of OTLP traces. 
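+///
+/// The nested `resource_spans -> scope_spans -> spans` hierarchy is walked in
+/// order, and every span becomes one flat `TraceSpan` that carries the
+/// resource attributes and the scope name/version it was nested under.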
+pub fn parse(request: ExportTraceServiceRequest) -> TraceSpans { + let mut spans = vec![]; + for resource_spans in request.resource_spans { + let resource_attrs = resource_spans + .resource + .map(|r| r.attributes) + .unwrap_or_default(); + for scope_spans in resource_spans.scope_spans { + let scope = scope_spans.scope.unwrap_or_default(); + for span in scope_spans.spans { + spans.push(parse_span(&resource_attrs, &scope, span)); + } + } + } + spans +} + +pub fn bytes_to_hex_string(bs: &[u8]) -> String { + bs.iter().map(|b| format!("{:02x}", b)).join("") +} + +pub fn arr_vals_to_string(arr: &ArrayValue) -> String { + let vs: Vec = arr + .values + .iter() + .filter_map(|val| any_value_to_string(val.clone())) + .collect(); + + serde_json::to_string(&vs).unwrap_or_else(|_| "[]".into()) +} + +pub fn vec_kv_to_string(vec: &[KeyValue]) -> String { + let vs: HashMap = vec + .iter() + .map(|kv| { + let val = kv + .value + .clone() + .and_then(any_value_to_string) + .unwrap_or_default(); + (kv.key.clone(), val) + }) + .collect(); + + serde_json::to_string(&vs).unwrap_or_else(|_| "{}".into()) +} + +pub fn kvlist_to_string(kvlist: &KeyValueList) -> String { + vec_kv_to_string(&kvlist.values) +} + +pub fn any_value_to_string(val: AnyValue) -> Option { + val.value.map(|value| match value { + OtlpValue::StringValue(s) => s, + OtlpValue::BoolValue(b) => b.to_string(), + OtlpValue::IntValue(i) => i.to_string(), + OtlpValue::DoubleValue(d) => d.to_string(), + OtlpValue::ArrayValue(arr) => arr_vals_to_string(&arr), + OtlpValue::KvlistValue(kv) => kvlist_to_string(&kv), + OtlpValue::BytesValue(bs) => bytes_to_hex_string(&bs), + }) +} + +pub fn event_to_string(event: &Event) -> String { + json!({ + "name": event.name, + "time": Time::new_nanosecond(event.time_unix_nano as i64).to_iso8601_string(), + "attrs": vec_kv_to_string(&event.attributes), + }) + .to_string() +} + +pub fn events_to_string(events: &[Event]) -> String { + let v: Vec = events.iter().map(event_to_string).collect(); + serde_json::to_string(&v).unwrap_or_else(|_| "[]".into()) +} + +pub fn link_to_string(link: &Link) -> String { + json!({ + "trace_id": link.trace_id, + "span_id": link.span_id, + "trace_state": link.trace_state, + "attributes": vec_kv_to_string(&link.attributes), + }) + .to_string() +} + +pub fn links_to_string(links: &[Link]) -> String { + let v: Vec = links.iter().map(link_to_string).collect(); + serde_json::to_string(&v).unwrap_or_else(|_| "[]".into()) +} + +pub fn status_to_string(status: &Option) -> (String, String) { + match status { + Some(status) => (status.code().as_str_name().into(), status.message.clone()), + None => ("".into(), "".into()), + } +} + +#[cfg(test)] +mod tests { + use common_time::time::Time; + use opentelemetry_proto::tonic::common::v1::{ + any_value, AnyValue, ArrayValue, KeyValue, KeyValueList, + }; + use opentelemetry_proto::tonic::trace::v1::span::Event; + use opentelemetry_proto::tonic::trace::v1::Status; + use serde_json::json; + + use crate::otlp::trace::{ + arr_vals_to_string, bytes_to_hex_string, event_to_string, kvlist_to_string, + status_to_string, vec_kv_to_string, + }; + + #[test] + fn test_bytes_to_hex_string() { + assert_eq!( + "24fe79948641b110a29bc27859307e8d", + bytes_to_hex_string(&[ + 36, 254, 121, 148, 134, 65, 177, 16, 162, 155, 194, 120, 89, 48, 126, 141, + ]) + ); + + assert_eq!( + "baffeedd7b8debc0", + bytes_to_hex_string(&[186, 255, 238, 221, 123, 141, 235, 192,]) + ); + } + + #[test] + fn test_arr_vals_to_string() { + assert_eq!("[]", arr_vals_to_string(&ArrayValue { values: 
vec![] })); + + let arr = ArrayValue { + values: vec![ + AnyValue { + value: Some(any_value::Value::StringValue("string_value".into())), + }, + AnyValue { + value: Some(any_value::Value::BoolValue(true)), + }, + AnyValue { + value: Some(any_value::Value::IntValue(1)), + }, + AnyValue { + value: Some(any_value::Value::DoubleValue(1.2)), + }, + ], + }; + let expect = json!(["string_value", "true", "1", "1.2"]).to_string(); + assert_eq!(expect, arr_vals_to_string(&arr)); + } + + #[test] + fn test_kv_list_to_string() { + let kvlist = KeyValueList { + values: vec![KeyValue { + key: "str_key".into(), + value: Some(AnyValue { + value: Some(any_value::Value::StringValue("val1".into())), + }), + }], + }; + let expect = json!({ + "str_key": "val1", + }) + .to_string(); + assert_eq!(expect, kvlist_to_string(&kvlist)) + } + + #[test] + fn test_event_to_string() { + let attributes = vec![KeyValue { + key: "str_key".into(), + value: Some(AnyValue { + value: Some(any_value::Value::StringValue("val1".into())), + }), + }]; + let event = Event { + time_unix_nano: 1697620662450128000_u64, + name: "event_name".into(), + attributes, + dropped_attributes_count: 0, + }; + let event_string = event_to_string(&event); + let expect = json!({ + "name": event.name, + "time": Time::new_nanosecond(event.time_unix_nano as i64).to_iso8601_string(), + "attrs": vec_kv_to_string(&event.attributes), + }); + + assert_eq!( + expect, + serde_json::from_str::(event_string.as_str()).unwrap() + ); + } + + #[test] + fn test_status_to_string() { + let message = String::from("status message"); + let status = Status { + code: 1, + message: message.clone(), + }; + + assert_eq!( + ("STATUS_CODE_OK".into(), message), + status_to_string(&Some(status)), + ); + } +} diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index ef8f74575e7c..06ef6bbda5c0 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -34,6 +34,9 @@ use common_query::Output; use opentelemetry_proto::tonic::collector::metrics::v1::{ ExportMetricsServiceRequest, ExportMetricsServiceResponse, }; +use opentelemetry_proto::tonic::collector::trace::v1::{ + ExportTraceServiceRequest, ExportTraceServiceResponse, +}; use session::context::QueryContextRef; use crate::error::Result; @@ -74,7 +77,7 @@ pub trait InfluxdbLineProtocolHandler { pub trait OpentsdbProtocolHandler { /// A successful request will not return a response. /// Only on error will the socket return a line of data. 
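+    /// The handler receives a whole batch of data points and returns how many
+    /// of them were handled (the dummy handlers in the tests below simply
+    /// return `data_points.len()`).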
- async fn exec(&self, data_point: &DataPoint, ctx: QueryContextRef) -> Result<()>; + async fn exec(&self, data_points: Vec, ctx: QueryContextRef) -> Result; } pub struct PromStoreResponse { @@ -101,4 +104,11 @@ pub trait OpenTelemetryProtocolHandler { request: ExportMetricsServiceRequest, ctx: QueryContextRef, ) -> Result; + + /// Handling opentelemetry traces request + async fn traces( + &self, + request: ExportTraceServiceRequest, + ctx: QueryContextRef, + ) -> Result; } diff --git a/src/servers/tests/http/opentsdb_test.rs b/src/servers/tests/http/opentsdb_test.rs index e77143d3b3a1..388e8b6c0e44 100644 --- a/src/servers/tests/http/opentsdb_test.rs +++ b/src/servers/tests/http/opentsdb_test.rs @@ -51,7 +51,8 @@ impl GrpcQueryHandler for DummyInstance { #[async_trait] impl OpentsdbProtocolHandler for DummyInstance { - async fn exec(&self, data_point: &DataPoint, _ctx: QueryContextRef) -> Result<()> { + async fn exec(&self, data_points: Vec, _ctx: QueryContextRef) -> Result { + let data_point = data_points.first().unwrap(); if data_point.metric() == "should_failed" { return error::InternalSnafu { err_msg: "expected", @@ -59,7 +60,7 @@ impl OpentsdbProtocolHandler for DummyInstance { .fail(); } let _ = self.tx.send(data_point.metric().to_string()).await; - Ok(()) + Ok(data_points.len()) } } @@ -163,19 +164,13 @@ async fn test_opentsdb_put() { .send() .await; assert_eq!(result.status(), 500); - assert_eq!( - result.text().await, - "{\"error\":\"Internal error: Internal error: expected\"}" - ); + assert_eq!(result.text().await, "{\"error\":\"Internal error: 1003\"}"); let mut metrics = vec![]; while let Ok(s) = rx.try_recv() { metrics.push(s); } - assert_eq!( - metrics, - vec!["m1".to_string(), "m2".to_string(), "m3".to_string()] - ); + assert_eq!(metrics, vec!["m1".to_string(), "m2".to_string()]); } #[tokio::test] @@ -208,7 +203,7 @@ async fn test_opentsdb_debug_put() { .send() .await; assert_eq!(result.status(), 200); - assert_eq!(result.text().await, "{\"success\":0,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: expected\"}]}"); + assert_eq!(result.text().await, "{\"success\":0,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: 1003\"}]}"); // multiple data point summary debug put let result = client @@ -233,7 +228,7 @@ async fn test_opentsdb_debug_put() { .send() .await; assert_eq!(result.status(), 200); - assert_eq!(result.text().await, "{\"success\":1,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: expected\"}]}"); + assert_eq!(result.text().await, "{\"success\":1,\"failed\":1,\"errors\":[{\"datapoint\":{\"metric\":\"should_failed\",\"timestamp\":1000,\"value\":1.0,\"tags\":{\"host\":\"web01\"}},\"error\":\"Internal error: 1003\"}]}"); let mut metrics = vec![]; while let Ok(s) = rx.try_recv() { diff --git a/src/servers/tests/opentsdb.rs b/src/servers/tests/opentsdb.rs index 145fdc07dbe6..79ac2ba21939 100644 --- a/src/servers/tests/opentsdb.rs +++ b/src/servers/tests/opentsdb.rs @@ -37,8 +37,8 @@ struct DummyOpentsdbInstance { #[async_trait] impl OpentsdbProtocolHandler for DummyOpentsdbInstance { - async fn exec(&self, data_point: &DataPoint, _ctx: QueryContextRef) -> Result<()> { - let metric = data_point.metric(); + async fn exec(&self, data_points: 
Vec, _ctx: QueryContextRef) -> Result { + let metric = data_points.first().unwrap().metric(); if metric == "should_failed" { return server_error::InternalSnafu { err_msg: "expected", @@ -47,7 +47,7 @@ impl OpentsdbProtocolHandler for DummyOpentsdbInstance { } let i = metric.parse::().unwrap(); let _ = self.tx.send(i * i).await; - Ok(()) + Ok(data_points.len()) } } diff --git a/src/storage/src/chunk.rs b/src/storage/src/chunk.rs index 8cef41dea647..e8fc4a555c83 100644 --- a/src/storage/src/chunk.rs +++ b/src/storage/src/chunk.rs @@ -245,11 +245,7 @@ impl ChunkReaderBuilder { reader_builder = reader_builder.push_batch_iter(iter); } - let predicate = Predicate::try_new( - self.filters.clone(), - self.schema.store_schema().schema().clone(), - ) - .context(error::BuildPredicateSnafu)?; + let predicate = Predicate::new(self.filters.clone()); let read_opts = ReadOptions { batch_size: self.iter_ctx.batch_size, diff --git a/src/storage/src/sst/parquet.rs b/src/storage/src/sst/parquet.rs index d989f674ded2..fa0cb9c56e0e 100644 --- a/src/storage/src/sst/parquet.rs +++ b/src/storage/src/sst/parquet.rs @@ -277,7 +277,10 @@ impl ParquetReader { let pruned_row_groups = self .predicate - .prune_row_groups(builder.metadata().row_groups()) + .prune_row_groups( + builder.metadata().row_groups(), + store_schema.schema().clone(), + ) .into_iter() .enumerate() .filter_map(|(idx, valid)| if valid { Some(idx) } else { None }) @@ -549,12 +552,11 @@ mod tests { let operator = create_object_store(dir.path().to_str().unwrap()); let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap()); - let user_schema = projected_schema.projected_user_schema().clone(); let reader = ParquetReader::new( sst_file_handle, operator, projected_schema, - Predicate::empty(user_schema), + Predicate::empty(), TimestampRange::min_to_max(), ); @@ -636,12 +638,11 @@ mod tests { let operator = create_object_store(dir.path().to_str().unwrap()); let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap()); - let user_schema = projected_schema.projected_user_schema().clone(); let reader = ParquetReader::new( file_handle, operator, projected_schema, - Predicate::empty(user_schema), + Predicate::empty(), TimestampRange::min_to_max(), ); @@ -665,14 +666,8 @@ mod tests { range: TimestampRange, expect: Vec, ) { - let store_schema = schema.schema_to_read().clone(); - let reader = ParquetReader::new( - file_handle, - object_store, - schema, - Predicate::empty(store_schema.schema().clone()), - range, - ); + let reader = + ParquetReader::new(file_handle, object_store, schema, Predicate::empty(), range); let mut stream = reader.chunk_stream().await.unwrap(); let result = stream.next_batch().await; diff --git a/src/storage/src/sst/pruning.rs b/src/storage/src/sst/pruning.rs index 7e24c894d5dd..6cfbd105f211 100644 --- a/src/storage/src/sst/pruning.rs +++ b/src/storage/src/sst/pruning.rs @@ -29,9 +29,11 @@ use datatypes::prelude::ConcreteDataType; use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter}; use parquet::arrow::ProjectionMask; use parquet::schema::types::SchemaDescriptor; +use snafu::ResultExt; use table::predicate::Predicate; use crate::error; +use crate::error::BuildPredicateSnafu; use crate::schema::StoreSchema; /// Builds row filters according to predicates. 
@@ -80,7 +82,11 @@ pub(crate) fn build_row_filter( Box::new(PlainTimestampRowFilter::new(time_range, ts_col_projection)) as _ }; let mut predicates = vec![time_range_row_filter]; - if let Ok(datafusion_filters) = predicate_to_row_filter(predicate, projection_mask) { + if let Ok(datafusion_filters) = predicate_to_row_filter( + predicate, + projection_mask, + store_schema.schema().arrow_schema(), + ) { predicates.extend(datafusion_filters); } let filter = RowFilter::new(predicates); @@ -90,9 +96,13 @@ pub(crate) fn build_row_filter( fn predicate_to_row_filter( predicate: &Predicate, projection_mask: ProjectionMask, + schema: &arrow::datatypes::SchemaRef, ) -> error::Result>> { - let mut datafusion_predicates = Vec::with_capacity(predicate.exprs().len()); - for expr in predicate.exprs() { + let physical_exprs = predicate + .to_physical_exprs(schema) + .context(BuildPredicateSnafu)?; + let mut datafusion_predicates = Vec::with_capacity(physical_exprs.len()); + for expr in &physical_exprs { datafusion_predicates.push(Box::new(DatafusionArrowPredicate { projection_mask: projection_mask.clone(), physical_expr: expr.clone(), diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs index 0e0ab420a333..4dcdb1ce0303 100644 --- a/src/table/src/predicate.rs +++ b/src/table/src/predicate.rs @@ -27,6 +27,7 @@ use datafusion_expr::expr::InList; use datafusion_expr::{Between, BinaryExpr, ColumnarValue, Operator}; use datafusion_physical_expr::execution_props::ExecutionProps; use datafusion_physical_expr::{create_physical_expr, PhysicalExpr}; +use datatypes::arrow; use datatypes::arrow::array::BooleanArray; use datatypes::schema::SchemaRef; use datatypes::value::scalar_value_to_timestamp; @@ -39,19 +40,24 @@ mod stats; #[derive(Clone)] pub struct Predicate { - /// The schema of the table that the expressions being applied. - schema: SchemaRef, - /// Physical expressions of this predicate. - exprs: Vec>, + /// logical exprs + exprs: Vec, } impl Predicate { /// Creates a new `Predicate` by converting logical exprs to physical exprs that can be /// evaluated against record batches. /// Returns error when failed to convert exprs. - pub fn try_new(exprs: Vec, schema: SchemaRef) -> error::Result { - let arrow_schema = schema.arrow_schema(); - let df_schema = arrow_schema + pub fn new(exprs: Vec) -> Self { + Self { exprs } + } + + /// Builds physical exprs according to provided schema. + pub fn to_physical_exprs( + &self, + schema: &arrow::datatypes::SchemaRef, + ) -> error::Result>> { + let df_schema = schema .clone() .to_dfschema_ref() .context(error::DatafusionSnafu)?; @@ -61,47 +67,38 @@ impl Predicate { // registering variables. let execution_props = &ExecutionProps::new(); - let physical_exprs = exprs + self.exprs .iter() .map(|expr| { - create_physical_expr( - expr.df_expr(), - df_schema.as_ref(), - arrow_schema.as_ref(), - execution_props, - ) + create_physical_expr(expr.df_expr(), df_schema.as_ref(), schema, execution_props) }) .collect::>() - .context(error::DatafusionSnafu)?; - - Ok(Self { - schema, - exprs: physical_exprs, - }) - } - - #[inline] - pub fn exprs(&self) -> &[Arc] { - &self.exprs + .context(error::DatafusionSnafu) } /// Builds an empty predicate from given schema. - pub fn empty(schema: SchemaRef) -> Self { - Self { - schema, - exprs: vec![], - } + pub fn empty() -> Self { + Self { exprs: vec![] } } /// Evaluates the predicate against row group metadata. /// Returns a vector of boolean values, among which `false` means the row group can be skipped. 
- pub fn prune_row_groups(&self, row_groups: &[RowGroupMetaData]) -> Vec { + pub fn prune_row_groups( + &self, + row_groups: &[RowGroupMetaData], + schema: SchemaRef, + ) -> Vec { let mut res = vec![true; row_groups.len()]; - let arrow_schema = self.schema.arrow_schema(); - for expr in &self.exprs { + + let Ok(physical_exprs) = self.to_physical_exprs(schema.arrow_schema()) else { + return res; + }; + + let arrow_schema = schema.arrow_schema(); + for expr in &physical_exprs { match PruningPredicate::try_new(expr.clone(), arrow_schema.clone()) { Ok(p) => { - let stat = RowGroupPruningStatistics::new(row_groups, &self.schema); + let stat = RowGroupPruningStatistics::new(row_groups, &schema); match p.prune(&stat) { Ok(r) => { for (curr_val, res) in r.into_iter().zip(res.iter_mut()) { @@ -123,7 +120,9 @@ impl Predicate { /// Prunes primary keys pub fn prune_primary_key(&self, primary_key: &RecordBatch) -> error::Result { - for expr in &self.exprs { + let pk_schema = primary_key.schema(); + let physical_exprs = self.to_physical_exprs(&pk_schema)?; + for expr in &physical_exprs { // evaluate every filter against primary key let Ok(eva) = expr.evaluate(primary_key) else { continue; @@ -156,11 +155,22 @@ impl Predicate { /// Evaluates the predicate against the `stats`. /// Returns a vector of boolean values, among which `false` means the row group can be skipped. - pub fn prune_with_stats(&self, stats: &S) -> Vec { + pub fn prune_with_stats( + &self, + stats: &S, + schema: &arrow::datatypes::SchemaRef, + ) -> Vec { let mut res = vec![true; stats.num_containers()]; - let arrow_schema = self.schema.arrow_schema(); - for expr in &self.exprs { - match PruningPredicate::try_new(expr.clone(), arrow_schema.clone()) { + let physical_exprs = match self.to_physical_exprs(schema) { + Ok(expr) => expr, + Err(e) => { + warn!(e; "Failed to build physical expr from predicates: {:?}", &self.exprs); + return res; + } + }; + + for expr in &physical_exprs { + match PruningPredicate::try_new(expr.clone(), schema.clone()) { Ok(p) => match p.prune(stats) { Ok(r) => { for (curr_val, res) in r.into_iter().zip(res.iter_mut()) { @@ -643,7 +653,7 @@ mod tests { let dir = create_temp_dir("prune_parquet"); let (path, schema) = gen_test_parquet_file(&dir, array_cnt).await; let schema = Arc::new(datatypes::schema::Schema::try_from(schema).unwrap()); - let arrow_predicate = Predicate::try_new(filters, schema.clone()).unwrap(); + let arrow_predicate = Predicate::new(filters); let builder = ParquetRecordBatchStreamBuilder::new( tokio::fs::OpenOptions::new() .read(true) @@ -655,7 +665,7 @@ mod tests { .unwrap(); let metadata = builder.metadata().clone(); let row_groups = metadata.row_groups(); - let res = arrow_predicate.prune_row_groups(row_groups); + let res = arrow_predicate.prune_row_groups(row_groups, schema); assert_eq!(expect, res); } diff --git a/tests-integration/src/opentsdb.rs b/tests-integration/src/opentsdb.rs index 5d6338d94270..c5474ea4e240 100644 --- a/tests-integration/src/opentsdb.rs +++ b/tests-integration/src/opentsdb.rs @@ -46,6 +46,8 @@ mod tests { async fn test_exec(instance: &Arc) { let ctx = QueryContext::arc(); + + // should create new table "my_metric_1" directly let data_point1 = DataPoint::new( "my_metric_1".to_string(), 1000, @@ -55,9 +57,8 @@ mod tests { ("tagk2".to_string(), "tagv2".to_string()), ], ); - // should create new table "my_metric_1" directly - instance.exec(&data_point1, ctx.clone()).await.unwrap(); + // should create new column "tagk3" directly let data_point2 = DataPoint::new( 
"my_metric_1".to_string(), 2000, @@ -67,12 +68,12 @@ mod tests { ("tagk3".to_string(), "tagv3".to_string()), ], ); - // should create new column "tagk3" directly - instance.exec(&data_point2, ctx.clone()).await.unwrap(); - let data_point3 = DataPoint::new("my_metric_1".to_string(), 3000, 3.0, vec![]); // should handle null tags properly - instance.exec(&data_point3, ctx.clone()).await.unwrap(); + let data_point3 = DataPoint::new("my_metric_1".to_string(), 3000, 3.0, vec![]); + + let data_points = vec![data_point1, data_point2, data_point3]; + instance.exec(data_points, ctx.clone()).await.unwrap(); let output = instance .do_query( @@ -87,13 +88,13 @@ mod tests { let recordbatches = RecordBatches::try_collect(stream).await.unwrap(); let pretty_print = recordbatches.pretty_print().unwrap(); let expected = vec![ - "+---------------------+----------------+-------+-------+-------+", - "| greptime_timestamp | greptime_value | tagk1 | tagk2 | tagk3 |", - "+---------------------+----------------+-------+-------+-------+", - "| 1970-01-01T00:00:01 | 1.0 | tagv1 | tagv2 | |", - "| 1970-01-01T00:00:02 | 2.0 | | tagv2 | tagv3 |", - "| 1970-01-01T00:00:03 | 3.0 | | | |", - "+---------------------+----------------+-------+-------+-------+", + "+-------+-------+----------------+---------------------+-------+", + "| tagk1 | tagk2 | greptime_value | greptime_timestamp | tagk3 |", + "+-------+-------+----------------+---------------------+-------+", + "| tagv1 | tagv2 | 1.0 | 1970-01-01T00:00:01 | |", + "| | tagv2 | 2.0 | 1970-01-01T00:00:02 | tagv3 |", + "| | | 3.0 | 1970-01-01T00:00:03 | |", + "+-------+-------+----------------+---------------------+-------+", ] .into_iter() .join("\n"); diff --git a/tests/cases/distributed/optimizer/filter_push_down.result b/tests/cases/distributed/optimizer/filter_push_down.result index e6954bc84807..2b2d7af35550 100644 --- a/tests/cases/distributed/optimizer/filter_push_down.result +++ b/tests/cases/distributed/optimizer/filter_push_down.result @@ -237,7 +237,7 @@ SELECT i FROM (SELECT * FROM integers i1 UNION SELECT * FROM integers i2) a WHER -- SELECT * FROM (SELECT i1.i AS a, i2.i AS b, row_number() OVER (ORDER BY i1.i, i2.i) FROM integers i1, integers i2 WHERE i1.i IS NOT NULL AND i2.i IS NOT NULL) a1 WHERE a=b ORDER BY 1; SELECT * FROM (SELECT 0=1 AS cond FROM integers i1, integers i2) a1 WHERE cond ORDER BY 1; -Error: 1003(Internal), Invalid argument error: must either specify a row count or at least one column +Error: 3001(EngineExecuteQuery), Invalid argument error: must either specify a row count or at least one column SELECT * FROM (SELECT 0=1 AS cond FROM integers i1, integers i2 GROUP BY 1) a1 WHERE cond ORDER BY 1; diff --git a/tests/cases/standalone/common/order/order_by_exceptions.result b/tests/cases/standalone/common/order/order_by_exceptions.result index bd3bfe545a44..f5d049f0f495 100644 --- a/tests/cases/standalone/common/order/order_by_exceptions.result +++ b/tests/cases/standalone/common/order/order_by_exceptions.result @@ -13,7 +13,7 @@ Error: 3000(PlanQuery), Error during planning: Order by column out of bounds, sp -- Not work in greptimedb SELECT a FROM test ORDER BY 'hello', a; -Error: 1003(Internal), Error during planning: Sort operation is not applicable to scalar value hello +Error: 3001(EngineExecuteQuery), Error during planning: Sort operation is not applicable to scalar value hello -- Ambiguous reference in union alias, give and error in duckdb, but works in greptimedb SELECT a AS k, b FROM test UNION SELECT a, b AS k FROM test ORDER BY k; 
@@ -54,7 +54,7 @@ Error: 3000(PlanQuery), Error during planning: Order by column out of bounds, sp SELECT a % 2, b FROM test UNION SELECT a % 2 AS k, b FROM test ORDER BY -1; -Error: 1003(Internal), Error during planning: Sort operation is not applicable to scalar value -1 +Error: 3001(EngineExecuteQuery), Error during planning: Sort operation is not applicable to scalar value -1 SELECT a % 2, b FROM test UNION SELECT a % 2 AS k FROM test ORDER BY -1; diff --git a/tests/cases/standalone/common/promql/simple_histogram.result b/tests/cases/standalone/common/promql/simple_histogram.result new file mode 100644 index 000000000000..f67659b3a111 --- /dev/null +++ b/tests/cases/standalone/common/promql/simple_histogram.result @@ -0,0 +1,246 @@ +-- from prometheus/promql/testdata/histograms.test +-- cases related to metric `testhistogram_bucket` +create table histogram_bucket ( + ts timestamp time index, + le string, + s string, + val double, + primary key (s, le), +); + +Affected Rows: 0 + +insert into histogram_bucket values + (3000000, "0.1", "positive", 50), + (3000000, ".2", "positive", 70), + (3000000, "1e0", "positive", 110), + (3000000, "+Inf", "positive", 120), + (3000000, "-.2", "negative", 10), + (3000000, "-0.1", "negative", 20), + (3000000, "0.3", "negative", 20), + (3000000, "+Inf", "negative", 30); + +Affected Rows: 8 + +-- Quantile too low. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(-0.1, histogram_bucket); + ++---------------------+----------+------+ +| ts | s | val | ++---------------------+----------+------+ +| 1970-01-01T00:50:00 | negative | -inf | +| 1970-01-01T00:50:00 | positive | -inf | ++---------------------+----------+------+ + +-- Quantile too high. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(1.01, histogram_bucket); + ++---------------------+----------+-----+ +| ts | s | val | ++---------------------+----------+-----+ +| 1970-01-01T00:50:00 | negative | inf | +| 1970-01-01T00:50:00 | positive | inf | ++---------------------+----------+-----+ + +-- Quantile invalid. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(NaN, histogram_bucket); + ++---------------------+----------+-----+ +| ts | s | val | ++---------------------+----------+-----+ +| 1970-01-01T00:50:00 | negative | NaN | +| 1970-01-01T00:50:00 | positive | NaN | ++---------------------+----------+-----+ + +-- Quantile value in lowest bucket, which is positive. +tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="positive"}); + ++---------------------+----------+-----+ +| ts | s | val | ++---------------------+----------+-----+ +| 1970-01-01T00:50:00 | positive | 0.0 | ++---------------------+----------+-----+ + +-- Quantile value in lowest bucket, which is negative. +tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="negative"}); + ++---------------------+----------+------+ +| ts | s | val | ++---------------------+----------+------+ +| 1970-01-01T00:50:00 | negative | -0.2 | ++---------------------+----------+------+ + +-- Quantile value in highest bucket. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(1, histogram_bucket); + ++---------------------+----------+-----+ +| ts | s | val | ++---------------------+----------+-----+ +| 1970-01-01T00:50:00 | negative | 0.3 | +| 1970-01-01T00:50:00 | positive | 1.0 | ++---------------------+----------+-----+ + +-- Finally some useful quantiles. 
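+-- How these numbers arise: for the "positive" series the cumulative bucket
+-- counts are 50, 70, 110, 120 at le=0.1, .2, 1e0, +Inf. A 0.2 quantile targets
+-- rank 0.2 * 120 = 24, which falls into the lowest bucket; its lower bound is
+-- taken as 0, giving 0 + 0.1 * 24 / 50 = 0.048. Similarly, 0.5 targets rank 60
+-- in the (0.1, 0.2] bucket, giving 0.15, and 0.8 targets rank 96 in the
+-- (0.2, 1.0] bucket, giving 0.72. The "negative" series follows the same
+-- interpolation rules.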
+-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(0.2, histogram_bucket); + ++---------------------+----------+-------+ +| ts | s | val | ++---------------------+----------+-------+ +| 1970-01-01T00:50:00 | negative | -0.2 | +| 1970-01-01T00:50:00 | positive | 0.048 | ++---------------------+----------+-------+ + +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(0.5, histogram_bucket); + ++---------------------+----------+----------------------+ +| ts | s | val | ++---------------------+----------+----------------------+ +| 1970-01-01T00:50:00 | negative | -0.15000000000000002 | +| 1970-01-01T00:50:00 | positive | 0.15000000000000002 | ++---------------------+----------+----------------------+ + +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(0.8, histogram_bucket); + ++---------------------+----------+------+ +| ts | s | val | ++---------------------+----------+------+ +| 1970-01-01T00:50:00 | negative | 0.3 | +| 1970-01-01T00:50:00 | positive | 0.72 | ++---------------------+----------+------+ + +-- More realistic with rates. +-- This case doesn't contains value because other point are not inserted. +-- quantile with rate is covered in other cases +tql eval (3000, 3000, '1s') histogram_quantile(0.2, rate(histogram_bucket[5m])); + +++ +++ + +drop table histogram_bucket; + +Affected Rows: 0 + +-- cases related to `testhistogram2_bucket` +create table histogram2_bucket ( + ts timestamp time index, + le string, + val double, + primary key (le), +); + +Affected Rows: 0 + +insert into histogram2_bucket values + (0, "0", 0), + (300000, "0", 0), + (600000, "0", 0), + (900000, "0", 0), + (1200000, "0", 0), + (1500000, "0", 0), + (1800000, "0", 0), + (2100000, "0", 0), + (2400000, "0", 0), + (2700000, "0", 0), + (0, "2", 1), + (300000, "2", 2), + (600000, "2", 3), + (900000, "2", 4), + (1200000, "2", 5), + (1500000, "2", 6), + (1800000, "2", 7), + (2100000, "2", 8), + (2400000, "2", 9), + (2700000, "2", 10), + (0, "4", 2), + (300000, "4", 4), + (600000, "4", 6), + (900000, "4", 8), + (1200000, "4", 10), + (1500000, "4", 12), + (1800000, "4", 14), + (2100000, "4", 16), + (2400000, "4", 18), + (2700000, "4", 20), + (0, "6", 3), + (300000, "6", 6), + (600000, "6", 9), + (900000, "6", 12), + (1200000, "6", 15), + (1500000, "6", 18), + (1800000, "6", 21), + (2100000, "6", 24), + (2400000, "6", 27), + (2700000, "6", 30), + (0, "+Inf", 3), + (300000, "+Inf", 6), + (600000, "+Inf", 9), + (900000, "+Inf", 12), + (1200000, "+Inf", 15), + (1500000, "+Inf", 18), + (1800000, "+Inf", 21), + (2100000, "+Inf", 24), + (2400000, "+Inf", 27), + (2700000, "+Inf", 30); + +Affected Rows: 50 + +-- Want results exactly in the middle of the bucket. 
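+-- How these numbers arise: at t=420s the sample in effect is the one written
+-- at t=300s, with cumulative counts 0, 2, 4, 6, 6 at le=0, 2, 4, 6, +Inf.
+-- A 0.166 quantile targets rank 0.166 * 6 = 0.996, which falls into the
+-- (0, 2] bucket and interpolates to 0 + 2 * 0.996 / 2 = 0.996; 0.5 targets
+-- rank 3 in (2, 4], giving 3.0, and 0.833 targets rank 4.998 in (4, 6],
+-- giving ~4.998.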
+tql eval (420, 420, '1s') histogram_quantile(0.166, histogram2_bucket); + ++---------------------+-------+ +| ts | val | ++---------------------+-------+ +| 1970-01-01T00:07:00 | 0.996 | ++---------------------+-------+ + +tql eval (420, 420, '1s') histogram_quantile(0.5, histogram2_bucket); + ++---------------------+-----+ +| ts | val | ++---------------------+-----+ +| 1970-01-01T00:07:00 | 3.0 | ++---------------------+-----+ + +tql eval (420, 420, '1s') histogram_quantile(0.833, histogram2_bucket); + ++---------------------+-------------------+ +| ts | val | ++---------------------+-------------------+ +| 1970-01-01T00:07:00 | 4.997999999999999 | ++---------------------+-------------------+ + +tql eval (2820, 2820, '1s') histogram_quantile(0.166, rate(histogram2_bucket[15m])); + ++---------------------+----------------------------+ +| ts | prom_rate(ts_range,val,ts) | ++---------------------+----------------------------+ +| 1970-01-01T00:47:00 | 0.996 | ++---------------------+----------------------------+ + +tql eval (2820, 2820, '1s') histogram_quantile(0.5, rate(histogram2_bucket[15m])); + ++---------------------+----------------------------+ +| ts | prom_rate(ts_range,val,ts) | ++---------------------+----------------------------+ +| 1970-01-01T00:47:00 | 3.0 | ++---------------------+----------------------------+ + +tql eval (2820, 2820, '1s') histogram_quantile(0.833, rate(histogram2_bucket[15m])); + ++---------------------+----------------------------+ +| ts | prom_rate(ts_range,val,ts) | ++---------------------+----------------------------+ +| 1970-01-01T00:47:00 | 4.998 | ++---------------------+----------------------------+ + +drop table histogram2_bucket; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/promql/simple_histogram.sql b/tests/cases/standalone/common/promql/simple_histogram.sql new file mode 100644 index 000000000000..2eb31670e5c1 --- /dev/null +++ b/tests/cases/standalone/common/promql/simple_histogram.sql @@ -0,0 +1,134 @@ +-- from prometheus/promql/testdata/histograms.test +-- cases related to metric `testhistogram_bucket` + +create table histogram_bucket ( + ts timestamp time index, + le string, + s string, + val double, + primary key (s, le), +); + +insert into histogram_bucket values + (3000000, "0.1", "positive", 50), + (3000000, ".2", "positive", 70), + (3000000, "1e0", "positive", 110), + (3000000, "+Inf", "positive", 120), + (3000000, "-.2", "negative", 10), + (3000000, "-0.1", "negative", 20), + (3000000, "0.3", "negative", 20), + (3000000, "+Inf", "negative", 30); + +-- Quantile too low. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(-0.1, histogram_bucket); + +-- Quantile too high. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(1.01, histogram_bucket); + +-- Quantile invalid. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(NaN, histogram_bucket); + +-- Quantile value in lowest bucket, which is positive. +tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="positive"}); + +-- Quantile value in lowest bucket, which is negative. +tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="negative"}); + +-- Quantile value in highest bucket. +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(1, histogram_bucket); + +-- Finally some useful quantiles. 
+-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(0.2, histogram_bucket); + +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(0.5, histogram_bucket); + +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') histogram_quantile(0.8, histogram_bucket); + +-- More realistic with rates. +-- This case doesn't contains value because other point are not inserted. +-- quantile with rate is covered in other cases +tql eval (3000, 3000, '1s') histogram_quantile(0.2, rate(histogram_bucket[5m])); + +drop table histogram_bucket; + +-- cases related to `testhistogram2_bucket` +create table histogram2_bucket ( + ts timestamp time index, + le string, + val double, + primary key (le), +); + +insert into histogram2_bucket values + (0, "0", 0), + (300000, "0", 0), + (600000, "0", 0), + (900000, "0", 0), + (1200000, "0", 0), + (1500000, "0", 0), + (1800000, "0", 0), + (2100000, "0", 0), + (2400000, "0", 0), + (2700000, "0", 0), + (0, "2", 1), + (300000, "2", 2), + (600000, "2", 3), + (900000, "2", 4), + (1200000, "2", 5), + (1500000, "2", 6), + (1800000, "2", 7), + (2100000, "2", 8), + (2400000, "2", 9), + (2700000, "2", 10), + (0, "4", 2), + (300000, "4", 4), + (600000, "4", 6), + (900000, "4", 8), + (1200000, "4", 10), + (1500000, "4", 12), + (1800000, "4", 14), + (2100000, "4", 16), + (2400000, "4", 18), + (2700000, "4", 20), + (0, "6", 3), + (300000, "6", 6), + (600000, "6", 9), + (900000, "6", 12), + (1200000, "6", 15), + (1500000, "6", 18), + (1800000, "6", 21), + (2100000, "6", 24), + (2400000, "6", 27), + (2700000, "6", 30), + (0, "+Inf", 3), + (300000, "+Inf", 6), + (600000, "+Inf", 9), + (900000, "+Inf", 12), + (1200000, "+Inf", 15), + (1500000, "+Inf", 18), + (1800000, "+Inf", 21), + (2100000, "+Inf", 24), + (2400000, "+Inf", 27), + (2700000, "+Inf", 30); + +-- Want results exactly in the middle of the bucket. +tql eval (420, 420, '1s') histogram_quantile(0.166, histogram2_bucket); + +tql eval (420, 420, '1s') histogram_quantile(0.5, histogram2_bucket); + +tql eval (420, 420, '1s') histogram_quantile(0.833, histogram2_bucket); + +tql eval (2820, 2820, '1s') histogram_quantile(0.166, rate(histogram2_bucket[15m])); + +tql eval (2820, 2820, '1s') histogram_quantile(0.5, rate(histogram2_bucket[15m])); + +tql eval (2820, 2820, '1s') histogram_quantile(0.833, rate(histogram2_bucket[15m])); + +drop table histogram2_bucket; diff --git a/tests/cases/standalone/common/range/error.result b/tests/cases/standalone/common/range/error.result index 289e52f23735..cf720ee9f1e3 100644 --- a/tests/cases/standalone/common/range/error.result +++ b/tests/cases/standalone/common/range/error.result @@ -37,7 +37,7 @@ Error: 2000(InvalidSyntax), sql parser error: Illegal Range select, no RANGE key SELECT min(val) RANGE '10s', max(val) FROM host ALIGN '5s'; -Error: 1003(Internal), No field named "MAX(host.val)". Valid fields are "MIN(host.val) RANGE 10s FILL NULL", host.ts, host.host. +Error: 3001(EngineExecuteQuery), No field named "MAX(host.val)". Valid fields are "MIN(host.val) RANGE 10s FILL NULL", host.ts, host.host. 
SELECT min(val) * 2 RANGE '10s' FROM host ALIGN '5s'; @@ -50,12 +50,12 @@ Error: 2000(InvalidSyntax), sql parser error: Can't use the RANGE keyword in Exp -- 2.2 no align param SELECT min(val) RANGE '5s' FROM host; -Error: 1003(Internal), Error during planning: Missing argument in range select query +Error: 3000(PlanQuery), Error during planning: Missing argument in range select query -- 2.3 type mismatch SELECT covar(ceil(val), floor(val)) RANGE '20s' FROM host ALIGN '10s'; -Error: 1003(Internal), Internal error: Unsupported data type Int64 for function ceil. +Error: 3001(EngineExecuteQuery), Internal error: Unsupported data type Int64 for function ceil. This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker -- 2.4 nest query @@ -71,11 +71,11 @@ Error: 2000(InvalidSyntax), Range Query: Window functions is not allowed in Rang -- 2.6 invalid fill SELECT min(val) RANGE '5s', min(val) RANGE '5s' FILL NULL FROM host ALIGN '5s'; -Error: 1003(Internal), Schema contains duplicate unqualified field name "MIN(host.val) RANGE 5s FILL NULL" +Error: 3001(EngineExecuteQuery), Schema contains duplicate unqualified field name "MIN(host.val) RANGE 5s FILL NULL" SELECT min(val) RANGE '5s' FROM host ALIGN '5s' FILL 3.0; -Error: 1003(Internal), Error during planning: 3.0 is not a valid fill option, fail to convert to a const value. { Arrow error: Cast error: Cannot cast string '3.0' to value of Int64 type } +Error: 3000(PlanQuery), Error during planning: 3.0 is not a valid fill option, fail to convert to a const value. { Arrow error: Cast error: Cannot cast string '3.0' to value of Int64 type } DROP TABLE host; diff --git a/tests/cases/standalone/common/select/prune.result b/tests/cases/standalone/common/select/prune.result new file mode 100644 index 000000000000..c6884bddccd5 --- /dev/null +++ b/tests/cases/standalone/common/select/prune.result @@ -0,0 +1,83 @@ +create table demo(ts timestamp time index, `value` double, host string,idc string, collector string, primary key(host, idc, collector)); + +Affected Rows: 0 + +insert into demo values(1,2,'test1', 'idc1', 'disk') ,(2,3,'test2', 'idc1', 'disk'), (3,4,'test3', 'idc2','memory'); + +Affected Rows: 3 + +select * from demo where host='test1'; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.001 | 2.0 | test1 | idc1 | disk | ++-------------------------+-------+-------+------+-----------+ + +select * from demo where host='test2'; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.002 | 3.0 | test2 | idc1 | disk | ++-------------------------+-------+-------+------+-----------+ + +select * from demo where host='test3'; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.003 | 4.0 | test3 | idc2 | memory | ++-------------------------+-------+-------+------+-----------+ + +select * from demo where host='test2' and idc='idc1'; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.002 | 3.0 | test2 | idc1 | disk | 
++-------------------------+-------+-------+------+-----------+ + +select * from demo where host='test2' and idc='idc1' and collector='disk'; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.002 | 3.0 | test2 | idc1 | disk | ++-------------------------+-------+-------+------+-----------+ + +select * from demo where host='test2' and idc='idc2'; + +++ +++ + +select * from demo where host='test3' and idc>'idc1'; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.003 | 4.0 | test3 | idc2 | memory | ++-------------------------+-------+-------+------+-----------+ + +select * from demo where idc='idc1' order by ts; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.001 | 2.0 | test1 | idc1 | disk | +| 1970-01-01T00:00:00.002 | 3.0 | test2 | idc1 | disk | ++-------------------------+-------+-------+------+-----------+ + +select * from demo where collector='disk' order by ts; + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.001 | 2.0 | test1 | idc1 | disk | +| 1970-01-01T00:00:00.002 | 3.0 | test2 | idc1 | disk | ++-------------------------+-------+-------+------+-----------+ + +drop table demo; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/select/prune.sql b/tests/cases/standalone/common/select/prune.sql new file mode 100644 index 000000000000..fb007105ed58 --- /dev/null +++ b/tests/cases/standalone/common/select/prune.sql @@ -0,0 +1,23 @@ +create table demo(ts timestamp time index, `value` double, host string,idc string, collector string, primary key(host, idc, collector)); + +insert into demo values(1,2,'test1', 'idc1', 'disk') ,(2,3,'test2', 'idc1', 'disk'), (3,4,'test3', 'idc2','memory'); + +select * from demo where host='test1'; + +select * from demo where host='test2'; + +select * from demo where host='test3'; + +select * from demo where host='test2' and idc='idc1'; + +select * from demo where host='test2' and idc='idc1' and collector='disk'; + +select * from demo where host='test2' and idc='idc2'; + +select * from demo where host='test3' and idc>'idc1'; + +select * from demo where idc='idc1' order by ts; + +select * from demo where collector='disk' order by ts; + +drop table demo; diff --git a/tests/cases/standalone/common/types/interval/interval.result b/tests/cases/standalone/common/types/interval/interval.result index 00203179abb1..3bb9f46ea5ff 100644 --- a/tests/cases/standalone/common/types/interval/interval.result +++ b/tests/cases/standalone/common/types/interval/interval.result @@ -249,12 +249,12 @@ SELECT TIMESTAMP '1992-09-20 11:30:00.123456' - interval_value as new_value from -- Interval type does not support aggregation functions. SELECT MIN(interval_value) from intervals; -Error: 1003(Internal), Internal error: Min/Max accumulator not implemented for type Interval(MonthDayNano). +Error: 3001(EngineExecuteQuery), Internal error: Min/Max accumulator not implemented for type Interval(MonthDayNano). 
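The prune cases above exercise filters on the primary-key (tag) columns host, idc, and collector, including an equality combination that matches nothing and a range predicate (idc > 'idc1'); whatever pruning the storage layer applies on those columns has to stay conservative so that no matching row is skipped. Here is a minimal Rust sketch of min/max-statistics pruning of that conservative kind, illustrative only and not GreptimeDB's implementation.

// Conservative pruning: a unit (e.g. an SST or row group) may be skipped only
// when its column statistics prove that no row can satisfy the predicate.
struct ColumnStats<'a> {
    min: &'a str,
    max: &'a str,
}

/// Returns true if the unit may contain rows with `column == value`.
fn may_contain_eq(stats: &ColumnStats<'_>, value: &str) -> bool {
    stats.min <= value && value <= stats.max
}

/// Returns true if the unit may contain rows with `column > value`.
fn may_contain_gt(stats: &ColumnStats<'_>, value: &str) -> bool {
    stats.max > value
}

fn main() {
    // Example stats for a unit holding the three inserted demo rows.
    let idc = ColumnStats { min: "idc1", max: "idc2" };

    // `idc = 'idc2'` cannot be ruled out, so the unit must be scanned...
    assert!(may_contain_eq(&idc, "idc2"));
    // ...and `idc > 'idc1'` (the range case above) cannot be ruled out either.
    assert!(may_contain_gt(&idc, "idc1"));
    // A predicate outside the min/max range can be pruned safely.
    assert!(!may_contain_eq(&idc, "idc9"));
}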
This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker SELECT MAX(interval_value) from intervals; -Error: 1003(Internal), Internal error: Min/Max accumulator not implemented for type Interval(MonthDayNano). +Error: 3001(EngineExecuteQuery), Internal error: Min/Max accumulator not implemented for type Interval(MonthDayNano). This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker SELECT SUM(interval_value) from intervals; diff --git a/tests/cases/standalone/common/types/timestamp/timestamp_precision.result b/tests/cases/standalone/common/types/timestamp/timestamp_precision.result index 6864cca4b716..03126f280608 100644 --- a/tests/cases/standalone/common/types/timestamp/timestamp_precision.result +++ b/tests/cases/standalone/common/types/timestamp/timestamp_precision.result @@ -23,7 +23,7 @@ SELECT CAST(sec AS VARCHAR), CAST(msec AS VARCHAR), CAST(micros AS VARCHAR), CAS SELECT EXTRACT(MICROSECONDS FROM sec), EXTRACT(MICROSECONDS FROM msec), EXTRACT(MICROSECONDS FROM micros), EXTRACT(MICROSECONDS FROM nanos) FROM ts_precision; -Error: 1003(Internal), Execution error: Date part 'MICROSECONDS' not supported +Error: 3001(EngineExecuteQuery), Execution error: Date part 'MICROSECONDS' not supported -- we only support precisions 0, 3, 6, and 9 -- any other precision is rounded up (e.g. 1/2 -> 3, 4/5 -> 6, 7/8 -> 9) diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template index a7ce09693fe7..bda5b8cb365e 100644 --- a/tests/conf/datanode-test.toml.template +++ b/tests/conf/datanode-test.toml.template @@ -4,12 +4,11 @@ require_lease_before_startup = true rpc_addr = '127.0.0.1:4100' rpc_hostname = '127.0.0.1' rpc_runtime_size = 8 -require_lease_before_startup = true [wal] file_size = '1GB' purge_interval = '10m' -purge_threshold = '50GB' +purge_threshold = '10GB' read_batch_size = 128 sync_write = false diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template index f81f73b9a24e..b0b20809dace 100644 --- a/tests/conf/standalone-test.toml.template +++ b/tests/conf/standalone-test.toml.template @@ -5,7 +5,7 @@ require_lease_before_startup = true [wal] file_size = '1GB' purge_interval = '10m' -purge_threshold = '50GB' +purge_threshold = '10GB' read_batch_size = 128 sync_write = false diff --git a/tests/runner/src/env.rs b/tests/runner/src/env.rs index 8455252e4daa..498e08ac7795 100644 --- a/tests/runner/src/env.rs +++ b/tests/runner/src/env.rs @@ -235,6 +235,8 @@ impl Env { args.push(format!("--http-addr=127.0.0.1:430{id}")); args.push(format!("--data-home={}", data_home.display())); args.push(format!("--node-id={id}")); + args.push("-c".to_string()); + args.push(self.generate_config_file(subcommand, db_ctx)); args.push("--metasrv-addr=127.0.0.1:3002".to_string()); (args, format!("127.0.0.1:410{id}")) }
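With the runner change above, each datanode started by the sqlness test runner now receives an explicit `-c <config>` argument pointing at a file generated from the updated datanode-test.toml.template (which, among other tweaks, lowers the WAL purge_threshold from 50GB to 10GB). A rough, hypothetical illustration of what the argument list ends up looking like for node id 1 follows; the concrete paths are invented for this sketch and in practice come from data_home and generate_config_file.

fn main() {
    let id = 1;
    // Both paths below are placeholders for this illustration.
    let data_home = "/tmp/sqlness/datanode-1";
    let config_file = "/tmp/sqlness/datanode-1.toml"; // stands in for the generate_config_file result
    let args = vec![
        format!("--http-addr=127.0.0.1:430{id}"),
        format!("--data-home={data_home}"),
        format!("--node-id={id}"),
        "-c".to_string(),
        config_file.to_string(),
        "--metasrv-addr=127.0.0.1:3002".to_string(),
    ];
    // The config file is passed before --metasrv-addr, matching the hunk above.
    println!("{}", args.join(" "));
}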