diff --git a/.asf.yaml b/.asf.yaml
index bd063d4bbf4a..366c719597aa 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -38,7 +38,7 @@ github:
   features:
     issues: true
   protected_branches:
-    master:
+    main:
       required_status_checks:
         # require branches to be up-to-date before merging
         strict: true
diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml
index aa1d1d9c14da..20da777ec0e5 100644
--- a/.github/actions/setup-builder/action.yaml
+++ b/.github/actions/setup-builder/action.yaml
@@ -30,7 +30,7 @@ runs:
   using: "composite"
   steps:
     - name: Cache Cargo
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      with:
        # these represent dependencies downloaded by cargo
        # and thus do not depend on the OS, arch nor rust version.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index ffde5378da93..b22c01f8a1b9 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,14 +5,14 @@ updates:
    schedule:
      interval: daily
    open-pull-requests-limit: 10
-    target-branch: master
+    target-branch: main
    labels: [ auto-dependencies, arrow ]
  - package-ecosystem: cargo
    directory: "/object_store"
    schedule:
      interval: daily
    open-pull-requests-limit: 10
-    target-branch: master
+    target-branch: main
    labels: [ auto-dependencies, object_store ]
  - package-ecosystem: "github-actions"
    directory: "/"
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 679ccc956a20..08bdf123f4d6 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -20,6 +20,6 @@ The CI is structured so most tests are run in specific workflows:
 `arrow.yml` for `arrow`, `parquet.yml` for `parquet` and so on.
-The basic idea is to run all tests on pushes to master (to ensure we
-keep master green) but run only the individual workflows on PRs that
+The basic idea is to run all tests on pushes to main (to ensure we
+keep main green) but run only the individual workflows on PRs that
 change files that could affect them.
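The trigger pattern that README describes repeats across the per-crate workflow diffs below. A minimal sketch of that pattern, with an illustrative path filter rather than the exact lists used by the arrow-rs workflows:

```yaml
# Run the full workflow on every push to main (keeps main green),
# but for pull requests only run it when files that could affect
# this crate change. The paths below are illustrative placeholders.
on:
  push:
    branches:
      - main
  pull_request:
    paths:
      - .github/**
      - arrow/**
```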
diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
index d3b2526740fa..daf38f2523fc 100644
--- a/.github/workflows/arrow.yml
+++ b/.github/workflows/arrow.yml
@@ -26,7 +26,7 @@ on:
   # always trigger
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - .github/**
@@ -61,39 +61,39 @@ jobs:
           submodules: true
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
-      - name: Test arrow-buffer with all features
+      - name: Test arrow-buffer
         run: cargo test -p arrow-buffer --all-features
-      - name: Test arrow-data with all features
+      - name: Test arrow-data
         run: cargo test -p arrow-data --all-features
-      - name: Test arrow-schema with all features
+      - name: Test arrow-schema
         run: cargo test -p arrow-schema --all-features
-      - name: Test arrow-array with all features
+      - name: Test arrow-array
         run: cargo test -p arrow-array --all-features
-      - name: Test arrow-select with all features
+      - name: Test arrow-select
         run: cargo test -p arrow-select --all-features
-      - name: Test arrow-cast with all features
+      - name: Test arrow-cast
         run: cargo test -p arrow-cast --all-features
-      - name: Test arrow-ipc with all features
+      - name: Test arrow-ipc
         run: cargo test -p arrow-ipc --all-features
-      - name: Test arrow-csv with all features
+      - name: Test arrow-csv
         run: cargo test -p arrow-csv --all-features
-      - name: Test arrow-json with all features
+      - name: Test arrow-json
         run: cargo test -p arrow-json --all-features
-      - name: Test arrow-avro with all features
+      - name: Test arrow-avro
         run: cargo test -p arrow-avro --all-features
-      - name: Test arrow-string with all features
+      - name: Test arrow-string
         run: cargo test -p arrow-string --all-features
-      - name: Test arrow-ord with all features
+      - name: Test arrow-ord
         run: cargo test -p arrow-ord --all-features
-      - name: Test arrow-arith with all features
+      - name: Test arrow-arith
         run: cargo test -p arrow-arith --all-features
-      - name: Test arrow-row with all features
+      - name: Test arrow-row
         run: cargo test -p arrow-row --all-features
-      - name: Test arrow-integration-test with all features
+      - name: Test arrow-integration-test
         run: cargo test -p arrow-integration-test --all-features
       - name: Test arrow with default features
         run: cargo test -p arrow
-      - name: Test arrow with all features except pyarrow
+      - name: Test arrow except pyarrow
         run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz
       - name: Run examples
         run: |
@@ -163,37 +163,139 @@ jobs:
         uses: ./.github/actions/setup-builder
       - name: Setup Clippy
         run: rustup component add clippy
-      - name: Clippy arrow-buffer with all features
-        run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-data with all features
-        run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-schema with all features
-        run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-array with all features
-        run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-select with all features
-        run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-cast with all features
-        run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-ipc with all features
-        run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-csv with all features
-        run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-json with all features
-        run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-avro with all features
-        run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-string with all features
-        run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-ord with all features
-        run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-arith with all features
-        run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-row with all features
-        run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings
-      - name: Clippy arrow with all features
-        run: cargo clippy -p arrow --all-features --all-targets -- -D warnings
-      - name: Clippy arrow-integration-test with all features
-        run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings
-      - name: Clippy arrow-integration-testing with all features
-        run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings
+      - name: Clippy arrow-buffer
+        run: |
+          mod=arrow-buffer
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-data
+        run: |
+          mod=arrow-data
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-schema
+        run: |
+          mod=arrow-schema
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-array
+        run: |
+          mod=arrow-array
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-select
+        run: |
+          mod=arrow-select
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-cast
+        run: |
+          mod=arrow-cast
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-ipc
+        run: |
+          mod=arrow-ipc
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-csv
+        run: |
+          mod=arrow-csv
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-json
+        run: |
+          mod=arrow-json
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-avro
+        run: |
+          mod=arrow-avro
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-string
+        run: |
+          mod=arrow-string
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-ord
+        run: |
+          mod=arrow-ord
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-arith
+        run: |
+          mod=arrow-arith
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-row
+        run: |
+          mod=arrow-row
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow
+        run: |
+          mod=arrow
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-integration-test
+        run: |
+          mod=arrow-integration-test
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
+      - name: Clippy arrow-integration-testing
+        run: |
+          mod=arrow-integration-testing
+          cargo clippy -p "$mod" --all-targets --all-features -- -D warnings
+          # Dependency checks excluding tests & benches.
+          cargo clippy -p "$mod" -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies
+          cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies
diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml
index 242e0f2a3b0d..79627448ca40 100644
--- a/.github/workflows/arrow_flight.yml
+++ b/.github/workflows/arrow_flight.yml
@@ -23,11 +23,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - arrow-array/**
diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index 2c1dcdfd2100..e6254ea24a58 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -21,11 +21,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - '**/Cargo.toml'
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 2026e257ab29..b28e8c20cfe7 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -21,11 +21,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs and changes to master
+# trigger for all PRs and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
 env:
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 08d287bcceb2..d6ec0622f6ed 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -21,11 +21,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs and changes to master
+# trigger for all PRs and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
 jobs:
@@ -70,8 +70,8 @@ jobs:
           path: target/doc
   deploy:
-    # Only deploy if a push to master
-    if: github.ref_name == 'master' && github.event_name == 'push'
+    # Only deploy if a push to main
+    if: github.ref_name == 'main' && github.event_name == 'push'
     needs: docs
     permissions:
       contents: write
@@ -90,7 +90,7 @@ jobs:
           cp .asf.yaml ./website/build/.asf.yaml
       - name: Deploy to gh-pages
        uses: peaceiris/actions-gh-pages@v4.0.0
-        if: github.event_name == 'push' && github.ref_name == 'master'
+        if: github.event_name == 'push' && github.ref_name == 'main'
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: website/build
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 868729a168e8..9b23b1b5ad2e 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -21,11 +21,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - .github/**
diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml
index 19b432121b6f..ce67546a104b 100644
--- a/.github/workflows/miri.yaml
+++ b/.github/workflows/miri.yaml
@@ -21,11 +21,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - .github/**
diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml
index 1857b330326a..93f809aaabd4 100644
--- a/.github/workflows/object_store.yml
+++ b/.github/workflows/object_store.yml
@@ -23,11 +23,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - object_store/**
@@ -138,9 +138,10 @@ jobs:
       - name: Setup LocalStack (AWS emulation)
        run: |
-          echo "LOCALSTACK_CONTAINER=$(docker run -d -p 4566:4566 localstack/localstack:3.8.1)" >> $GITHUB_ENV
+          echo "LOCALSTACK_CONTAINER=$(docker run -d -p 4566:4566 localstack/localstack:4.0.3)" >> $GITHUB_ENV
          echo "EC2_METADATA_CONTAINER=$(docker run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2)" >> $GITHUB_ENV
          aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket
+          aws --endpoint-url=http://localhost:4566 s3api create-bucket --bucket test-object-lock --object-lock-enabled-for-bucket
          aws --endpoint-url=http://localhost:4566 dynamodb create-table --table-name test-table --key-schema AttributeName=path,KeyType=HASH AttributeName=etag,KeyType=RANGE --attribute-definitions AttributeName=path,AttributeType=S AttributeName=etag,AttributeType=S --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5
          KMS_KEY=$(aws --endpoint-url=http://localhost:4566 kms create-key --description "test key")
@@ -164,7 +165,7 @@ jobs:
      - name: Run object_store tests (AWS native conditional put)
        run: cargo test --features=aws
        env:
-          AWS_CONDITIONAL_PUT: etag-put-if-not-exists
+          AWS_CONDITIONAL_PUT: etag
          AWS_COPY_IF_NOT_EXISTS: multipart
      - name: GCS Output
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index a4e654892662..2269950fd235 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -23,11 +23,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - arrow/**
diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml
index d8b02f73a8aa..17aec724a820 100644
--- a/.github/workflows/parquet_derive.yml
+++ b/.github/workflows/parquet_derive.yml
@@ -23,11 +23,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs that touch certain files and changes to master
+# trigger for all PRs that touch certain files and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
     paths:
       - parquet/**
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 1b65c5057de1..ff5040fd2947 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -22,11 +22,11 @@ concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
   cancel-in-progress: true
-# trigger for all PRs and changes to master
+# trigger for all PRs and changes to main
 on:
   push:
     branches:
-      - master
+      - main
   pull_request:
 jobs:
diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md
index 5b3a3255ffcd..376da6277114 100644
--- a/CHANGELOG-old.md
+++ b/CHANGELOG-old.md
@@ -19,6 +19,69 @@
 # Historical Changelog
+## [53.2.0](https://github.com/apache/arrow-rs/tree/53.2.0) (2024-10-21)
+
+[Full Changelog](https://github.com/apache/arrow-rs/compare/53.1.0...53.2.0)
+
+**Implemented enhancements:**
+
+- Implement arrow\_json encoder for Decimal128 & Decimal256 DataTypes [\#6605](https://github.com/apache/arrow-rs/issues/6605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support DataType::FixedSizeList in make\_builder within struct\_builder.rs [\#6594](https://github.com/apache/arrow-rs/issues/6594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support DataType::Dictionary in `make_builder` within struct\_builder.rs [\#6589](https://github.com/apache/arrow-rs/issues/6589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Interval parsing from string - accept "mon" and "mons" token [\#6548](https://github.com/apache/arrow-rs/issues/6548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `AsyncArrowWriter` API to get the total size of a written parquet file [\#6530](https://github.com/apache/arrow-rs/issues/6530) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- `append_many` for Dictionary builders [\#6529](https://github.com/apache/arrow-rs/issues/6529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Missing tonic `GRPC_STATUS` with tonic 0.12.1 [\#6515](https://github.com/apache/arrow-rs/issues/6515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Add example of how to use parquet metadata reader APIs for a local cache [\#6504](https://github.com/apache/arrow-rs/issues/6504) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Remove reliance on `raw-entry` feature of Hashbrown [\#6498](https://github.com/apache/arrow-rs/issues/6498) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Improve page index metadata loading in `SerializedFileReader::new_with_options` [\#6491](https://github.com/apache/arrow-rs/issues/6491) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Release arrow-rs / parquet minor version `53.1.0` \(October 2024\) [\#6340](https://github.com/apache/arrow-rs/issues/6340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+
+**Fixed bugs:**
+
+- Compilation fail where `c_char = u8` [\#6571](https://github.com/apache/arrow-rs/issues/6571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Arrow flight CI test failing on `master` [\#6568](https://github.com/apache/arrow-rs/issues/6568) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+
+**Documentation updates:**
+
+- Minor: Document SIMD rationale and tips [\#6554](https://github.com/apache/arrow-rs/pull/6554) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+
+**Closed issues:**
+
+- Casting to and from unions [\#6247](https://github.com/apache/arrow-rs/issues/6247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+
+**Merged pull requests:**
+
+- Minor: more comments for `RecordBatch.get_array_memory_size()` [\#6607](https://github.com/apache/arrow-rs/pull/6607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([2010YOUY01](https://github.com/2010YOUY01))
+- Implement arrow\_json encoder for Decimal128 & Decimal256 [\#6606](https://github.com/apache/arrow-rs/pull/6606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phillipleblanc](https://github.com/phillipleblanc))
+- Add support for building FixedSizeListBuilder in struct\_builder's mak… [\#6595](https://github.com/apache/arrow-rs/pull/6595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszlim](https://github.com/kszlim))
+- Add limited support for dictionary builders in `make_builders` for stru… [\#6593](https://github.com/apache/arrow-rs/pull/6593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszlim](https://github.com/kszlim))
+- Fix CI with new valid certificates and add script for future usage [\#6585](https://github.com/apache/arrow-rs/pull/6585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime))
+- Update proc-macro2 requirement from =1.0.87 to =1.0.88 [\#6579](https://github.com/apache/arrow-rs/pull/6579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Fix clippy complaints [\#6573](https://github.com/apache/arrow-rs/pull/6573) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime))
+- Use c\_char instead of i8 to compile on platforms where c\_char = u8 [\#6572](https://github.com/apache/arrow-rs/pull/6572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime))
+- Bump pyspark from 3.3.1 to 3.3.2 in /parquet/pytest [\#6564](https://github.com/apache/arrow-rs/pull/6564) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- `unsafe` improvements [\#6551](https://github.com/apache/arrow-rs/pull/6551) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ssbr](https://github.com/ssbr))
+- Update README.md [\#6550](https://github.com/apache/arrow-rs/pull/6550) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Abdullahsab3](https://github.com/Abdullahsab3))
+- Fix string '0' cast to decimal with scale 0 [\#6547](https://github.com/apache/arrow-rs/pull/6547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Add finish to `AsyncArrowWriter::finish` [\#6543](https://github.com/apache/arrow-rs/pull/6543) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
+- Add append\_nulls to dictionary builders [\#6542](https://github.com/apache/arrow-rs/pull/6542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adriangb](https://github.com/adriangb))
+- Improve UnionArray::is\_nullable [\#6540](https://github.com/apache/arrow-rs/pull/6540) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Allow to read parquet binary column as UTF8 type [\#6539](https://github.com/apache/arrow-rs/pull/6539) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([goldmedal](https://github.com/goldmedal))
+- Use HashTable instead of raw\_entry\_mut [\#6537](https://github.com/apache/arrow-rs/pull/6537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Add append\_many to dictionary arrays to allow adding repeated values [\#6534](https://github.com/apache/arrow-rs/pull/6534) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adriangb](https://github.com/adriangb))
+- Adds documentation and example recommending Vec\ over ChunkedArray [\#6527](https://github.com/apache/arrow-rs/pull/6527) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([efredine](https://github.com/efredine))
+- Update proc-macro2 requirement from =1.0.86 to =1.0.87 [\#6526](https://github.com/apache/arrow-rs/pull/6526) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Add `ColumnChunkMetadataBuilder` clear APIs [\#6523](https://github.com/apache/arrow-rs/pull/6523) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
+- Update sysinfo requirement from 0.31.2 to 0.32.0 [\#6521](https://github.com/apache/arrow-rs/pull/6521) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Update Tonic to 0.12.3 [\#6517](https://github.com/apache/arrow-rs/pull/6517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([cisaacson](https://github.com/cisaacson))
+- Detect missing page indexes while reading Parquet metadata [\#6507](https://github.com/apache/arrow-rs/pull/6507) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
+- Use ParquetMetaDataReader to load page indexes in `SerializedFileReader::new_with_options` [\#6506](https://github.com/apache/arrow-rs/pull/6506) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
+- Improve parquet `MetadataFetch` and `AsyncFileReader` docs [\#6505](https://github.com/apache/arrow-rs/pull/6505) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
+- fix arrow-json encoding with dictionary including nulls [\#6503](https://github.com/apache/arrow-rs/pull/6503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([samuelcolvin](https://github.com/samuelcolvin))
+- Update brotli requirement from 6.0 to 7.0 [\#6499](https://github.com/apache/arrow-rs/pull/6499) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Benchmark both scenarios, with records skipped and without skipping, for delta-bin-packed primitive arrays with half nulls. [\#6489](https://github.com/apache/arrow-rs/pull/6489) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([wiedld](https://github.com/wiedld))
+- Add round trip tests for reading/writing parquet metadata [\#6463](https://github.com/apache/arrow-rs/pull/6463) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
 ## [53.1.0](https://github.com/apache/arrow-rs/tree/53.1.0) (2024-10-02)
 [Full Changelog](https://github.com/apache/arrow-rs/compare/53.0.0...53.1.0)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fdf9b6dd95c..3b729360608b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,69 +19,101 @@
 # Changelog
-## [53.2.0](https://github.com/apache/arrow-rs/tree/53.2.0) (2024-10-21)
+## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17)
-[Full Changelog](https://github.com/apache/arrow-rs/compare/53.1.0...53.2.0)
+[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0)
 **Implemented enhancements:**
-- Implement arrow\_json encoder for Decimal128 & Decimal256 DataTypes [\#6605](https://github.com/apache/arrow-rs/issues/6605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Support DataType::FixedSizeList in make\_builder within struct\_builder.rs [\#6594](https://github.com/apache/arrow-rs/issues/6594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Support DataType::Dictionary in `make_builder` within struct\_builder.rs [\#6589](https://github.com/apache/arrow-rs/issues/6589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Interval parsing from string - accept "mon" and "mons" token [\#6548](https://github.com/apache/arrow-rs/issues/6548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- `AsyncArrowWriter` API to get the total size of a written parquet file [\#6530](https://github.com/apache/arrow-rs/issues/6530) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- `append_many` for Dictionary builders [\#6529](https://github.com/apache/arrow-rs/issues/6529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Missing tonic `GRPC_STATUS` with tonic 0.12.1 [\#6515](https://github.com/apache/arrow-rs/issues/6515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
-- Add example of how to use parquet metadata reader APIs for a local cache [\#6504](https://github.com/apache/arrow-rs/issues/6504) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Remove reliance on `raw-entry` feature of Hashbrown [\#6498](https://github.com/apache/arrow-rs/issues/6498) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
-- Improve page index metadata loading in `SerializedFileReader::new_with_options` [\#6491](https://github.com/apache/arrow-rs/issues/6491) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Release arrow-rs / parquet minor version `53.1.0` \(October 2024\) [\#6340](https://github.com/apache/arrow-rs/issues/6340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555)
+- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waynexia](https://github.com/waynexia))
+- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker))
 **Fixed bugs:**
-- Compilation fail where `c_char = u8` [\#6571](https://github.com/apache/arrow-rs/issues/6571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Arrow flight CI test failing on `master` [\#6568](https://github.com/apache/arrow-rs/issues/6568) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715)
+- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- DictionaryHandling does not recurse into Map fields [\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Array writer output empty when no record is written [\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
 **Documentation updates:**
-- Minor: Document SIMD rationale and tips [\#6554](https://github.com/apache/arrow-rs/pull/6554) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+
+**Performance improvements:**
+
+- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan))
 **Closed issues:**
-- Casting to and from unions [\#6247](https://github.com/apache/arrow-rs/issues/6247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
 **Merged pull requests:**
-- Minor: more comments for `RecordBatch.get_array_memory_size()` [\#6607](https://github.com/apache/arrow-rs/pull/6607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([2010YOUY01](https://github.com/2010YOUY01))
-- Implement arrow\_json encoder for Decimal128 & Decimal256 [\#6606](https://github.com/apache/arrow-rs/pull/6606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([phillipleblanc](https://github.com/phillipleblanc))
-- Add support for building FixedSizeListBuilder in struct\_builder's mak… [\#6595](https://github.com/apache/arrow-rs/pull/6595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszlim](https://github.com/kszlim))
-- Add limited support for dictionary builders in `make_builders` for stru… [\#6593](https://github.com/apache/arrow-rs/pull/6593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszlim](https://github.com/kszlim))
-- Fix CI with new valid certificates and add script for future usage [\#6585](https://github.com/apache/arrow-rs/pull/6585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime))
-- Update proc-macro2 requirement from =1.0.87 to =1.0.88 [\#6579](https://github.com/apache/arrow-rs/pull/6579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
-- Fix clippy complaints [\#6573](https://github.com/apache/arrow-rs/pull/6573) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime))
-- Use c\_char instead of i8 to compile on platforms where c\_char = u8 [\#6572](https://github.com/apache/arrow-rs/pull/6572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime))
-- Bump pyspark from 3.3.1 to 3.3.2 in /parquet/pytest [\#6564](https://github.com/apache/arrow-rs/pull/6564) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
-- `unsafe` improvements [\#6551](https://github.com/apache/arrow-rs/pull/6551) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ssbr](https://github.com/ssbr))
-- Update README.md [\#6550](https://github.com/apache/arrow-rs/pull/6550) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Abdullahsab3](https://github.com/Abdullahsab3))
-- Fix string '0' cast to decimal with scale 0 [\#6547](https://github.com/apache/arrow-rs/pull/6547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
-- Add finish to `AsyncArrowWriter::finish` [\#6543](https://github.com/apache/arrow-rs/pull/6543) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
-- Add append\_nulls to dictionary builders [\#6542](https://github.com/apache/arrow-rs/pull/6542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adriangb](https://github.com/adriangb))
-- Improve UnionArray::is\_nullable [\#6540](https://github.com/apache/arrow-rs/pull/6540) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
-- Allow to read parquet binary column as UTF8 type [\#6539](https://github.com/apache/arrow-rs/pull/6539) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([goldmedal](https://github.com/goldmedal))
-- Use HashTable instead of raw\_entry\_mut [\#6537](https://github.com/apache/arrow-rs/pull/6537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
-- Add append\_many to dictionary arrays to allow adding repeated values [\#6534](https://github.com/apache/arrow-rs/pull/6534) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adriangb](https://github.com/adriangb))
-- Adds documentation and example recommending Vec\ over ChunkedArray [\#6527](https://github.com/apache/arrow-rs/pull/6527) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([efredine](https://github.com/efredine))
-- Update proc-macro2 requirement from =1.0.86 to =1.0.87 [\#6526](https://github.com/apache/arrow-rs/pull/6526) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
-- Add `ColumnChunkMetadataBuilder` clear APIs [\#6523](https://github.com/apache/arrow-rs/pull/6523) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
-- Update sysinfo requirement from 0.31.2 to 0.32.0 [\#6521](https://github.com/apache/arrow-rs/pull/6521) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
-- Update Tonic to 0.12.3 [\#6517](https://github.com/apache/arrow-rs/pull/6517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([cisaacson](https://github.com/cisaacson))
-- Detect missing page indexes while reading Parquet metadata [\#6507](https://github.com/apache/arrow-rs/pull/6507) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
-- Use ParquetMetaDataReader to load page indexes in `SerializedFileReader::new_with_options` [\#6506](https://github.com/apache/arrow-rs/pull/6506) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
-- Improve parquet `MetadataFetch` and `AsyncFileReader` docs [\#6505](https://github.com/apache/arrow-rs/pull/6505) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
-- fix arrow-json encoding with dictionary including nulls [\#6503](https://github.com/apache/arrow-rs/pull/6503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([samuelcolvin](https://github.com/samuelcolvin))
-- Update brotli requirement from 6.0 to 7.0 [\#6499](https://github.com/apache/arrow-rs/pull/6499) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
-- Benchmark both scenarios, with records skipped and without skipping, for delta-bin-packed primitive arrays with half nulls. [\#6489](https://github.com/apache/arrow-rs/pull/6489) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([wiedld](https://github.com/wiedld))
-- Add round trip tests for reading/writing parquet metadata [\#6463](https://github.com/apache/arrow-rs/pull/6463) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
+- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya))
+- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime))
+- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi))
+- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365))
+- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz))
+- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3))
+- Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Fix LIKE with escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan))
+- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211))
+- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317))
+- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3))
+- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365))
+- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc))
+- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel))
+- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3))
+- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal))
+- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667))
+- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Improve test\_auth error message when contains\(\) fails [\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi))
+- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng))
+- Include offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun))
+- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease))
+- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum))
+- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm))
+- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3))
+- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc))
+- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum))
+- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
+- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime))
+- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao))
+- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan))
+- Make downcast macros hygenic \(\#6400\) [\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Fix arrow-json writer empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik))
+- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime))
+- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me))
+- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev))
+- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi))
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2dea0b2cca64..38236ee39125 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -138,7 +138,7 @@ cargo test
 cargo test -p arrow
 ```
-For some changes, you may want to run additional tests. You can find up-to-date information on the current CI tests in [.github/workflows](https://github.com/apache/arrow-rs/tree/master/.github/workflows). Here are some examples of additional tests you may want to run:
+For some changes, you may want to run additional tests. You can find up-to-date information on the current CI tests in [.github/workflows](https://github.com/apache/arrow-rs/tree/main/.github/workflows). Here are some examples of additional tests you may want to run:
 ```bash
 # run tests for the parquet crate
@@ -217,13 +217,13 @@ cargo bench -p arrow-cast --bench parse_time
 To set the baseline for your benchmarks, use the --save-baseline flag:
 ```bash
-git checkout master
+git checkout main
-cargo bench --bench parse_time -- --save-baseline master
+cargo bench --bench parse_time -- --save-baseline main
 git checkout feature
-cargo bench --bench parse_time -- --baseline master
+cargo bench --bench parse_time -- --baseline main
 ```
 ## Git Pre-Commit Hook
diff --git a/Cargo.toml b/Cargo.toml
index f210ae210012..375a4efac551 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,7 +62,7 @@ exclude = [
 ]
 [workspace.package]
-version = "53.2.0"
+version = "53.3.0"
 homepage = "https://github.com/apache/arrow-rs"
 repository = "https://github.com/apache/arrow-rs"
 authors = ["Apache Arrow "]
@@ -77,20 +77,20 @@ edition = "2021"
 rust-version = "1.62"
 [workspace.dependencies]
-arrow = { version = "53.2.0", path = "./arrow", default-features = false }
-arrow-arith = { version = "53.2.0", path = "./arrow-arith" }
-arrow-array = { version = "53.2.0", path = "./arrow-array" }
-arrow-buffer = { version = "53.2.0", path = "./arrow-buffer" }
-arrow-cast = { version = "53.2.0", path = "./arrow-cast" }
-arrow-csv = { version = "53.2.0", path = "./arrow-csv" }
-arrow-data = { version = "53.2.0", path = "./arrow-data" }
-arrow-ipc = { version = "53.2.0", path = "./arrow-ipc" }
-arrow-json = { version = "53.2.0", path = "./arrow-json" }
-arrow-ord = { version = "53.2.0", path = "./arrow-ord" }
-arrow-row = { version = "53.2.0", path = "./arrow-row" }
-arrow-schema = { version = "53.2.0", path = "./arrow-schema" }
-arrow-select = { version = "53.2.0", path = "./arrow-select" }
-arrow-string = { version = "53.2.0", path = "./arrow-string" }
-parquet = { version = "53.2.0", path = "./parquet", default-features = false }
+arrow = { version = "53.3.0", path = "./arrow", default-features = false }
+arrow-arith = { version = "53.3.0", path = "./arrow-arith" }
+arrow-array = { version = "53.3.0", path = "./arrow-array" }
+arrow-buffer = { version = "53.3.0", path = "./arrow-buffer" }
+arrow-cast = { version = "53.3.0", path = "./arrow-cast" }
+arrow-csv = { version = "53.3.0", path = "./arrow-csv" }
+arrow-data = { version = "53.3.0", path = "./arrow-data" }
+arrow-ipc = { version = "53.3.0", path = "./arrow-ipc" }
+arrow-json = { version = "53.3.0", path = "./arrow-json" }
+arrow-ord = { version = "53.3.0", path = "./arrow-ord" }
+arrow-row = { version = "53.3.0", path = "./arrow-row" }
+arrow-schema = { version = "53.3.0", path = "./arrow-schema" }
+arrow-select = { version = "53.3.0", path = "./arrow-select" }
+arrow-string = { version = "53.3.0", path = "./arrow-string" }
+parquet = { version = "53.3.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } diff --git a/README.md b/README.md index 98c0a6615d9d..57794b1d6a46 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@ # Native Rust implementation of Apache Arrow and Apache Parquet -[![Coverage Status](https://codecov.io/gh/apache/arrow-rs/rust/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow-rs?branch=master) - Welcome to the [Rust][rust] implementation of [Apache Arrow], the popular in-memory columnar format. This repo contains the following main components: @@ -58,7 +56,7 @@ breaking API changes) at most once a quarter, and release incremental minor versions in the intervening months. See [this ticket] for more details. To keep our maintenance burden down, we do regularly scheduled releases (major -and minor) from the `master` branch. How we handle PRs with breaking API changes +and minor) from the `main` branch. How we handle PRs with breaking API changes is described in the [contributing] guide. [contributing]: CONTRIBUTING.md#breaking-changes @@ -67,11 +65,11 @@ Planned Release Schedule | Approximate Date | Version | Notes | | ---------------- | -------- | --------------------------------------- | -| Sep 2024 | `53.0.0` | Major, potentially breaking API changes | -| Oct 2024 | `53.1.0` | Minor, NO breaking API changes | -| Oct 2024 | `53.2.0` | Minor, NO breaking API changes | | Nov 2024 | `53.3.0` | Minor, NO breaking API changes | | Dec 2024 | `54.0.0` | Major, potentially breaking API changes | +| Jan 2025 | `54.1.0` | Minor, NO breaking API changes | +| Feb 2025 | `54.2.0` | Minor, NO breaking API changes | +| Mar 2025 | `55.0.0` | Major, potentially breaking API changes | [this ticket]: https://github.com/apache/arrow-rs/issues/5368 [semantic versioning]: https://semver.org/ diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index d2ee0b9e2c72..66696df8aa04 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -39,7 +39,6 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } -half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-arith/LICENSE.txt b/arrow-arith/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-arith/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-arith/NOTICE.txt b/arrow-arith/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-arith/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index bb983e1225ac..9b3272abb617 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -18,14 +18,12 @@ //! 
Kernels for operating on [`PrimitiveArray`]s use arrow_array::builder::BufferBuilder; -use arrow_array::types::ArrowDictionaryKeyType; use arrow_array::*; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::ArrowNativeType; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::ArrowError; -use std::sync::Arc; /// See [`PrimitiveArray::unary`] pub fn unary(array: &PrimitiveArray, op: F) -> PrimitiveArray @@ -71,97 +69,6 @@ where array.try_unary_mut(op) } -/// A helper function that applies an infallible unary function to a dictionary array with primitive value type. -fn unary_dict(array: &DictionaryArray, op: F) -> Result -where - K: ArrowDictionaryKeyType + ArrowNumericType, - T: ArrowPrimitiveType, - F: Fn(T::Native) -> T::Native, -{ - let dict_values = array.values().as_any().downcast_ref().unwrap(); - let values = unary::(dict_values, op); - Ok(Arc::new(array.with_values(Arc::new(values)))) -} - -/// A helper function that applies a fallible unary function to a dictionary array with primitive value type. -fn try_unary_dict(array: &DictionaryArray, op: F) -> Result -where - K: ArrowDictionaryKeyType + ArrowNumericType, - T: ArrowPrimitiveType, - F: Fn(T::Native) -> Result, -{ - if !PrimitiveArray::::is_compatible(&array.value_type()) { - return Err(ArrowError::CastError(format!( - "Cannot perform the unary operation of type {} on dictionary array of value type {}", - T::DATA_TYPE, - array.value_type() - ))); - } - - let dict_values = array.values().as_any().downcast_ref().unwrap(); - let values = try_unary::(dict_values, op)?; - Ok(Arc::new(array.with_values(Arc::new(values)))) -} - -/// Applies an infallible unary function to an array with primitive values. -#[deprecated(note = "Use arrow_array::AnyDictionaryArray")] -pub fn unary_dyn(array: &dyn Array, op: F) -> Result -where - T: ArrowPrimitiveType, - F: Fn(T::Native) -> T::Native, -{ - downcast_dictionary_array! { - array => unary_dict::<_, F, T>(array, op), - t => { - if PrimitiveArray::::is_compatible(t) { - Ok(Arc::new(unary::( - array.as_any().downcast_ref::>().unwrap(), - op, - ))) - } else { - Err(ArrowError::NotYetImplemented(format!( - "Cannot perform unary operation of type {} on array of type {}", - T::DATA_TYPE, - t - ))) - } - } - } -} - -/// Applies a fallible unary function to an array with primitive values. -#[deprecated(note = "Use arrow_array::AnyDictionaryArray")] -pub fn try_unary_dyn(array: &dyn Array, op: F) -> Result -where - T: ArrowPrimitiveType, - F: Fn(T::Native) -> Result, -{ - downcast_dictionary_array! 
{ - array => if array.values().data_type() == &T::DATA_TYPE { - try_unary_dict::<_, F, T>(array, op) - } else { - Err(ArrowError::NotYetImplemented(format!( - "Cannot perform unary operation on dictionary array of type {}", - array.data_type() - ))) - }, - t => { - if PrimitiveArray::::is_compatible(t) { - Ok(Arc::new(try_unary::( - array.as_any().downcast_ref::>().unwrap(), - op, - )?)) - } else { - Err(ArrowError::NotYetImplemented(format!( - "Cannot perform unary operation of type {} on array of type {}", - T::DATA_TYPE, - t - ))) - } - } - } -} - /// Allies a binary infallable function to two [`PrimitiveArray`]s, /// producing a new [`PrimitiveArray`] /// @@ -510,8 +417,8 @@ where #[cfg(test)] mod tests { use super::*; - use arrow_array::builder::*; use arrow_array::types::*; + use std::sync::Arc; #[test] #[allow(deprecated)] @@ -523,53 +430,6 @@ mod tests { result, Float64Array::from(vec![None, Some(7.0), None, Some(7.0)]) ); - - let result = unary_dyn::<_, Float64Type>(&input_slice, |n| n + 1.0).unwrap(); - - assert_eq!( - result.as_any().downcast_ref::().unwrap(), - &Float64Array::from(vec![None, Some(7.8), None, Some(8.2)]) - ); - } - - #[test] - #[allow(deprecated)] - fn test_unary_dict_and_unary_dyn() { - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append_null(); - builder.append(9).unwrap(); - let dictionary_array = builder.finish(); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - builder.append_null(); - builder.append(10).unwrap(); - let expected = builder.finish(); - - let result = unary_dict::<_, _, Int32Type>(&dictionary_array, |n| n + 1).unwrap(); - assert_eq!( - result - .as_any() - .downcast_ref::>() - .unwrap(), - &expected - ); - - let result = unary_dyn::<_, Int32Type>(&dictionary_array, |n| n + 1).unwrap(); - assert_eq!( - result - .as_any() - .downcast_ref::>() - .unwrap(), - &expected - ); } #[test] diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 09d690d3237c..3458669a6fd1 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use arrow_array::cast::AsArray; use cast::as_primitive_array; -use chrono::{Datelike, NaiveDateTime, Offset, TimeZone, Timelike, Utc}; +use chrono::{Datelike, TimeZone, Timelike, Utc}; use arrow_array::temporal_conversions::{ date32_to_datetime, date64_to_datetime, timestamp_ms_to_datetime, timestamp_ns_to_datetime, @@ -82,6 +82,7 @@ impl std::fmt::Display for DatePart { /// Returns function to extract relevant [`DatePart`] from types like a /// [`NaiveDateTime`] or [`DateTime`]. /// +/// [`NaiveDateTime`]: chrono::NaiveDateTime /// [`DateTime`]: chrono::DateTime fn get_date_time_part_extract_fn(part: DatePart) -> fn(T) -> i32 where @@ -664,20 +665,6 @@ impl ChronoDateExt for T { } } -/// Parse the given string into a string representing fixed-offset that is correct as of the given -/// UTC NaiveDateTime. -/// -/// Note that the offset is function of time and can vary depending on whether daylight savings is -/// in effect or not. e.g. Australia/Sydney is +10:00 or +11:00 depending on DST. 
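The helper removed in this hunk carried a deprecation note pointing at `arrow_array::timezone::Tz`. A minimal sketch of that replacement, mirroring the removed body and assuming the `chrono-tz` feature is enabled so named zones parse:

```rust
use arrow_array::timezone::Tz;
use chrono::{FixedOffset, NaiveDate, NaiveDateTime, Offset, TimeZone};

/// Resolve the fixed UTC offset of a named time zone at a given UTC instant,
/// the same operation the deprecated helper performed.
fn fixed_offset_for(tz: &str, utc: NaiveDateTime) -> Option<FixedOffset> {
    let tz: Tz = tz.parse().ok()?;
    Some(tz.offset_from_utc_datetime(&utc).fix())
}

fn main() {
    let utc = NaiveDate::from_ymd_opt(2024, 1, 1)
        .unwrap()
        .and_hms_opt(0, 0, 0)
        .unwrap();
    // +11:00 in January because Sydney observes DST then.
    let offset = fixed_offset_for("Australia/Sydney", utc).unwrap();
    println!("{offset}");
}
```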
-#[deprecated(note = "Use arrow_array::timezone::Tz instead")] -pub fn using_chrono_tz_and_utc_naive_date_time( - tz: &str, - utc: NaiveDateTime, -) -> Option { - let tz: Tz = tz.parse().ok()?; - Some(tz.offset_from_utc_datetime(&utc).fix()) -} - /// Extracts the hours of a given array as an array of integers within /// the range of [0, 23]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. diff --git a/arrow-array/LICENSE.txt b/arrow-array/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-array/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-array/NOTICE.txt b/arrow-array/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-array/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-array/benches/fixed_size_list_array.rs b/arrow-array/benches/fixed_size_list_array.rs index 5f001a4f3d3a..5270a4a5def3 100644 --- a/arrow-array/benches/fixed_size_list_array.rs +++ b/arrow-array/benches/fixed_size_list_array.rs @@ -26,7 +26,7 @@ fn gen_fsl(len: usize, value_len: usize) -> FixedSizeListArray { let values = Arc::new(Int32Array::from( (0..len).map(|_| rng.gen::()).collect::>(), )); - let field = Arc::new(Field::new("item", values.data_type().clone(), true)); + let field = Arc::new(Field::new_list_field(values.data_type().clone(), true)); FixedSizeListArray::new(field, value_len as i32, values, None) } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 8f8a39b2093f..0e8a7a7cb618 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -24,12 +24,6 @@ use arrow_schema::DataType; pub type GenericBinaryArray = GenericByteArray>; impl GenericBinaryArray { - /// Get the data type of the array. 
- #[deprecated(note = "please use `Self::DATA_TYPE` instead")] - pub const fn get_data_type() -> DataType { - Self::DATA_TYPE - } - /// Creates a [GenericBinaryArray] from a vector of byte slices /// /// See also [`Self::from_iter_values`] @@ -358,7 +352,7 @@ mod tests { let values = b"helloparquet"; let child_data = ArrayData::builder(DataType::UInt8) .len(12) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .build() .unwrap(); let offsets = [0, 5, 5, 12].map(|n| O::from_usize(n).unwrap()); @@ -372,11 +366,9 @@ mod tests { .unwrap(); let binary_array1 = GenericBinaryArray::::from(array_data1); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( - "item", - DataType::UInt8, - false, - ))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::UInt8, false), + )); let array_data2 = ArrayData::builder(data_type) .len(3) @@ -415,17 +407,15 @@ mod tests { let child_data = ArrayData::builder(DataType::UInt8) .len(15) .offset(5) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .build() .unwrap(); let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( - "item", - DataType::UInt8, - false, - ))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::UInt8, false), + )); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -460,17 +450,15 @@ mod tests { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt8) .len(10) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010]))) .build() .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( - "item", - DataType::UInt8, - true, - ))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::UInt8, true), + )); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -558,7 +546,7 @@ mod tests { .unwrap(); let offsets: [i32; 4] = [0, 5, 5, 12]; - let data_type = DataType::List(Arc::new(Field::new("item", DataType::UInt32, false))); + let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, false))); let array_data = ArrayData::builder(data_type) .len(3) .add_buffer(Buffer::from_slice_ref(offsets)) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 0f95adacf10c..9c2d4af8c454 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -308,6 +308,13 @@ impl Array for BooleanArray { self.values.is_empty() } + fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + } + fn offset(&self) -> usize { self.values.offset() } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index bec0caab1045..f2b22507081d 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -453,6 +453,14 @@ impl Array for GenericByteArray { self.value_offsets.len() <= 1 } + fn shrink_to_fit(&mut self) { + self.value_offsets.shrink_to_fit(); + self.value_data.shrink_to_fit(); + if let Some(nulls) = &mut self.nulls { + 
nulls.shrink_to_fit(); + } + } + fn offset(&self) -> usize { 0 } diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 81bb6a38550b..9d2d396a5266 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -430,31 +430,31 @@ impl GenericByteViewArray { /// /// Before GC: /// ```text - /// ┌──────┐ - /// │......│ - /// │......│ - /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer + /// ┌──────┐ + /// │......│ + /// │......│ + /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer /// │ View 1 │─ ─ ─ ─ │......│ with data that /// ├────────────────────┤ │......│ is not referred /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or - /// └────────────────────┘ │......│ View 2 - /// │......│ - /// 2 views, refer to │......│ - /// small portions of a └──────┘ - /// large buffer + /// └────────────────────┘ │......│ View 2 + /// │......│ + /// 2 views, refer to │......│ + /// small portions of a └──────┘ + /// large buffer /// ``` - /// + /// /// After GC: /// /// ```text /// ┌────────────────────┐ ┌─────┐ After gc, only - /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is - /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by - /// │ View 2 │─ ─ ─ ─ └─────┘ the views is - /// └────────────────────┘ left - /// - /// - /// 2 views + /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is + /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by + /// │ View 2 │─ ─ ─ ─ └─────┘ the views is + /// └────────────────────┘ left + /// + /// + /// 2 views /// ``` /// This method will compact the data buffers by recreating the view array and only include the data /// that is pointed to by the views. @@ -575,6 +575,15 @@ impl Array for GenericByteViewArray { self.views.is_empty() } + fn shrink_to_fit(&mut self) { + self.views.shrink_to_fit(); + self.buffers.iter_mut().for_each(|b| b.shrink_to_fit()); + self.buffers.shrink_to_fit(); + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + } + fn offset(&self) -> usize { 0 } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 1187e16769a0..f852b57fb65e 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -249,7 +249,7 @@ pub struct DictionaryArray { /// map to the real values. keys: PrimitiveArray, - /// Array of dictionary values (can by any DataType). + /// Array of dictionary values (can be any DataType). values: ArrayRef, /// Values are ordered. 
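The before/after diagrams above describe `GenericByteViewArray::gc`. A small illustrative sketch (hypothetical values, not taken from the diff) of how compaction is typically triggered after slicing:

```rust
use arrow_array::StringViewArray;

fn main() {
    // Strings longer than 12 bytes are stored out-of-line in a data buffer.
    let array = StringViewArray::from(vec![
        "this is a fairly long string that lives in a data buffer",
        "another fairly long string sharing the same data buffer",
    ]);

    // Keep only the first view; the large shared buffer is still referenced.
    let sliced = array.slice(0, 1);

    // gc() rebuilds the array so only data pointed to by the remaining views
    // is copied into fresh, compact buffers.
    let compacted = sliced.gc();
    assert_eq!(sliced.value(0), compacted.value(0));
}
```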
@@ -720,6 +720,11 @@ impl Array for DictionaryArray { self.keys.is_empty() } + fn shrink_to_fit(&mut self) { + self.keys.shrink_to_fit(); + self.values.shrink_to_fit(); + } + fn offset(&self) -> usize { self.keys.offset() } @@ -729,7 +734,7 @@ impl Array for DictionaryArray { } fn logical_nulls(&self) -> Option { - match self.values.nulls() { + match self.values.logical_nulls() { None => self.nulls().cloned(), Some(value_nulls) => { let mut builder = BooleanBufferBuilder::new(self.len()); @@ -749,6 +754,26 @@ impl Array for DictionaryArray { } } + fn logical_null_count(&self) -> usize { + match (self.keys.nulls(), self.values.logical_nulls()) { + (None, None) => 0, + (Some(key_nulls), None) => key_nulls.null_count(), + (None, Some(value_nulls)) => self + .keys + .values() + .iter() + .filter(|k| value_nulls.is_null(k.as_usize())) + .count(), + (Some(key_nulls), Some(value_nulls)) => self + .keys + .values() + .iter() + .enumerate() + .filter(|(idx, k)| key_nulls.is_null(*idx) || value_nulls.is_null(k.as_usize())) + .count(), + } + } + fn is_nullable(&self) -> bool { !self.is_empty() && (self.nulls().is_some() || self.values.is_nullable()) } @@ -1020,7 +1045,7 @@ impl AnyDictionaryArray for DictionaryArray { mod tests { use super::*; use crate::cast::as_dictionary_array; - use crate::{Int16Array, Int32Array, Int8Array}; + use crate::{Int16Array, Int32Array, Int8Array, RunArray}; use arrow_buffer::{Buffer, ToByteSlice}; #[test] @@ -1445,6 +1470,54 @@ mod tests { assert_eq!(values, &[Some(50), None, None, Some(2)]) } + #[test] + fn test_logical_nulls() -> Result<(), ArrowError> { + let values = Arc::new(RunArray::try_new( + &Int32Array::from(vec![1, 3, 7]), + &Int32Array::from(vec![Some(1), None, Some(3)]), + )?) as ArrayRef; + + // For this test to be meaningful, the values array need to have different nulls and logical nulls + assert_eq!(values.null_count(), 0); + assert_eq!(values.logical_null_count(), 2); + + // Construct a trivial dictionary with 1-1 mapping to underlying array + let dictionary = DictionaryArray::::try_new( + Int8Array::from((0..values.len()).map(|i| i as i8).collect::>()), + Arc::clone(&values), + )?; + + // No keys are null + assert_eq!(dictionary.null_count(), 0); + // Dictionary array values are logically nullable + assert_eq!(dictionary.logical_null_count(), values.logical_null_count()); + assert_eq!(dictionary.logical_nulls(), values.logical_nulls()); + assert!(dictionary.is_nullable()); + + // Construct a trivial dictionary with 1-1 mapping to underlying array except that key 0 is nulled out + let dictionary = DictionaryArray::::try_new( + Int8Array::from( + (0..values.len()) + .map(|i| i as i8) + .map(|i| if i == 0 { None } else { Some(i) }) + .collect::>(), + ), + Arc::clone(&values), + )?; + + // One key is null + assert_eq!(dictionary.null_count(), 1); + + // Dictionary array values are logically nullable + assert_eq!( + dictionary.logical_null_count(), + values.logical_null_count() + 1 + ); + assert!(dictionary.is_nullable()); + + Ok(()) + } + #[test] fn test_normalized_keys() { let values = vec![132, 0, 1].into(); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 8f1489ee4c3c..576b8012491b 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -237,6 +237,7 @@ impl FixedSizeBinaryArray { /// /// Returns error if argument has length zero, or sizes of nested slices don't match. 
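The `logical_null_count` override for `DictionaryArray` added above distinguishes physical null keys from keys that point at null dictionary values. A condensed sketch of the idea, a simpler variant of the test added in that hunk:

```rust
use std::sync::Arc;
use arrow_array::{types::Int8Type, Array, ArrayRef, DictionaryArray, Int32Array, Int8Array};

fn main() {
    // Values: [10, null]; all keys are valid, but two keys point at the null value.
    let values: ArrayRef = Arc::new(Int32Array::from(vec![Some(10), None]));
    let keys = Int8Array::from(vec![0i8, 1, 1]);
    let dict = DictionaryArray::<Int8Type>::try_new(keys, values).unwrap();

    assert_eq!(dict.null_count(), 0);         // physical: no null keys
    assert_eq!(dict.logical_null_count(), 2); // logical: two entries resolve to null
}
```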
#[deprecated( + since = "28.0.0", note = "This function will fail if the iterator produces only None values; prefer `try_from_sparse_iter_with_size`" )] pub fn try_from_sparse_iter(mut iter: T) -> Result @@ -602,6 +603,13 @@ impl Array for FixedSizeBinaryArray { self.len == 0 } + fn shrink_to_fit(&mut self) { + self.value_data.shrink_to_fit(); + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + } + fn offset(&self) -> usize { 0 } @@ -662,7 +670,7 @@ mod tests { let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(3) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(&values)) .build() .unwrap(); let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); @@ -691,7 +699,7 @@ mod tests { let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(2) .offset(1) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(&values)) .build() .unwrap(); let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); @@ -721,7 +729,7 @@ mod tests { // [null, [10, 11, 12, 13]] let array_data = unsafe { ArrayData::builder(DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::UInt8, false)), + Arc::new(Field::new_list_field(DataType::UInt8, false)), 4, )) .len(2) @@ -757,7 +765,7 @@ mod tests { let array_data = unsafe { ArrayData::builder(DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Binary, false)), + Arc::new(Field::new_list_field(DataType::Binary, false)), 4, )) .len(3) @@ -781,7 +789,7 @@ mod tests { let array_data = unsafe { ArrayData::builder(DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::UInt8, false)), + Arc::new(Field::new_list_field(DataType::UInt8, false)), 4, )) .len(3) @@ -798,7 +806,7 @@ mod tests { let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(3) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(&values)) .build() .unwrap(); let arr = FixedSizeBinaryArray::from(array_data); diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 00a3144a87ad..44be442c9f85 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -95,7 +95,7 @@ use std::sync::Arc; /// .build() /// .unwrap(); /// let list_data_type = DataType::FixedSizeList( -/// Arc::new(Field::new("item", DataType::Int32, false)), +/// Arc::new(Field::new_list_field(DataType::Int32, false)), /// 3, /// ); /// let list_data = ArrayData::builder(list_data_type.clone()) @@ -401,6 +401,13 @@ impl Array for FixedSizeListArray { self.len == 0 } + fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + } + fn offset(&self) -> usize { 0 } @@ -487,7 +494,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, false)), 3); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_child_data(value_data.clone()) @@ -540,7 +547,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, false)), 3); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -569,7 +576,7 @@ mod tests { // 
Construct a fixed size list array from the above two let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data.clone()) @@ -627,7 +634,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data) @@ -650,7 +657,7 @@ mod tests { Some(4), ])); - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), None); assert_eq!(list.len(), 3); @@ -674,7 +681,7 @@ mod tests { let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2"); - let field = Arc::new(Field::new("item", DataType::Int32, false)); + let field = Arc::new(Field::new_list_field(DataType::Int32, false)); let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\""); @@ -682,14 +689,14 @@ mod tests { let nulls = NullBuffer::new(BooleanBuffer::new(Buffer::from([0b0000101]), 0, 3)); FixedSizeListArray::new(field, 2, values.clone(), Some(nulls)); - let field = Arc::new(Field::new("item", DataType::Int64, true)); + let field = Arc::new(Field::new_list_field(DataType::Int64, true)); let err = FixedSizeListArray::try_new(field, 2, values, None).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: FixedSizeListArray expected data type Int64 got Int32 for \"item\""); } #[test] fn empty_fixed_size_list() { - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let nulls = NullBuffer::new_null(2); let values = new_empty_array(&DataType::Int32); let list = FixedSizeListArray::new(field.clone(), 0, values, Some(nulls)); diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 1fab0009f2cc..bed0bdf889b2 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -485,6 +485,14 @@ impl Array for GenericListArray { self.value_offsets.len() <= 1 } + fn shrink_to_fit(&mut self) { + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + self.values.shrink_to_fit(); + self.value_offsets.shrink_to_fit(); + } + fn offset(&self) -> usize { 0 } @@ -565,7 +573,7 @@ mod tests { // [[0, 1, 2], [3, 4, 5], [6, 7]] let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6, 8])); - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); ListArray::new(field, offsets, Arc::new(values), None) } @@ -595,7 +603,8 @@ mod tests { let value_offsets = Buffer::from([]); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, 
false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(0) .add_buffer(value_offsets) @@ -621,7 +630,8 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -766,7 +776,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -917,7 +928,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build_unchecked() }; - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -934,7 +946,8 @@ mod tests { #[cfg(not(feature = "force_validate"))] fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -964,7 +977,8 @@ mod tests { let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1010,7 +1024,8 @@ mod tests { .build_unchecked() }; - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .add_buffer(buf2) diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 4e949a642701..7e52a6f3e457 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -326,6 +326,15 @@ impl Array for GenericListViewArray { self.value_sizes.is_empty() } + fn shrink_to_fit(&mut self) { + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + self.values.shrink_to_fit(); + self.value_offsets.shrink_to_fit(); + self.value_sizes.shrink_to_fit(); + } + fn offset(&self) -> usize { 0 } @@ -490,7 +499,7 @@ mod tests { fn test_empty_list_view_array() { // Construct an empty value array let vec: Vec = vec![]; - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![]); let offsets = ScalarBuffer::from(vec![]); let values = Int32Array::from(vec); @@ -508,7 +517,7 @@ mod tests { .build() .unwrap(); - let field = Arc::new(Field::new("item", 
DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![3i32, 3, 2]); let offsets = ScalarBuffer::from(vec![0i32, 3, 6]); let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); @@ -544,7 +553,7 @@ mod tests { .build() .unwrap(); - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![3i64, 3, 2]); let offsets = ScalarBuffer::from(vec![0i64, 3, 6]); let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); @@ -590,7 +599,7 @@ mod tests { let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); let null_buffer = NullBuffer::new(buffer); - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![2, 0, 0, 2, 2, 0, 3, 0, 1]); let offsets = ScalarBuffer::from(vec![0, 2, 2, 2, 4, 6, 6, 9, 9]); let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); @@ -656,7 +665,7 @@ mod tests { let null_buffer = NullBuffer::new(buffer); // Construct a large list view array from the above two - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![2i64, 0, 0, 2, 2, 0, 3, 0, 1]); let offsets = ScalarBuffer::from(vec![0i64, 2, 2, 2, 4, 6, 6, 9, 9]); let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); @@ -718,7 +727,7 @@ mod tests { // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] // Construct a list array from the above two - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![2i32, 0, 0, 2, 2, 0, 3, 0, 1]); let offsets = ScalarBuffer::from(vec![0i32, 2, 2, 2, 4, 6, 6, 9, 9]); let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); @@ -741,7 +750,7 @@ mod tests { .build_unchecked() }; let list_data_type = - DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -759,7 +768,7 @@ mod tests { fn test_list_view_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); let list_data_type = - DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -771,7 +780,7 @@ mod tests { #[test] fn test_list_view_array_offsets_need_not_start_at_zero() { - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let sizes = ScalarBuffer::from(vec![0i32, 0, 3]); let offsets = ScalarBuffer::from(vec![2i32, 2, 5]); let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); @@ -800,7 +809,7 @@ mod tests { }; let list_data_type = - DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .add_buffer(offset_buf2) @@ -942,7 +951,7 @@ mod tests { .build_unchecked() }; let 
list_data_type = - DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(2) @@ -976,7 +985,7 @@ mod tests { .build_unchecked() }; let list_data_type = - DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -1015,7 +1024,7 @@ mod tests { .build_unchecked() }; let list_data_type = - DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 254437630a44..18a7c491aa16 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -372,6 +372,14 @@ impl Array for MapArray { self.value_offsets.len() <= 1 } + fn shrink_to_fit(&mut self) { + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + self.entries.shrink_to_fit(); + self.value_offsets.shrink_to_fit(); + } + fn offset(&self) -> usize { 0 } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 4a9e54a60789..23b3cb628aaf 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -76,6 +76,8 @@ mod list_view_array; pub use list_view_array::*; +use crate::iterator::ArrayIter; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -165,6 +167,12 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn is_empty(&self) -> bool; + /// Shrinks the capacity of any exclusively owned buffer as much as possible + /// + /// Shared or externally allocated buffers will be ignored, and + /// any buffer offsets will be preserved. + fn shrink_to_fit(&mut self) {} + /// Returns the offset into the underlying data used by this array(-slice). /// Note that the underlying data can be shared by many arrays. /// This defaults to `0`. @@ -315,8 +323,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key, /// and therefore would not appear in [`Array::logical_nulls`]. fn is_nullable(&self) -> bool { - // TODO this is not necessarily perfect default implementation, since null_count() and logical_null_count() are not always equivalent - self.null_count() != 0 + self.logical_null_count() != 0 } /// Returns the total number of bytes of memory pointed to by this array. @@ -364,6 +371,15 @@ impl Array for ArrayRef { self.as_ref().is_empty() } + /// For shared buffers, this is a no-op. + fn shrink_to_fit(&mut self) { + if let Some(slf) = Arc::get_mut(self) { + slf.shrink_to_fit(); + } else { + // We ignore shared buffers. 
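The new `Array::shrink_to_fit` documented above only touches exclusively owned buffers. A minimal usage sketch (hypothetical capacity, assuming the trait method as declared in this hunk):

```rust
use arrow_array::{Array, Int32Array};

fn main() {
    // Build with far more capacity than the final array needs.
    let mut builder = Int32Array::builder(1_000_000);
    builder.append_slice(&[1, 2, 3]);
    let mut array = builder.finish();

    // Shrinks only exclusively owned buffers; shared or externally allocated
    // buffers are ignored, and buffer offsets are preserved.
    array.shrink_to_fit();
    assert_eq!(array.len(), 3);
    assert_eq!(array.value(0), 1);
}
```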
+ } + } + fn offset(&self) -> usize { self.as_ref().offset() } @@ -570,6 +586,40 @@ pub trait ArrayAccessor: Array { unsafe fn value_unchecked(&self, index: usize) -> Self::Item; } +/// A trait for Arrow String Arrays, currently three types are supported: +/// - `StringArray` +/// - `LargeStringArray` +/// - `StringViewArray` +/// +/// This trait helps to abstract over the different types of string arrays +/// so that we don't need to duplicate the implementation for each type. +pub trait StringArrayType<'a>: ArrayAccessor + Sized { + /// Returns true if all data within this string array is ASCII + fn is_ascii(&self) -> bool; + + /// Constructs a new iterator + fn iter(&self) -> ArrayIter; +} + +impl<'a, O: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray { + fn is_ascii(&self) -> bool { + GenericStringArray::::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + GenericStringArray::::iter(self) + } +} +impl<'a> StringArrayType<'a> for &'a StringViewArray { + fn is_ascii(&self) -> bool { + StringViewArray::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + StringViewArray::iter(self) + } +} + impl PartialEq for dyn Array + '_ { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) @@ -876,7 +926,7 @@ mod tests { #[test] fn test_empty_list_primitive() { - let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let array = new_empty_array(&data_type); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 0); @@ -934,7 +984,7 @@ mod tests { #[test] fn test_null_list_primitive() { - let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let array = new_null_array(&data_type, 9); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 9); diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 7b0d6c5ca1b6..57aa23bf9040 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1152,6 +1152,13 @@ impl Array for PrimitiveArray { self.values.is_empty() } + fn shrink_to_fit(&mut self) { + self.values.shrink_to_fit(); + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + } + fn offset(&self) -> usize { 0 } @@ -1480,24 +1487,6 @@ def_numeric_from_vec!(TimestampMicrosecondType); def_numeric_from_vec!(TimestampNanosecondType); impl PrimitiveArray { - /// Construct a timestamp array from a vec of i64 values and an optional timezone - #[deprecated(note = "Use with_timezone_opt instead")] - pub fn from_vec(data: Vec, timezone: Option) -> Self - where - Self: From>, - { - Self::from(data).with_timezone_opt(timezone) - } - - /// Construct a timestamp array from a vec of `Option` values and an optional timezone - #[deprecated(note = "Use with_timezone_opt instead")] - pub fn from_opt_vec(data: Vec>, timezone: Option) -> Self - where - Self: From>>, - { - Self::from(data).with_timezone_opt(timezone) - } - /// Returns the timezone of this array if any pub fn timezone(&self) -> Option<&str> { match self.data_type() { @@ -2296,7 +2285,7 @@ mod tests { ]; let array_data = ArrayData::builder(DataType::Decimal128(38, 6)) .len(2) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(&values)) .build() .unwrap(); let decimal_array = Decimal128Array::from(array_data); diff --git 
a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index dc4e6c96d9da..b340bf9a9065 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -330,6 +330,11 @@ impl Array for RunArray { self.run_ends.is_empty() } + fn shrink_to_fit(&mut self) { + self.run_ends.shrink_to_fit(); + self.values.shrink_to_fit(); + } + fn offset(&self) -> usize { self.run_ends.offset() } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 25581cfaa49d..ed70e5744fff 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -17,18 +17,12 @@ use crate::types::GenericStringType; use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait}; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::ArrowError; /// A [`GenericByteArray`] for storing `str` pub type GenericStringArray = GenericByteArray>; impl GenericStringArray { - /// Get the data type of the array. - #[deprecated(note = "please use `Self::DATA_TYPE` instead")] - pub const fn get_data_type() -> DataType { - Self::DATA_TYPE - } - /// Returns the number of `Unicode Scalar Value` in the string at index `i`. /// # Performance /// This function has `O(n)` time complexity where `n` is the string length. @@ -167,7 +161,7 @@ mod tests { use crate::Array; use arrow_buffer::Buffer; use arrow_data::ArrayData; - use arrow_schema::Field; + use arrow_schema::{DataType, Field}; use std::sync::Arc; #[test] @@ -382,17 +376,15 @@ mod tests { let child_data = ArrayData::builder(DataType::UInt8) .len(15) .offset(5) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .build() .unwrap(); let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( - "item", - DataType::UInt8, - false, - ))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::UInt8, false), + )); // [None, Some("Parquet")] let array_data = ArrayData::builder(data_type) @@ -427,7 +419,7 @@ mod tests { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt8) .len(10) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010]))) .build() .unwrap(); @@ -436,11 +428,9 @@ mod tests { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( - "item", - DataType::UInt8, - true, - ))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::UInt8, true), + )); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -469,16 +459,14 @@ mod tests { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt16) .len(5) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .build() .unwrap(); let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( - "item", - DataType::UInt16, - false, - ))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::UInt16, false), + )); let array_data = ArrayData::builder(data_type) .len(2) diff --git 
a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 41eb8235e540..de6d9c699d22 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -239,12 +239,6 @@ impl StructArray { &self.fields } - /// Returns child array refs of the struct array - #[deprecated(note = "Use columns().to_vec()")] - pub fn columns_ref(&self) -> Vec { - self.columns().to_vec() - } - /// Return field names in this struct array pub fn column_names(&self) -> Vec<&str> { match self.data_type() { @@ -370,6 +364,13 @@ impl Array for StructArray { self.len == 0 } + fn shrink_to_fit(&mut self) { + if let Some(nulls) = &mut self.nulls { + nulls.shrink_to_fit(); + } + self.fields.iter_mut().for_each(|n| n.shrink_to_fit()); + } + fn offset(&self) -> usize { 0 } diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 3c6da5a7b5c0..b442395b4978 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -653,6 +653,17 @@ impl UnionArray { } } } + + /// Returns a vector of tuples containing each field's type_id and its logical null buffer. + /// Only fields with non-zero null counts are included. + fn fields_logical_nulls(&self) -> Vec<(i8, NullBuffer)> { + self.fields + .iter() + .enumerate() + .filter_map(|(type_id, field)| Some((type_id as i8, field.as_ref()?.logical_nulls()?))) + .filter(|(_, nulls)| nulls.null_count() > 0) + .collect() + } } impl From for UnionArray { @@ -744,6 +755,17 @@ impl Array for UnionArray { self.type_ids.is_empty() } + fn shrink_to_fit(&mut self) { + self.type_ids.shrink_to_fit(); + if let Some(offsets) = &mut self.offsets { + offsets.shrink_to_fit(); + } + for array in self.fields.iter_mut().flatten() { + array.shrink_to_fit(); + } + self.fields.shrink_to_fit(); + } + fn offset(&self) -> usize { 0 } @@ -768,11 +790,7 @@ impl Array for UnionArray { .flatten(); } - let logical_nulls = fields - .iter() - .filter_map(|(type_id, _)| Some((type_id, self.child(type_id).logical_nulls()?))) - .filter(|(_, nulls)| nulls.null_count() > 0) - .collect::>(); + let logical_nulls = self.fields_logical_nulls(); if logical_nulls.is_empty() { return None; @@ -1941,15 +1959,14 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, Some(offsets), children).unwrap(); - let result = array.logical_nulls(); + let expected = BooleanBuffer::from(vec![true, true, true, false, false, false]); - let expected = NullBuffer::from(vec![true, true, true, false, false, false]); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!(expected, array.gather_nulls(array.fields_logical_nulls())); } #[test] fn test_sparse_union_logical_nulls_mask_all_nulls_skip_one() { - // If we used union_fields() (3 fields with nulls), the choosen strategy would be Gather on x86 without any specified target feature e.g CI runtime let fields: UnionFields = [ (1, Arc::new(Field::new("A", DataType::Int32, true))), (3, Arc::new(Field::new("B", DataType::Float64, true))), @@ -1966,10 +1983,13 @@ mod tests { let array = UnionArray::try_new(fields.clone(), type_ids, None, children).unwrap(); - let result = array.logical_nulls(); + let expected = BooleanBuffer::from(vec![false, false, true, false]); - let expected = NullBuffer::from(vec![false, false, true, false]); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!( + expected, + 
array.mask_sparse_all_with_nulls_skip_one(array.fields_logical_nulls()) + ); //like above, but repeated to genereate two exact bitmasks and a non empty remainder let len = 2 * 64 + 32; @@ -1986,12 +2006,15 @@ mod tests { ) .unwrap(); - let result = array.logical_nulls(); - let expected = - NullBuffer::from_iter([false, false, true, false].into_iter().cycle().take(len)); + BooleanBuffer::from_iter([false, false, true, false].into_iter().cycle().take(len)); + assert_eq!(array.len(), len); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!( + expected, + array.mask_sparse_all_with_nulls_skip_one(array.fields_logical_nulls()) + ); } #[test] @@ -2010,10 +2033,13 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap(); - let result = array.logical_nulls(); + let expected = BooleanBuffer::from(vec![true, true, true, true, false, false]); - let expected = NullBuffer::from(vec![true, true, true, true, false, false]); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!( + expected, + array.mask_sparse_skip_without_nulls(array.fields_logical_nulls()) + ); //like above, but repeated to genereate two exact bitmasks and a non empty remainder let len = 2 * 64 + 32; @@ -2031,16 +2057,19 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap(); - let result = array.logical_nulls(); - - let expected = NullBuffer::from_iter( + let expected = BooleanBuffer::from_iter( [true, true, true, true, false, true] .into_iter() .cycle() .take(len), ); + assert_eq!(array.len(), len); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!( + expected, + array.mask_sparse_skip_without_nulls(array.fields_logical_nulls()) + ); } #[test] @@ -2059,10 +2088,13 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap(); - let result = array.logical_nulls(); + let expected = BooleanBuffer::from(vec![false, false, true, true, false, false]); - let expected = NullBuffer::from(vec![false, false, true, true, false, false]); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!( + expected, + array.mask_sparse_skip_fully_null(array.fields_logical_nulls()) + ); //like above, but repeated to genereate two exact bitmasks and a non empty remainder let len = 2 * 64 + 32; @@ -2080,16 +2112,19 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap(); - let result = array.logical_nulls(); - - let expected = NullBuffer::from_iter( + let expected = BooleanBuffer::from_iter( [false, false, true, true, false, false] .into_iter() .cycle() .take(len), ); + assert_eq!(array.len(), len); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!( + expected, + array.mask_sparse_skip_fully_null(array.fields_logical_nulls()) + ); } #[test] @@ -2125,11 +2160,10 @@ mod tests { ) .unwrap(); - let result = array.logical_nulls(); - - let expected = NullBuffer::from(vec![true, false, true, false]); + let expected = BooleanBuffer::from(vec![true, false, true, false]); - assert_eq!(Some(expected), result); + assert_eq!(expected, array.logical_nulls().unwrap().into_inner()); + assert_eq!(expected, array.gather_nulls(array.fields_logical_nulls())); } fn union_fields() -> 
UnionFields { diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 5dff67650687..5c142b277d14 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -182,7 +182,7 @@ where let field = self .field .clone() - .unwrap_or_else(|| Arc::new(Field::new("item", values.data_type().clone(), true))); + .unwrap_or_else(|| Arc::new(Field::new_list_field(values.data_type().clone(), true))); FixedSizeListArray::new(field, self.list_len, values, nulls) } @@ -204,7 +204,7 @@ where let field = self .field .clone() - .unwrap_or_else(|| Arc::new(Field::new("item", values.data_type().clone(), true))); + .unwrap_or_else(|| Arc::new(Field::new_list_field(values.data_type().clone(), true))); FixedSizeListArray::new(field, self.list_len, values, nulls) } diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index d12c2b7db468..7268e751b149 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -136,7 +136,7 @@ impl GenericByteViewBuilder { /// Override the size of buffers to allocate for holding string data /// Use `with_fixed_block_size` instead. - #[deprecated(note = "Use `with_fixed_block_size` instead")] + #[deprecated(since = "53.0.0", note = "Use `with_fixed_block_size` instead")] pub fn with_block_size(self, block_size: u32) -> Self { self.with_fixed_block_size(block_size) } diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index a7d16f45f53b..a9c88ec6c586 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -49,7 +49,6 @@ use std::sync::Arc; /// builder.append(true); /// /// // Null -/// builder.values().append_value("?"); // irrelevant /// builder.append(false); /// /// // [D] @@ -70,15 +69,14 @@ use std::sync::Arc; /// array.values().as_ref(), /// &StringArray::from(vec![ /// Some("A"), Some("B"), Some("C"), -/// Some("?"), Some("D"), None, -/// Some("F") +/// Some("D"), None, Some("F") /// ]) /// ); /// /// // Offsets are indexes into the values array /// assert_eq!( /// array.value_offsets(), -/// &[0, 3, 3, 4, 5, 7] +/// &[0, 3, 3, 3, 4, 6] /// ); /// ``` /// @@ -299,7 +297,7 @@ where let field = match &self.field { Some(f) => f.clone(), - None => Arc::new(Field::new("item", values.data_type().clone(), true)), + None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), }; GenericListArray::new(field, offsets, values, nulls) @@ -316,7 +314,7 @@ where let field = match &self.field { Some(f) => f.clone(), - None => Arc::new(Field::new("item", values.data_type().clone(), true)), + None => Arc::new(Field::new_list_field(values.data_type().clone(), true)), }; GenericListArray::new(field, offsets, values, nulls) @@ -586,7 +584,7 @@ mod tests { fn test_boxed_list_list_array_builder() { // This test is same as `test_list_list_array_builder` but uses boxed builders. let values_builder = make_builder( - &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), 10, ); test_boxed_generic_list_generic_list_array_builder::(values_builder); @@ -596,7 +594,7 @@ mod tests { fn test_boxed_large_list_large_list_array_builder() { // This test is same as `test_list_list_array_builder` but uses boxed builders. 
let values_builder = make_builder( - &DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))), + &DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))), 10, ); test_boxed_generic_list_generic_list_array_builder::(values_builder); @@ -791,7 +789,7 @@ mod tests { #[test] #[should_panic(expected = "Non-nullable field of ListArray \\\"item\\\" cannot contain nulls")] fn test_checks_nullability() { - let field = Arc::new(Field::new("item", DataType::Int32, false)); + let field = Arc::new(Field::new_list_field(DataType::Int32, false)); let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); builder.append_value([Some(1), None]); builder.finish(); @@ -800,7 +798,7 @@ mod tests { #[test] #[should_panic(expected = "ListArray expected data type Int64 got Int32")] fn test_checks_data_type() { - let field = Arc::new(Field::new("item", DataType::Int64, false)); + let field = Arc::new(Field::new_list_field(DataType::Int64, false)); let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone()); builder.append_value([Some(1)]); builder.finish(); diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index dd1a5c3ae722..89a96280eb87 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -123,7 +123,7 @@ //! let string_field = Arc::new(Field::new("i32", DataType::Utf8, false)); //! //! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef; -//! let value_field = Arc::new(Field::new("item", DataType::Int32, true)); +//! let value_field = Arc::new(Field::new_list_field(DataType::Int32, true)); //! let i32_list_field = Arc::new(Field::new("i32_list", DataType::List(value_field), true)); //! //! StructArray::from(vec![ diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 396ab2fed851..2b288445c74b 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. 
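The `Field::new_list_field` replacements above are mechanical: the helper produces the same conventional `"item"` child field that `Field::new("item", ...)` spelled out explicitly. A minimal equivalence sketch (not part of this diff, using the `arrow_schema` API referenced above):

```rust
use arrow_schema::{DataType, Field};

fn main() {
    // `new_list_field` is shorthand for the default "item" child used by List/LargeList
    let a = Field::new_list_field(DataType::Int32, true);
    let b = Field::new("item", DataType::Int32, true);
    assert_eq!(a, b);
    assert_eq!(Field::LIST_FIELD_DEFAULT_NAME, "item");
}
```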
-use crate::builder::*; -use crate::types::Int32Type; use crate::StructArray; +use crate::{ + builder::*, + types::{Int16Type, Int32Type, Int64Type, Int8Type}, +}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit}; use std::sync::Arc; @@ -46,8 +48,7 @@ use std::sync::Arc; /// let mut example_col = ListBuilder::new(StructBuilder::from_fields( /// vec![Field::new( /// "value_list", -/// DataType::List(Arc::new(Field::new( -/// "item", +/// DataType::List(Arc::new(Field::new_list_field( /// DataType::Struct(Fields::from(vec![ /// Field::new("key", DataType::Utf8, true), /// Field::new("value", DataType::Utf8, true), @@ -291,29 +292,42 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box panic!("The field of Map data type {t:?} should has a child Struct field"), }, DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), - DataType::Dictionary(key_type, value_type) if **key_type == DataType::Int32 => { - match &**value_type { - DataType::Utf8 => { - let dict_builder: StringDictionaryBuilder = - StringDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::LargeUtf8 => { - let dict_builder: LargeStringDictionaryBuilder = - LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::Binary => { - let dict_builder: BinaryDictionaryBuilder = - BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) - } - DataType::LargeBinary => { - let dict_builder: LargeBinaryDictionaryBuilder = - LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); - Box::new(dict_builder) + t @ DataType::Dictionary(key_type, value_type) => { + macro_rules! 
dict_builder { + ($key_type:ty) => { + match &**value_type { + DataType::Utf8 => { + let dict_builder: StringDictionaryBuilder<$key_type> = + StringDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::LargeUtf8 => { + let dict_builder: LargeStringDictionaryBuilder<$key_type> = + LargeStringDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::Binary => { + let dict_builder: BinaryDictionaryBuilder<$key_type> = + BinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + DataType::LargeBinary => { + let dict_builder: LargeBinaryDictionaryBuilder<$key_type> = + LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024); + Box::new(dict_builder) + } + t => panic!("Dictionary value type {t:?} is not currently supported"), + } + }; + } + match &**key_type { + DataType::Int8 => dict_builder!(Int8Type), + DataType::Int16 => dict_builder!(Int16Type), + DataType::Int32 => dict_builder!(Int32Type), + DataType::Int64 => dict_builder!(Int64Type), + _ => { + panic!("Data type {t:?} with key type {key_type:?} is not currently supported") } - t => panic!("Unsupported dictionary value type {t:?} is not currently supported"), } } t => panic!("Data type {t:?} is not currently supported"), @@ -431,12 +445,14 @@ impl StructBuilder { #[cfg(test)] mod tests { + use std::any::type_name; + use super::*; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::Field; - use crate::array::Array; + use crate::{array::Array, types::ArrowDictionaryKeyType}; #[test] fn test_struct_array_builder() { @@ -691,10 +707,31 @@ mod tests { } #[test] - fn test_struct_array_builder_from_dictionary_type() { + fn test_struct_array_builder_from_dictionary_type_int8_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int8); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int16_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int16); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int32_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int32); + } + + #[test] + fn test_struct_array_builder_from_dictionary_type_int64_key() { + test_struct_array_builder_from_dictionary_type_inner::(DataType::Int64); + } + + fn test_struct_array_builder_from_dictionary_type_inner( + key_type: DataType, + ) { let dict_field = Field::new( "f1", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + DataType::Dictionary(Box::new(key_type), Box::new(DataType::Utf8)), false, ); let fields = vec![dict_field.clone()]; @@ -702,10 +739,14 @@ mod tests { let cloned_dict_field = dict_field.clone(); let expected_child_dtype = dict_field.data_type(); let mut struct_builder = StructBuilder::from_fields(vec![cloned_dict_field], 5); - struct_builder - .field_builder::>(0) - .expect("Builder should be StringDictionaryBuilder") - .append_value("dict string"); + let Some(dict_builder) = struct_builder.field_builder::>(0) + else { + panic!( + "Builder should be StringDictionaryBuilder<{}>", + type_name::() + ) + }; + dict_builder.append_value("dict string"); struct_builder.append(true); let array = struct_builder.finish(); @@ -715,13 +756,15 @@ mod tests { } #[test] - #[should_panic(expected = "Data type Dictionary(Int16, Utf8) is not currently supported")] + #[should_panic( + expected = "Data type Dictionary(UInt64, Utf8) with key type UInt64 is not currently supported" + )] fn 
test_struct_array_builder_from_schema_unsupported_type() { let fields = vec![ - Field::new("f1", DataType::Int16, false), + Field::new("f1", DataType::UInt64, false), Field::new( "f2", - DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), false, ), ]; @@ -730,7 +773,7 @@ mod tests { } #[test] - #[should_panic(expected = "Unsupported dictionary value type Int32 is not currently supported")] + #[should_panic(expected = "Dictionary value type Int32 is not currently supported")] fn test_struct_array_builder_from_dict_with_unsupported_value_type() { let fields = vec![Field::new( "f1", diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 232b29560cbf..fc657f94c6a6 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -689,12 +689,6 @@ array_downcast_fn!(as_struct_array, StructArray); array_downcast_fn!(as_union_array, UnionArray); array_downcast_fn!(as_map_array, MapArray); -/// Force downcast of an Array, such as an ArrayRef to Decimal128Array, panic’ing on failure. -#[deprecated(note = "please use `as_primitive_array::` instead")] -pub fn as_decimal_array(arr: &dyn Array) -> &PrimitiveArray { - as_primitive_array::(arr) -} - /// Downcasts a `dyn Array` to a concrete type /// /// ``` diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 4426e0986409..144f2a21afec 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -121,7 +121,10 @@ type Result = std::result::Result; /// This function copies the content of two FFI structs [arrow_data::ffi::FFI_ArrowArray] and /// [arrow_schema::ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. /// Usually the raw pointers are provided by the array data consumer. -#[deprecated(note = "Use FFI_ArrowArray::new and FFI_ArrowSchema::try_from")] +#[deprecated( + since = "52.0.0", + note = "Use FFI_ArrowArray::new and FFI_ArrowSchema::try_from" +)] pub unsafe fn export_array_into_raw( src: ArrayRef, out_array: *mut FFI_ArrowArray, @@ -719,7 +722,7 @@ mod tests_to_then_from_ffi { // Construct a list array from the above two let list_data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::Int32, false), + Field::new_list_field(DataType::Int32, false), )); let list_data = ArrayData::builder(list_data_type) @@ -1478,7 +1481,7 @@ mod tests_from_ffi { let offsets: Vec = vec![0, 2, 4, 6, 8, 10, 12, 14, 16]; let value_offsets = Buffer::from_slice_ref(offsets); let inner_list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let inner_list_data = ArrayData::builder(inner_list_data_type.clone()) .len(8) .add_buffer(value_offsets) diff --git a/arrow-array/src/ffi_stream.rs b/arrow-array/src/ffi_stream.rs index 34f0cd7cfc74..3d4e89e80b89 100644 --- a/arrow-array/src/ffi_stream.rs +++ b/arrow-array/src/ffi_stream.rs @@ -379,21 +379,6 @@ impl RecordBatchReader for ArrowArrayStreamReader { } } -/// Exports a record batch reader to raw pointer of the C Stream Interface provided by the consumer. -/// -/// # Safety -/// Assumes that the pointer represents valid C Stream Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. 
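With the macro above, `make_builder` accepts dictionaries keyed by any of `Int8`/`Int16`/`Int32`/`Int64` rather than only `Int32`. A usage sketch (not part of this diff; it assumes the public `make_builder` and dictionary builder APIs exercised by the tests above):

```rust
use arrow_array::builder::{make_builder, ArrayBuilder, StringDictionaryBuilder};
use arrow_array::types::Int16Type;
use arrow_array::Array;
use arrow_schema::DataType;

fn main() {
    // Before this change, any key type other than Int32 panicked here
    let dt = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8));
    let mut builder = make_builder(&dt, 10);

    let dict = builder
        .as_any_mut()
        .downcast_mut::<StringDictionaryBuilder<Int16Type>>()
        .expect("Int16-keyed string dictionary builder");
    dict.append_value("dict string");
    assert_eq!(dict.finish().len(), 1);
}
```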
-#[deprecated(note = "Use FFI_ArrowArrayStream::new")] -pub unsafe fn export_reader_into_raw( - reader: Box, - out_stream: *mut FFI_ArrowArrayStream, -) { - let stream = FFI_ArrowArrayStream::new(reader); - - std::ptr::write_unaligned(out_stream, stream); -} - #[cfg(test)] mod tests { use super::*; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 78108d441b05..8958ca6fae62 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -32,15 +32,6 @@ pub trait RecordBatchReader: Iterator> { /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this /// reader should have the same schema as returned from this method. fn schema(&self) -> SchemaRef; - - /// Reads the next `RecordBatch`. - #[deprecated( - since = "2.0.0", - note = "This method is deprecated in favour of `next` from the trait Iterator." - )] - fn next_batch(&mut self) -> Result, ArrowError> { - self.next().transpose() - } } impl RecordBatchReader for Box { @@ -58,6 +49,129 @@ pub trait RecordBatchWriter { fn close(self) -> Result<(), ArrowError>; } +/// Creates an array from a literal slice of values, +/// suitable for rapid testing and development. +/// +/// Example: +/// +/// ```rust +/// +/// use arrow_array::create_array; +/// +/// let array = create_array!(Int32, [1, 2, 3, 4, 5]); +/// let array = create_array!(Utf8, [Some("a"), Some("b"), None, Some("e")]); +/// ``` +/// Support for limited data types is available. The macro will return a compile error if an unsupported data type is used. +/// Presently supported data types are: +/// - `Boolean`, `Null` +/// - `Decimal128`, `Decimal256` +/// - `Float16`, `Float32`, `Float64` +/// - `Int8`, `Int16`, `Int32`, `Int64` +/// - `UInt8`, `UInt16`, `UInt32`, `UInt64` +/// - `IntervalDayTime`, `IntervalYearMonth` +/// - `Second`, `Millisecond`, `Microsecond`, `Nanosecond` +/// - `Second32`, `Millisecond32`, `Microsecond64`, `Nanosecond64` +/// - `DurationSecond`, `DurationMillisecond`, `DurationMicrosecond`, `DurationNanosecond` +/// - `TimestampSecond`, `TimestampMillisecond`, `TimestampMicrosecond`, `TimestampNanosecond` +/// - `Utf8`, `Utf8View`, `LargeUtf8`, `Binary`, `LargeBinary` +#[macro_export] +macro_rules! 
create_array { + // `@from` is used for those types that have a common method `::from` + (@from Boolean) => { $crate::BooleanArray }; + (@from Int8) => { $crate::Int8Array }; + (@from Int16) => { $crate::Int16Array }; + (@from Int32) => { $crate::Int32Array }; + (@from Int64) => { $crate::Int64Array }; + (@from UInt8) => { $crate::UInt8Array }; + (@from UInt16) => { $crate::UInt16Array }; + (@from UInt32) => { $crate::UInt32Array }; + (@from UInt64) => { $crate::UInt64Array }; + (@from Float16) => { $crate::Float16Array }; + (@from Float32) => { $crate::Float32Array }; + (@from Float64) => { $crate::Float64Array }; + (@from Utf8) => { $crate::StringArray }; + (@from Utf8View) => { $crate::StringViewArray }; + (@from LargeUtf8) => { $crate::LargeStringArray }; + (@from IntervalDayTime) => { $crate::IntervalDayTimeArray }; + (@from IntervalYearMonth) => { $crate::IntervalYearMonthArray }; + (@from Second) => { $crate::TimestampSecondArray }; + (@from Millisecond) => { $crate::TimestampMillisecondArray }; + (@from Microsecond) => { $crate::TimestampMicrosecondArray }; + (@from Nanosecond) => { $crate::TimestampNanosecondArray }; + (@from Second32) => { $crate::Time32SecondArray }; + (@from Millisecond32) => { $crate::Time32MillisecondArray }; + (@from Microsecond64) => { $crate::Time64MicrosecondArray }; + (@from Nanosecond64) => { $crate::Time64Nanosecond64Array }; + (@from DurationSecond) => { $crate::DurationSecondArray }; + (@from DurationMillisecond) => { $crate::DurationMillisecondArray }; + (@from DurationMicrosecond) => { $crate::DurationMicrosecondArray }; + (@from DurationNanosecond) => { $crate::DurationNanosecondArray }; + (@from Decimal128) => { $crate::Decimal128Array }; + (@from Decimal256) => { $crate::Decimal256Array }; + (@from TimestampSecond) => { $crate::TimestampSecondArray }; + (@from TimestampMillisecond) => { $crate::TimestampMillisecondArray }; + (@from TimestampMicrosecond) => { $crate::TimestampMicrosecondArray }; + (@from TimestampNanosecond) => { $crate::TimestampNanosecondArray }; + + (@from $ty: ident) => { + compile_error!(concat!("Unsupported data type: ", stringify!($ty))) + }; + + (Null, $size: expr) => { + std::sync::Arc::new($crate::NullArray::new($size)) + }; + + (Binary, [$($values: expr),*]) => { + std::sync::Arc::new($crate::BinaryArray::from_vec(vec![$($values),*])) + }; + + (LargeBinary, [$($values: expr),*]) => { + std::sync::Arc::new($crate::LargeBinaryArray::from_vec(vec![$($values),*])) + }; + + ($ty: tt, [$($values: expr),*]) => { + std::sync::Arc::new(<$crate::create_array!(@from $ty)>::from(vec![$($values),*])) + }; +} + +/// Creates a record batch from literal slice of values, suitable for rapid +/// testing and development. +/// +/// Example: +/// +/// ```rust +/// use arrow_array::record_batch; +/// use arrow_schema; +/// +/// let batch = record_batch!( +/// ("a", Int32, [1, 2, 3]), +/// ("b", Float64, [Some(4.0), None, Some(5.0)]), +/// ("c", Utf8, ["alpha", "beta", "gamma"]) +/// ); +/// ``` +/// Due to limitation of [`create_array!`] macro, support for limited data types is available. +#[macro_export] +macro_rules! 
record_batch { + ($(($name: expr, $type: ident, [$($values: expr),*])),*) => { + { + let schema = std::sync::Arc::new(arrow_schema::Schema::new(vec![ + $( + arrow_schema::Field::new($name, arrow_schema::DataType::$type, true), + )* + ])); + + let batch = $crate::RecordBatch::try_new( + schema, + vec![$( + $crate::create_array!($type, [$($values),*]), + )*] + ); + + batch + } + } +} + /// A two-dimensional batch of column-oriented data with a defined /// [schema](arrow_schema::Schema). /// @@ -68,6 +182,19 @@ pub trait RecordBatchWriter { /// /// Record batches are a convenient unit of work for various /// serialization and computation functions, possibly incremental. +/// +/// Use the [`record_batch!`] macro to create a [`RecordBatch`] from +/// literal slice of values, useful for rapid prototyping and testing. +/// +/// Example: +/// ```rust +/// use arrow_array::record_batch; +/// let batch = record_batch!( +/// ("a", Int32, [1, 2, 3]), +/// ("b", Float64, [Some(4.0), None, Some(5.0)]), +/// ("c", Utf8, ["alpha", "beta", "gamma"]) +/// ); +/// ``` #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { schema: SchemaRef, @@ -411,6 +538,19 @@ impl RecordBatch { /// ("b", b), /// ]); /// ``` + /// Another way to quickly create a [`RecordBatch`] is to use the [`record_batch!`] macro, + /// which is particularly helpful for rapid prototyping and testing. + /// + /// Example: + /// + /// ```rust + /// use arrow_array::record_batch; + /// let batch = record_batch!( + /// ("a", Int32, [1, 2, 3]), + /// ("b", Float64, [Some(4.0), None, Some(5.0)]), + /// ("c", Utf8, ["alpha", "beta", "gamma"]) + /// ); + /// ``` pub fn try_from_iter(value: I) -> Result where I: IntoIterator, @@ -806,7 +946,7 @@ mod tests { fn create_record_batch_field_name_mismatch() { let fields = vec![ Field::new("a1", DataType::Int32, false), - Field::new_list("a2", Field::new("item", DataType::Int8, false), false), + Field::new_list("a2", Field::new_list_field(DataType::Int8, false), false), ]; let schema = Arc::new(Schema::new(vec![Field::new_struct("a", fields, true)])); diff --git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index 8d238b3a196c..23f950d55048 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -37,8 +37,18 @@ pub const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; pub const MICROSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MICROSECONDS; /// Number of nanoseconds in a day pub const NANOSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * NANOSECONDS; -/// Number of days between 0001-01-01 and 1970-01-01 -pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +/// Constant from chrono crate +/// +/// Number of days between Januari 1, 1970 and December 31, 1 BCE which we define to be day 0. 
+/// 4 full leap year cycles until December 31, 1600 4 * 146097 = 584388 +/// 1 day until January 1, 1601 1 +/// 369 years until Januari 1, 1970 369 * 365 = 134685 +/// of which floor(369 / 4) are leap years floor(369 / 4) = 92 +/// except for 1700, 1800 and 1900 -3 + +/// -------- +/// 719163 +pub const UNIX_EPOCH_DAY: i64 = 719_163; /// converts a `i32` representing a `date32` to [`NaiveDateTime`] #[inline] @@ -134,6 +144,31 @@ pub fn timestamp_s_to_datetime(v: i64) -> Option { Some(DateTime::from_timestamp(v, 0)?.naive_utc()) } +/// Similar to timestamp_s_to_datetime but only compute `date` +#[inline] +pub fn timestamp_s_to_date(secs: i64) -> Option { + let days = secs.div_euclid(86_400) + UNIX_EPOCH_DAY; + if days < i32::MIN as i64 || days > i32::MAX as i64 { + return None; + } + let date = NaiveDate::from_num_days_from_ce_opt(days as i32)?; + Some(date.and_time(NaiveTime::default()).and_utc().naive_utc()) +} + +/// Similar to timestamp_s_to_datetime but only compute `time` +#[inline] +pub fn timestamp_s_to_time(secs: i64) -> Option { + let secs = secs.rem_euclid(86_400); + let time = NaiveTime::from_num_seconds_from_midnight_opt(secs as u32, 0)?; + Some( + DateTime::::from_naive_utc_and_offset( + NaiveDateTime::new(NaiveDate::default(), time), + Utc, + ) + .naive_utc(), + ) +} + /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ms_to_datetime(v: i64) -> Option { @@ -274,10 +309,28 @@ pub fn as_duration(v: i64) -> Option { mod tests { use crate::temporal_conversions::{ date64_to_datetime, split_second, timestamp_ms_to_datetime, timestamp_ns_to_datetime, + timestamp_s_to_date, timestamp_s_to_datetime, timestamp_s_to_time, timestamp_us_to_datetime, NANOSECONDS, }; use chrono::DateTime; + #[test] + fn test_timestamp_func() { + let timestamp = 1234; + let datetime = timestamp_s_to_datetime(timestamp).unwrap(); + let expected_date = datetime.date(); + let expected_time = datetime.time(); + + assert_eq!( + timestamp_s_to_date(timestamp).unwrap().date(), + expected_date + ); + assert_eq!( + timestamp_s_to_time(timestamp).unwrap().time(), + expected_time + ); + } + #[test] fn negative_input_timestamp_ns_to_datetime() { assert_eq!( diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 92262fc04a57..3d8cfcdb112b 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -69,7 +69,7 @@ pub trait ArrowPrimitiveType: primitive::PrimitiveTypeSealed + 'static { const DATA_TYPE: DataType; /// Returns the byte width of this primitive type. - #[deprecated(note = "Use ArrowNativeType::get_byte_width")] + #[deprecated(since = "52.0.0", note = "Use ArrowNativeType::get_byte_width")] fn get_byte_width() -> usize { std::mem::size_of::() } @@ -324,12 +324,6 @@ pub trait ArrowTimestampType: ArrowTemporalType { /// The [`TimeUnit`] of this timestamp. const UNIT: TimeUnit; - /// Returns the `TimeUnit` of this timestamp. 
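The new `timestamp_s_to_date`/`timestamp_s_to_time` helpers above avoid constructing a full `NaiveDateTime` when only one half is needed: whole days, shifted by `UNIX_EPOCH_DAY` onto chrono's day numbering, give the date, and the Euclidean remainder gives the time of day. A standalone sketch of that split (not part of this diff, reusing the same chrono calls):

```rust
use chrono::{NaiveDate, NaiveTime};

/// Days from chrono's day 0 (December 31, 1 BCE) to 1970-01-01
const UNIX_EPOCH_DAY: i64 = 719_163;

fn split_timestamp(secs: i64) -> Option<(NaiveDate, NaiveTime)> {
    let days = secs.div_euclid(86_400) + UNIX_EPOCH_DAY;
    let date = NaiveDate::from_num_days_from_ce_opt(i32::try_from(days).ok()?)?;
    let time = NaiveTime::from_num_seconds_from_midnight_opt(secs.rem_euclid(86_400) as u32, 0)?;
    Some((date, time))
}

fn main() {
    // 86_400 + 3_661 seconds after the epoch is 1970-01-02T01:01:01
    let (date, time) = split_timestamp(86_400 + 3_661).unwrap();
    assert_eq!(date, NaiveDate::from_ymd_opt(1970, 1, 2).unwrap());
    assert_eq!(time, NaiveTime::from_hms_opt(1, 1, 1).unwrap());
}
```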
- #[deprecated(note = "Use Self::UNIT")] - fn get_time_unit() -> TimeUnit { - Self::UNIT - } - /// Creates a ArrowTimestampType::Native from the provided [`NaiveDateTime`] /// /// See [`DataType::Timestamp`] for more information on timezone handling diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml index d2436f0c15de..c103c2ecc0f3 100644 --- a/arrow-avro/Cargo.toml +++ b/arrow-avro/Cargo.toml @@ -39,11 +39,9 @@ deflate = ["flate2"] snappy = ["snap", "crc"] [dependencies] -arrow-array = { workspace = true } -arrow-buffer = { workspace = true } -arrow-cast = { workspace = true } -arrow-data = { workspace = true } -arrow-schema = { workspace = true } +arrow-schema = { workspace = true } +arrow-buffer = { workspace = true } +arrow-array = { workspace = true } serde_json = { version = "1.0", default-features = false, features = ["std"] } serde = { version = "1.0.188", features = ["derive"] } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } @@ -53,4 +51,5 @@ crc = { version = "3.0", optional = true } [dev-dependencies] +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-avro/LICENSE.txt b/arrow-avro/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-avro/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-avro/NOTICE.txt b/arrow-avro/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-avro/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-avro/src/codec.rs b/arrow-avro/src/codec.rs index 1e2acd99d828..2ac1ad038bd7 100644 --- a/arrow-avro/src/codec.rs +++ b/arrow-avro/src/codec.rs @@ -29,7 +29,7 @@ use std::sync::Arc; /// To accommodate this we special case two-variant unions where one of the /// variants is the null type, and use this to derive arrow's notion of nullability #[derive(Debug, Copy, Clone)] -enum Nulls { +pub enum Nullability { /// The nulls are encoded as the first union variant NullFirst, /// The nulls are encoded as the second union variant @@ -39,7 +39,7 @@ enum Nulls { /// An Avro datatype mapped to the arrow data model #[derive(Debug, Clone)] pub struct AvroDataType { - nulls: Option, + nullability: Option, metadata: HashMap, codec: Codec, } @@ -48,7 +48,15 @@ impl AvroDataType { /// Returns an arrow [`Field`] with the given name pub fn field_with_name(&self, name: &str) -> Field { let d = self.codec.data_type(); - Field::new(name, d, self.nulls.is_some()).with_metadata(self.metadata.clone()) + Field::new(name, d, self.nullability.is_some()).with_metadata(self.metadata.clone()) + } + + pub fn codec(&self) -> &Codec { + &self.codec + } + + pub fn nullability(&self) -> Option { + self.nullability } } @@ -65,9 +73,13 @@ impl AvroField { self.data_type.field_with_name(&self.name) } - /// Returns the [`Codec`] - pub fn codec(&self) -> &Codec { - &self.data_type.codec + /// Returns the [`AvroDataType`] + pub fn data_type(&self) -> &AvroDataType { + &self.data_type + } + + pub fn name(&self) -> &str { + &self.name } } @@ -114,7 +126,7 @@ pub enum Codec { Fixed(i32), List(Arc), Struct(Arc<[AvroField]>), - Duration, + Interval, } impl Codec { @@ -137,9 +149,11 @@ impl Codec { Self::TimestampMicros(is_utc) => { DataType::Timestamp(TimeUnit::Microsecond, is_utc.then(|| "+00:00".into())) } - Self::Duration => DataType::Interval(IntervalUnit::MonthDayNano), + Self::Interval => DataType::Interval(IntervalUnit::MonthDayNano), 
Self::Fixed(size) => DataType::FixedSizeBinary(*size), - Self::List(f) => DataType::List(Arc::new(f.field_with_name("item"))), + Self::List(f) => { + DataType::List(Arc::new(f.field_with_name(Field::LIST_FIELD_DEFAULT_NAME))) + } Self::Struct(f) => DataType::Struct(f.iter().map(|x| x.field()).collect()), } } @@ -198,7 +212,7 @@ fn make_data_type<'a>( ) -> Result { match schema { Schema::TypeName(TypeName::Primitive(p)) => Ok(AvroDataType { - nulls: None, + nullability: None, metadata: Default::default(), codec: (*p).into(), }), @@ -211,12 +225,12 @@ fn make_data_type<'a>( match (f.len() == 2, null) { (true, Some(0)) => { let mut field = make_data_type(&f[1], namespace, resolver)?; - field.nulls = Some(Nulls::NullFirst); + field.nullability = Some(Nullability::NullFirst); Ok(field) } (true, Some(1)) => { let mut field = make_data_type(&f[0], namespace, resolver)?; - field.nulls = Some(Nulls::NullSecond); + field.nullability = Some(Nullability::NullSecond); Ok(field) } _ => Err(ArrowError::NotYetImplemented(format!( @@ -239,7 +253,7 @@ fn make_data_type<'a>( .collect::>()?; let field = AvroDataType { - nulls: None, + nullability: None, codec: Codec::Struct(fields), metadata: r.attributes.field_metadata(), }; @@ -249,7 +263,7 @@ fn make_data_type<'a>( ComplexType::Array(a) => { let mut field = make_data_type(a.items.as_ref(), namespace, resolver)?; Ok(AvroDataType { - nulls: None, + nullability: None, metadata: a.attributes.field_metadata(), codec: Codec::List(Arc::new(field)), }) @@ -260,7 +274,7 @@ fn make_data_type<'a>( })?; let field = AvroDataType { - nulls: None, + nullability: None, metadata: f.attributes.field_metadata(), codec: Codec::Fixed(size), }; @@ -296,7 +310,7 @@ fn make_data_type<'a>( (Some("local-timestamp-micros"), c @ Codec::Int64) => { *c = Codec::TimestampMicros(false) } - (Some("duration"), c @ Codec::Fixed(12)) => *c = Codec::Duration, + (Some("duration"), c @ Codec::Fixed(12)) => *c = Codec::Interval, (Some(logical), _) => { // Insert unrecognized logical type into metadata map field.metadata.insert("logicalType".into(), logical.into()); diff --git a/arrow-avro/src/compression.rs b/arrow-avro/src/compression.rs index c5c7a6dabc33..f29b8dd07606 100644 --- a/arrow-avro/src/compression.rs +++ b/arrow-avro/src/compression.rs @@ -16,7 +16,6 @@ // under the License. use arrow_schema::ArrowError; -use flate2::read; use std::io; use std::io::Read; @@ -35,7 +34,7 @@ impl CompressionCodec { match self { #[cfg(feature = "deflate")] CompressionCodec::Deflate => { - let mut decoder = read::DeflateDecoder::new(block); + let mut decoder = flate2::read::DeflateDecoder::new(block); let mut out = Vec::new(); decoder.read_to_end(&mut out)?; Ok(out) diff --git a/arrow-avro/src/reader/cursor.rs b/arrow-avro/src/reader/cursor.rs new file mode 100644 index 000000000000..4b6a5a4d65db --- /dev/null +++ b/arrow-avro/src/reader/cursor.rs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::reader::vlq::read_varint; +use arrow_schema::ArrowError; + +/// A wrapper around a byte slice, providing low-level decoding for Avro +/// +/// +#[derive(Debug)] +pub(crate) struct AvroCursor<'a> { + buf: &'a [u8], + start_len: usize, +} + +impl<'a> AvroCursor<'a> { + pub(crate) fn new(buf: &'a [u8]) -> Self { + Self { + buf, + start_len: buf.len(), + } + } + + /// Returns the current cursor position + #[inline] + pub(crate) fn position(&self) -> usize { + self.start_len - self.buf.len() + } + + /// Read a single `u8` + #[inline] + pub(crate) fn get_u8(&mut self) -> Result { + match self.buf.first().copied() { + Some(x) => { + self.buf = &self.buf[1..]; + Ok(x) + } + None => Err(ArrowError::ParseError("Unexpected EOF".to_string())), + } + } + + #[inline] + pub(crate) fn get_bool(&mut self) -> Result { + Ok(self.get_u8()? != 0) + } + + pub(crate) fn read_vlq(&mut self) -> Result { + let (val, offset) = read_varint(self.buf) + .ok_or_else(|| ArrowError::ParseError("bad varint".to_string()))?; + self.buf = &self.buf[offset..]; + Ok(val) + } + + #[inline] + pub(crate) fn get_int(&mut self) -> Result { + let varint = self.read_vlq()?; + let val: u32 = varint + .try_into() + .map_err(|_| ArrowError::ParseError("varint overflow".to_string()))?; + Ok((val >> 1) as i32 ^ -((val & 1) as i32)) + } + + #[inline] + pub(crate) fn get_long(&mut self) -> Result { + let val = self.read_vlq()?; + Ok((val >> 1) as i64 ^ -((val & 1) as i64)) + } + + pub(crate) fn get_bytes(&mut self) -> Result<&'a [u8], ArrowError> { + let len: usize = self.get_long()?.try_into().map_err(|_| { + ArrowError::ParseError("offset overflow reading avro bytes".to_string()) + })?; + + if (self.buf.len() < len) { + return Err(ArrowError::ParseError( + "Unexpected EOF reading bytes".to_string(), + )); + } + let ret = &self.buf[..len]; + self.buf = &self.buf[len..]; + Ok(ret) + } + + #[inline] + pub(crate) fn get_float(&mut self) -> Result { + if (self.buf.len() < 4) { + return Err(ArrowError::ParseError( + "Unexpected EOF reading float".to_string(), + )); + } + let ret = f32::from_le_bytes(self.buf[..4].try_into().unwrap()); + self.buf = &self.buf[4..]; + Ok(ret) + } + + #[inline] + pub(crate) fn get_double(&mut self) -> Result { + if (self.buf.len() < 8) { + return Err(ArrowError::ParseError( + "Unexpected EOF reading float".to_string(), + )); + } + let ret = f64::from_le_bytes(self.buf[..8].try_into().unwrap()); + self.buf = &self.buf[8..]; + Ok(ret) + } +} diff --git a/arrow-avro/src/reader/header.rs b/arrow-avro/src/reader/header.rs index 19d48d1f89a1..98c285171bf3 100644 --- a/arrow-avro/src/reader/header.rs +++ b/arrow-avro/src/reader/header.rs @@ -19,7 +19,7 @@ use crate::compression::{CompressionCodec, CODEC_METADATA_KEY}; use crate::reader::vlq::VLQDecoder; -use crate::schema::Schema; +use crate::schema::{Schema, SCHEMA_METADATA_KEY}; use arrow_schema::ArrowError; #[derive(Debug)] @@ -89,6 +89,17 @@ impl Header { ))), } } + + /// Returns the [`Schema`] if any + pub fn schema(&self) -> Result>, ArrowError> { + self.get(SCHEMA_METADATA_KEY) + .map(|x| { + serde_json::from_slice(x).map_err(|e| { + 
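For context on `get_int`/`get_long` above: Avro stores integers as little-endian base-128 varints carrying a zigzag-mapped value, so small negative numbers stay short on the wire. A compact standalone sketch of that decoding (illustrative only, not the optimized paths in `vlq.rs`):

```rust
/// Decode an unsigned LEB128-style varint, returning the value and bytes consumed
fn read_uvarint(buf: &[u8]) -> Option<(u64, usize)> {
    let mut value = 0u64;
    for (i, &byte) in buf.iter().enumerate().take(10) {
        value |= u64::from(byte & 0x7F) << (7 * i);
        if byte & 0x80 == 0 {
            return Some((value, i + 1));
        }
    }
    None
}

/// Undo the zigzag mapping (0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, ...)
fn zigzag_decode(v: u64) -> i64 {
    (v >> 1) as i64 ^ -((v & 1) as i64)
}

fn main() {
    // -151 zigzag-encodes to 301, whose varint bytes are [0xAD, 0x02]
    let (raw, read) = read_uvarint(&[0xAD, 0x02]).unwrap();
    assert_eq!((raw, read), (301, 2));
    assert_eq!(zigzag_decode(raw), -151);
}
```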
ArrowError::ParseError(format!("Failed to parse Avro schema JSON: {e}")) + }) + }) + .transpose() + } } /// A decoder for [`Header`] diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs index 0151db7f855a..12fa67d9c8e3 100644 --- a/arrow-avro/src/reader/mod.rs +++ b/arrow-avro/src/reader/mod.rs @@ -26,6 +26,8 @@ mod header; mod block; +mod cursor; +mod record; mod vlq; /// Read a [`Header`] from the provided [`BufRead`] @@ -73,35 +75,144 @@ fn read_blocks(mut reader: R) -> impl Iterator RecordBatch { + let file = File::open(file).unwrap(); + let mut reader = BufReader::new(file); + let header = read_header(&mut reader).unwrap(); + let compression = header.compression().unwrap(); + let schema = header.schema().unwrap().unwrap(); + let root = AvroField::try_from(&schema).unwrap(); + let mut decoder = RecordDecoder::try_new(root.data_type()).unwrap(); + + for result in read_blocks(reader) { + let block = result.unwrap(); + assert_eq!(block.sync, header.sync()); + if let Some(c) = compression { + let decompressed = c.decompress(&block.data).unwrap(); + + let mut offset = 0; + let mut remaining = block.count; + while remaining > 0 { + let to_read = remaining.max(batch_size); + offset += decoder + .decode(&decompressed[offset..], block.count) + .unwrap(); + + remaining -= to_read; + } + assert_eq!(offset, decompressed.len()); + } + } + decoder.flush().unwrap() + } #[test] - fn test_mux() { + fn test_alltypes() { let files = [ "avro/alltypes_plain.avro", "avro/alltypes_plain.snappy.avro", "avro/alltypes_plain.zstandard.avro", - "avro/alltypes_nulls_plain.avro", ]; + let expected = RecordBatch::try_from_iter_with_nullable([ + ( + "id", + Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _, + true, + ), + ( + "bool_col", + Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _, + true, + ), + ( + "tinyint_col", + Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _, + true, + ), + ( + "smallint_col", + Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _, + true, + ), + ( + "int_col", + Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _, + true, + ), + ( + "bigint_col", + Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _, + true, + ), + ( + "float_col", + Arc::new(Float32Array::from_iter_values( + (0..8).map(|x| (x % 2) as f32 * 1.1), + )) as _, + true, + ), + ( + "double_col", + Arc::new(Float64Array::from_iter_values( + (0..8).map(|x| (x % 2) as f64 * 10.1), + )) as _, + true, + ), + ( + "date_string_col", + Arc::new(BinaryArray::from_iter_values([ + [48, 51, 47, 48, 49, 47, 48, 57], + [48, 51, 47, 48, 49, 47, 48, 57], + [48, 52, 47, 48, 49, 47, 48, 57], + [48, 52, 47, 48, 49, 47, 48, 57], + [48, 50, 47, 48, 49, 47, 48, 57], + [48, 50, 47, 48, 49, 47, 48, 57], + [48, 49, 47, 48, 49, 47, 48, 57], + [48, 49, 47, 48, 49, 47, 48, 57], + ])) as _, + true, + ), + ( + "string_col", + Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _, + true, + ), + ( + "timestamp_col", + Arc::new( + TimestampMicrosecondArray::from_iter_values([ + 1235865600000000, // 2009-03-01T00:00:00.000 + 1235865660000000, // 2009-03-01T00:01:00.000 + 1238544000000000, // 2009-04-01T00:00:00.000 + 1238544060000000, // 2009-04-01T00:01:00.000 + 1233446400000000, // 2009-02-01T00:00:00.000 + 1233446460000000, // 2009-02-01T00:01:00.000 + 1230768000000000, // 2009-01-01T00:00:00.000 + 1230768060000000, // 2009-01-01T00:01:00.000 + ]) + .with_timezone("+00:00"), + ) as _, + true, + ), + ]) + 
.unwrap(); + for file in files { - println!("file: {file}"); - let file = File::open(arrow_test_data(file)).unwrap(); - let mut reader = BufReader::new(file); - let header = read_header(&mut reader).unwrap(); - let compression = header.compression().unwrap(); - println!("compression: {compression:?}"); - for result in read_blocks(reader) { - let block = result.unwrap(); - assert_eq!(block.sync, header.sync()); - if let Some(c) = compression { - c.decompress(&block.data).unwrap(); - } - } + let file = arrow_test_data(file); + + assert_eq!(read_file(&file, 8), expected); + assert_eq!(read_file(&file, 3), expected); } } } diff --git a/arrow-avro/src/reader/record.rs b/arrow-avro/src/reader/record.rs new file mode 100644 index 000000000000..52a58cf63303 --- /dev/null +++ b/arrow-avro/src/reader/record.rs @@ -0,0 +1,292 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::codec::{AvroDataType, Codec, Nullability}; +use crate::reader::block::{Block, BlockDecoder}; +use crate::reader::cursor::AvroCursor; +use crate::reader::header::Header; +use crate::schema::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::*; +use arrow_schema::{ + ArrowError, DataType, Field as ArrowField, FieldRef, Fields, Schema as ArrowSchema, SchemaRef, +}; +use std::collections::HashMap; +use std::io::Read; +use std::sync::Arc; + +/// Decodes avro encoded data into [`RecordBatch`] +pub struct RecordDecoder { + schema: SchemaRef, + fields: Vec, +} + +impl RecordDecoder { + pub fn try_new(data_type: &AvroDataType) -> Result { + match Decoder::try_new(data_type)? 
{ + Decoder::Record(fields, encodings) => Ok(Self { + schema: Arc::new(ArrowSchema::new(fields)), + fields: encodings, + }), + encoding => Err(ArrowError::ParseError(format!( + "Expected record got {encoding:?}" + ))), + } + } + + pub fn schema(&self) -> &SchemaRef { + &self.schema + } + + /// Decode `count` records from `buf` + pub fn decode(&mut self, buf: &[u8], count: usize) -> Result { + let mut cursor = AvroCursor::new(buf); + for _ in 0..count { + for field in &mut self.fields { + field.decode(&mut cursor)?; + } + } + Ok(cursor.position()) + } + + /// Flush the decoded records into a [`RecordBatch`] + pub fn flush(&mut self) -> Result { + let arrays = self + .fields + .iter_mut() + .map(|x| x.flush(None)) + .collect::, _>>()?; + + RecordBatch::try_new(self.schema.clone(), arrays) + } +} + +#[derive(Debug)] +enum Decoder { + Null(usize), + Boolean(BooleanBufferBuilder), + Int32(Vec), + Int64(Vec), + Float32(Vec), + Float64(Vec), + Date32(Vec), + TimeMillis(Vec), + TimeMicros(Vec), + TimestampMillis(bool, Vec), + TimestampMicros(bool, Vec), + Binary(OffsetBufferBuilder, Vec), + String(OffsetBufferBuilder, Vec), + List(FieldRef, OffsetBufferBuilder, Box), + Record(Fields, Vec), + Nullable(Nullability, NullBufferBuilder, Box), +} + +impl Decoder { + fn try_new(data_type: &AvroDataType) -> Result { + let nyi = |s: &str| Err(ArrowError::NotYetImplemented(s.to_string())); + + let decoder = match data_type.codec() { + Codec::Null => Self::Null(0), + Codec::Boolean => Self::Boolean(BooleanBufferBuilder::new(DEFAULT_CAPACITY)), + Codec::Int32 => Self::Int32(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::Int64 => Self::Int64(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::Float32 => Self::Float32(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::Float64 => Self::Float64(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::Binary => Self::Binary( + OffsetBufferBuilder::new(DEFAULT_CAPACITY), + Vec::with_capacity(DEFAULT_CAPACITY), + ), + Codec::Utf8 => Self::String( + OffsetBufferBuilder::new(DEFAULT_CAPACITY), + Vec::with_capacity(DEFAULT_CAPACITY), + ), + Codec::Date32 => Self::Date32(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::TimeMillis => Self::TimeMillis(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::TimeMicros => Self::TimeMicros(Vec::with_capacity(DEFAULT_CAPACITY)), + Codec::TimestampMillis(is_utc) => { + Self::TimestampMillis(*is_utc, Vec::with_capacity(DEFAULT_CAPACITY)) + } + Codec::TimestampMicros(is_utc) => { + Self::TimestampMicros(*is_utc, Vec::with_capacity(DEFAULT_CAPACITY)) + } + Codec::Fixed(_) => return nyi("decoding fixed"), + Codec::Interval => return nyi("decoding interval"), + Codec::List(item) => { + let decoder = Self::try_new(item)?; + Self::List( + Arc::new(item.field_with_name("item")), + OffsetBufferBuilder::new(DEFAULT_CAPACITY), + Box::new(decoder), + ) + } + Codec::Struct(fields) => { + let mut arrow_fields = Vec::with_capacity(fields.len()); + let mut encodings = Vec::with_capacity(fields.len()); + for avro_field in fields.iter() { + let encoding = Self::try_new(avro_field.data_type())?; + arrow_fields.push(avro_field.field()); + encodings.push(encoding); + } + Self::Record(arrow_fields.into(), encodings) + } + }; + + Ok(match data_type.nullability() { + Some(nullability) => Self::Nullable( + nullability, + NullBufferBuilder::new(DEFAULT_CAPACITY), + Box::new(decoder), + ), + None => decoder, + }) + } + + /// Append a null record + fn append_null(&mut self) { + match self { + Self::Null(count) => *count += 1, + Self::Boolean(b) => b.append(false), + 
Self::Int32(v) | Self::Date32(v) | Self::TimeMillis(v) => v.push(0), + Self::Int64(v) + | Self::TimeMicros(v) + | Self::TimestampMillis(_, v) + | Self::TimestampMicros(_, v) => v.push(0), + Self::Float32(v) => v.push(0.), + Self::Float64(v) => v.push(0.), + Self::Binary(offsets, _) | Self::String(offsets, _) => offsets.push_length(0), + Self::List(_, offsets, e) => { + offsets.push_length(0); + e.append_null(); + } + Self::Record(_, e) => e.iter_mut().for_each(|e| e.append_null()), + Self::Nullable(_, _, _) => unreachable!("Nulls cannot be nested"), + } + } + + /// Decode a single record from `buf` + fn decode(&mut self, buf: &mut AvroCursor<'_>) -> Result<(), ArrowError> { + match self { + Self::Null(x) => *x += 1, + Self::Boolean(values) => values.append(buf.get_bool()?), + Self::Int32(values) | Self::Date32(values) | Self::TimeMillis(values) => { + values.push(buf.get_int()?) + } + Self::Int64(values) + | Self::TimeMicros(values) + | Self::TimestampMillis(_, values) + | Self::TimestampMicros(_, values) => values.push(buf.get_long()?), + Self::Float32(values) => values.push(buf.get_float()?), + Self::Float64(values) => values.push(buf.get_double()?), + Self::Binary(offsets, values) | Self::String(offsets, values) => { + let data = buf.get_bytes()?; + offsets.push_length(data.len()); + values.extend_from_slice(data); + } + Self::List(_, _, _) => { + return Err(ArrowError::NotYetImplemented( + "Decoding ListArray".to_string(), + )) + } + Self::Record(_, encodings) => { + for encoding in encodings { + encoding.decode(buf)?; + } + } + Self::Nullable(nullability, nulls, e) => { + let is_valid = buf.get_bool()? == matches!(nullability, Nullability::NullFirst); + nulls.append(is_valid); + match is_valid { + true => e.decode(buf)?, + false => e.append_null(), + } + } + } + Ok(()) + } + + /// Flush decoded records to an [`ArrayRef`] + fn flush(&mut self, nulls: Option) -> Result { + Ok(match self { + Self::Nullable(_, n, e) => e.flush(n.finish())?, + Self::Null(size) => Arc::new(NullArray::new(std::mem::replace(size, 0))), + Self::Boolean(b) => Arc::new(BooleanArray::new(b.finish(), nulls)), + Self::Int32(values) => Arc::new(flush_primitive::(values, nulls)), + Self::Date32(values) => Arc::new(flush_primitive::(values, nulls)), + Self::Int64(values) => Arc::new(flush_primitive::(values, nulls)), + Self::TimeMillis(values) => { + Arc::new(flush_primitive::(values, nulls)) + } + Self::TimeMicros(values) => { + Arc::new(flush_primitive::(values, nulls)) + } + Self::TimestampMillis(is_utc, values) => Arc::new( + flush_primitive::(values, nulls) + .with_timezone_opt(is_utc.then(|| "+00:00")), + ), + Self::TimestampMicros(is_utc, values) => Arc::new( + flush_primitive::(values, nulls) + .with_timezone_opt(is_utc.then(|| "+00:00")), + ), + Self::Float32(values) => Arc::new(flush_primitive::(values, nulls)), + Self::Float64(values) => Arc::new(flush_primitive::(values, nulls)), + + Self::Binary(offsets, values) => { + let offsets = flush_offsets(offsets); + let values = flush_values(values).into(); + Arc::new(BinaryArray::new(offsets, values, nulls)) + } + Self::String(offsets, values) => { + let offsets = flush_offsets(offsets); + let values = flush_values(values).into(); + Arc::new(StringArray::new(offsets, values, nulls)) + } + Self::List(field, offsets, values) => { + let values = values.flush(None)?; + let offsets = flush_offsets(offsets); + Arc::new(ListArray::new(field.clone(), offsets, values, nulls)) + } + Self::Record(fields, encodings) => { + let arrays = encodings + .iter_mut() + .map(|x| 
x.flush(None)) + .collect::, _>>()?; + Arc::new(StructArray::new(fields.clone(), arrays, nulls)) + } + }) + } +} + +#[inline] +fn flush_values(values: &mut Vec) -> Vec { + std::mem::replace(values, Vec::with_capacity(DEFAULT_CAPACITY)) +} + +#[inline] +fn flush_offsets(offsets: &mut OffsetBufferBuilder) -> OffsetBuffer { + std::mem::replace(offsets, OffsetBufferBuilder::new(DEFAULT_CAPACITY)).finish() +} + +#[inline] +fn flush_primitive( + values: &mut Vec, + nulls: Option, +) -> PrimitiveArray { + PrimitiveArray::new(flush_values(values).into(), nulls) +} + +const DEFAULT_CAPACITY: usize = 1024; diff --git a/arrow-avro/src/reader/vlq.rs b/arrow-avro/src/reader/vlq.rs index 80f1c60eec7d..b198a0d66f24 100644 --- a/arrow-avro/src/reader/vlq.rs +++ b/arrow-avro/src/reader/vlq.rs @@ -44,3 +44,91 @@ impl VLQDecoder { None } } + +/// Read a varint from `buf` returning the decoded `u64` and the number of bytes read +#[inline] +pub(crate) fn read_varint(buf: &[u8]) -> Option<(u64, usize)> { + let first = *buf.first()?; + if first < 0x80 { + return Some((first as u64, 1)); + } + + if let Some(array) = buf.get(..10) { + return read_varint_array(array.try_into().unwrap()); + } + + read_varint_slow(buf) +} + +/// Based on +/// - +/// - +/// - +#[inline] +fn read_varint_array(buf: [u8; 10]) -> Option<(u64, usize)> { + let mut in_progress = 0_u64; + for (idx, b) in buf.into_iter().take(9).enumerate() { + in_progress += (b as u64) << (7 * idx); + if b < 0x80 { + return Some((in_progress, idx + 1)); + } + in_progress -= 0x80 << (7 * idx); + } + + let b = buf[9] as u64; + in_progress += b << (7 * 9); + (b < 0x02).then_some((in_progress, 10)) +} + +#[inline(never)] +#[cold] +fn read_varint_slow(buf: &[u8]) -> Option<(u64, usize)> { + let mut value = 0; + for (count, byte) in buf.iter().take(10).enumerate() { + let byte = buf[count]; + value |= u64::from(byte & 0x7F) << (count * 7); + if byte <= 0x7F { + // Check for u64::MAX overflow. See [`ConsumeVarint`][1] for details. 
+ // [1]: https://github.com/protocolbuffers/protobuf-go/blob/v1.27.1/encoding/protowire/wire.go#L358 + return (count != 9 || byte < 2).then_some((value, count + 1)); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + fn encode_var(mut n: u64, dst: &mut [u8]) -> usize { + let mut i = 0; + + while n >= 0x80 { + dst[i] = 0x80 | (n as u8); + i += 1; + n >>= 7; + } + + dst[i] = n as u8; + i + 1 + } + + fn varint_test(a: u64) { + let mut buf = [0_u8; 10]; + let len = encode_var(a, &mut buf); + assert_eq!(read_varint(&buf[..len]).unwrap(), (a, len)); + assert_eq!(read_varint(&buf).unwrap(), (a, len)); + } + + #[test] + fn test_varint() { + varint_test(0); + varint_test(4395932); + varint_test(u64::MAX); + + for _ in 0..1000 { + varint_test(rand::random()); + } + } +} diff --git a/arrow-buffer/LICENSE.txt b/arrow-buffer/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-buffer/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-buffer/NOTICE.txt b/arrow-buffer/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-buffer/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 49a75b468dbe..aaa86832f692 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -52,8 +52,12 @@ impl BooleanBuffer { /// This method will panic if `buffer` is not large enough pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self { let total_len = offset.saturating_add(len); - let bit_len = buffer.len().saturating_mul(8); - assert!(total_len <= bit_len); + let buffer_len = buffer.len(); + let bit_len = buffer_len.saturating_mul(8); + assert!( + total_len <= bit_len, + "buffer not large enough (offset: {offset}, len: {len}, buffer_len: {buffer_len})" + ); Self { buffer, offset, @@ -96,17 +100,6 @@ impl BooleanBuffer { BitChunks::new(self.values(), self.offset, self.len) } - /// Returns `true` if the bit at index `i` is set - /// - /// # Panics - /// - /// Panics if `i >= self.len()` - #[inline] - #[deprecated(note = "use BooleanBuffer::value")] - pub fn is_set(&self, i: usize) -> bool { - self.value(i) - } - /// Returns the offset of this [`BooleanBuffer`] in bits #[inline] pub fn offset(&self) -> usize { @@ -125,6 +118,12 @@ impl BooleanBuffer { self.len == 0 } + /// Free up unused memory. + pub fn shrink_to_fit(&mut self) { + // TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer + self.buffer.shrink_to_fit(); + } + /// Returns the boolean value at index `i`. 
/// /// # Panics diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 8d1a46583fca..d0c8ffa39783 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -20,10 +20,10 @@ use std::fmt::Debug; use std::ptr::NonNull; use std::sync::Arc; -use crate::alloc::{Allocation, Deallocation, ALIGNMENT}; +use crate::alloc::{Allocation, Deallocation}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; use crate::BufferBuilder; -use crate::{bytes::Bytes, native::ArrowNativeType}; +use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; use super::{MutableBuffer, ScalarBuffer}; @@ -99,26 +99,6 @@ impl Buffer { buffer.into() } - /// Creates a buffer from an existing aligned memory region (must already be byte-aligned), this - /// `Buffer` will free this piece of memory when dropped. - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. - #[deprecated(note = "Use Buffer::from_vec")] - pub unsafe fn from_raw_parts(ptr: NonNull, len: usize, capacity: usize) -> Self { - assert!(len <= capacity); - let layout = Layout::from_size_align(capacity, ALIGNMENT).unwrap(); - Buffer::build_with_arguments(ptr, len, Deallocation::Standard(layout)) - } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. /// @@ -167,6 +147,41 @@ impl Buffer { self.data.capacity() } + /// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory. + /// + /// If the buffer is shared, this is a no-op. + /// + /// If the memory was allocated with a custom allocator, this is a no-op. + /// + /// If the capacity is already less than or equal to the desired capacity, this is a no-op. + /// + /// The memory region will be reallocated using `std::alloc::realloc`. + pub fn shrink_to_fit(&mut self) { + let offset = self.ptr_offset(); + let is_empty = self.is_empty(); + let desired_capacity = if is_empty { + 0 + } else { + // For realloc to work, we cannot free the elements before the offset + offset + self.len() + }; + if desired_capacity < self.capacity() { + if let Some(bytes) = Arc::get_mut(&mut self.data) { + if bytes.try_realloc(desired_capacity).is_ok() { + // Realloc complete - update our pointer into `bytes`: + self.ptr = if is_empty { + bytes.as_ptr() + } else { + // SAFETY: we kept all elements leading up to the offset + unsafe { bytes.as_ptr().add(offset) } + } + } else { + // Failure to reallocate is fine; we just failed to free up memory. + } + } + } + } + /// Returns whether the buffer is empty. #[inline] pub fn is_empty(&self) -> bool { @@ -265,7 +280,7 @@ impl Buffer { /// otherwise a new buffer is allocated and filled with a copy of the bits in the range. 
pub fn bit_slice(&self, offset: usize, len: usize) -> Self { if offset % 8 == 0 { - return self.slice(offset / 8); + return self.slice_with_length(offset / 8, bit_util::ceil(len, 8)); } bitwise_unary_op_helper(self, offset, len, |a| a) @@ -278,14 +293,6 @@ impl Buffer { BitChunks::new(self.as_slice(), offset, len) } - /// Returns the number of 1-bits in this buffer. - #[deprecated(note = "use count_set_bits_offset instead")] - pub fn count_set_bits(&self) -> usize { - let len_in_bits = self.len() * 8; - // self.offset is already taken into consideration by the bit_chunks implementation - self.count_set_bits_offset(0, len_in_bits) - } - /// Returns the number of 1-bits in this buffer, starting from `offset` with `length` bits /// inspected. Note that both `offset` and `length` are measured in bits. pub fn count_set_bits_offset(&self, offset: usize, len: usize) -> usize { @@ -295,6 +302,8 @@ impl Buffer { /// Returns `MutableBuffer` for mutating the buffer if this buffer is not shared. /// Returns `Err` if this is shared or its allocation is from an external source or /// it is not allocated with alignment [`ALIGNMENT`] + /// + /// [`ALIGNMENT`]: crate::alloc::ALIGNMENT pub fn into_mutable(self) -> Result { let ptr = self.ptr; let length = self.length; @@ -562,6 +571,34 @@ mod tests { assert_eq!(buf2.slice_with_length(2, 1).as_slice(), &[10]); } + #[test] + fn test_shrink_to_fit() { + let original = Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(original.as_slice(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(original.capacity(), 64); + + let slice = original.slice_with_length(2, 3); + drop(original); // Make sure the buffer isn't shared (or shrink_to_fit won't work) + assert_eq!(slice.as_slice(), &[2, 3, 4]); + assert_eq!(slice.capacity(), 64); + + let mut shrunk = slice; + shrunk.shrink_to_fit(); + assert_eq!(shrunk.as_slice(), &[2, 3, 4]); + assert_eq!(shrunk.capacity(), 5); // shrink_to_fit is allowed to keep the elements before the offset + + // Test that we can handle empty slices: + let empty_slice = shrunk.slice_with_length(1, 0); + drop(shrunk); // Make sure the buffer isn't shared (or shrink_to_fit won't work) + assert_eq!(empty_slice.as_slice(), &[]); + assert_eq!(empty_slice.capacity(), 5); + + let mut shrunk_empty = empty_slice; + shrunk_empty.shrink_to_fit(); + assert_eq!(shrunk_empty.as_slice(), &[]); + assert_eq!(shrunk_empty.capacity(), 0); + } + #[test] #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_slice_offset_out_of_bound() { @@ -860,4 +897,37 @@ mod tests { let iter_len = usize::MAX / std::mem::size_of::() + 1; let _ = Buffer::from_iter(std::iter::repeat(0_u64).take(iter_len)); } + + #[test] + fn bit_slice_length_preserved() { + // Create a boring buffer + let buf = Buffer::from_iter(std::iter::repeat(true).take(64)); + + let assert_preserved = |offset: usize, len: usize| { + let new_buf = buf.bit_slice(offset, len); + assert_eq!(new_buf.len(), bit_util::ceil(len, 8)); + + // if the offset is not byte-aligned, we have to create a deep copy to a new buffer + // (since the `offset` value inside a Buffer is byte-granular, not bit-granular), so + // checking the offset should always return 0 if so. If the offset IS byte-aligned, we + // want to make sure it doesn't unnecessarily create a deep copy. 
+ if offset % 8 == 0 { + assert_eq!(new_buf.ptr_offset(), offset / 8); + } else { + assert_eq!(new_buf.ptr_offset(), 0); + } + }; + + // go through every available value for offset + for o in 0..=64 { + // and go through every length that could accompany that offset - we can't have a + // situation where offset + len > 64, because that would go past the end of the buffer, + // so we use the map to ensure it's in range. + for l in (o..=64).map(|l| l - o) { + // and we just want to make sure every one of these keeps its offset and length + // when neeeded + assert_preserved(o, l); + } + } + } } diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 7fcbd89dd262..c4315a1d64cd 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -118,13 +118,6 @@ impl MutableBuffer { Self { data, len, layout } } - /// Create a [`MutableBuffer`] from the provided [`Vec`] without copying - #[inline] - #[deprecated(note = "Use From>")] - pub fn from_vec(vec: Vec) -> Self { - Self::from(vec) - } - /// Allocates a new [MutableBuffer] from given `Bytes`. pub(crate) fn from_bytes(bytes: Bytes) -> Result { let layout = match bytes.deallocation() { @@ -331,15 +324,6 @@ impl MutableBuffer { self.data.as_ptr() } - #[deprecated( - since = "2.0.0", - note = "This method is deprecated in favour of `into` from the trait `Into`." - )] - /// Freezes this buffer and return an immutable version of it. - pub fn freeze(self) -> Buffer { - self.into_buffer() - } - #[inline] pub(super) fn into_buffer(self) -> Buffer { let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) }; @@ -483,10 +467,13 @@ impl MutableBuffer { } } +/// Creates a non-null pointer with alignment of [`ALIGNMENT`] +/// +/// This is similar to [`NonNull::dangling`] #[inline] -fn dangling_ptr() -> NonNull { - // SAFETY: ALIGNMENT is a non-zero usize which is then casted - // to a *mut T. Therefore, `ptr` is not null and the conditions for +pub(crate) fn dangling_ptr() -> NonNull { + // SAFETY: ALIGNMENT is a non-zero usize which is then cast + // to a *mut u8. Therefore, `ptr` is not null and the conditions for // calling new_unchecked() are respected. #[cfg(miri)] { diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index c79aef398059..ec12b885eb5a 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -130,6 +130,11 @@ impl NullBuffer { self.buffer.is_empty() } + /// Free up unused memory. + pub fn shrink_to_fit(&mut self) { + self.buffer.shrink_to_fit(); + } + /// Returns the null count for this [`NullBuffer`] #[inline] pub fn null_count(&self) -> usize { @@ -235,6 +240,12 @@ impl From<&[bool]> for NullBuffer { } } +impl From<&[bool; N]> for NullBuffer { + fn from(value: &[bool; N]) -> Self { + value[..].into() + } +} + impl From> for NullBuffer { fn from(value: Vec) -> Self { BooleanBuffer::from(value).into() diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index e9087d30098c..a6be2b67af84 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -133,6 +133,11 @@ impl OffsetBuffer { Self(out.into()) } + /// Free up unused memory. 
+ pub fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit(); + } + /// Returns the inner [`ScalarBuffer`] pub fn inner(&self) -> &ScalarBuffer { &self.0 diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs index 3dbbe344a025..cc6d19044feb 100644 --- a/arrow-buffer/src/buffer/run.rs +++ b/arrow-buffer/src/buffer/run.rs @@ -136,6 +136,12 @@ where self.len == 0 } + /// Free up unused memory. + pub fn shrink_to_fit(&mut self) { + // TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer + self.run_ends.shrink_to_fit(); + } + /// Returns the values of this [`RunEndBuffer`] not including any offset #[inline] pub fn values(&self) -> &[E] { diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 343b8549e93d..ab6c87168e5c 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -72,6 +72,11 @@ impl ScalarBuffer { buffer.slice_with_length(byte_offset, byte_len).into() } + /// Free up unused memory. + pub fn shrink_to_fit(&mut self) { + self.buffer.shrink_to_fit(); + } + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` pub fn slice(&self, offset: usize, len: usize) -> Self { Self::new(self.buffer.clone(), offset, len) diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index ba61342d8e39..77724137aef7 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -24,6 +24,7 @@ use std::ptr::NonNull; use std::{fmt::Debug, fmt::Formatter}; use crate::alloc::Deallocation; +use crate::buffer::dangling_ptr; /// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. /// @@ -96,6 +97,48 @@ impl Bytes { } } + /// Try to reallocate the underlying memory region to a new size (smaller or larger). + /// + /// Only works for bytes allocated with the standard allocator. + /// Returns `Err` if the memory was allocated with a custom allocator, + /// or the call to `realloc` failed, for whatever reason. + /// In case of `Err`, the [`Bytes`] will remain as it was (i.e. have the old size). 
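// Standalone sketch of the reallocation pattern that `try_realloc` uses below,
// hand-rolled here with std::alloc for illustration; `resize_region` and
// `realloc_sketch` are hypothetical helpers, not APIs of this crate. A zero-sized
// target frees the block and hands back a dangling pointer, mirroring the
// `dangling_ptr()` branch of the real implementation.
use std::alloc::{alloc, dealloc, realloc, Layout};
use std::ptr::NonNull;

unsafe fn resize_region(
    ptr: NonNull<u8>,
    old: Layout,
    new_len: usize,
) -> Option<(NonNull<u8>, Layout)> {
    let new = Layout::from_size_align(new_len, old.align()).ok()?;
    if new_len == 0 {
        // `realloc` must not be called with a zero size: deallocate and return a
        // dangling pointer (never dereferenced, never freed) instead.
        dealloc(ptr.as_ptr(), old);
        return Some((NonNull::dangling(), new));
    }
    // On success the block keeps its alignment and takes on the new size.
    NonNull::new(realloc(ptr.as_ptr(), old, new_len)).map(|p| (p, new))
}

fn realloc_sketch() {
    let layout = Layout::from_size_align(64, 8).unwrap();
    unsafe {
        let ptr = NonNull::new(alloc(layout)).expect("allocation failed");
        let (ptr, layout) = resize_region(ptr, layout, 5).expect("realloc failed");
        dealloc(ptr.as_ptr(), layout); // freed with the post-realloc layout
    }
}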
+ pub fn try_realloc(&mut self, new_len: usize) -> Result<(), ()> { + if let Deallocation::Standard(old_layout) = self.deallocation { + if old_layout.size() == new_len { + return Ok(()); // Nothing to do + } + + if let Ok(new_layout) = std::alloc::Layout::from_size_align(new_len, old_layout.align()) + { + let old_ptr = self.ptr.as_ptr(); + + let new_ptr = match new_layout.size() { + 0 => { + // SAFETY: Verified that old_layout.size != new_len (0) + unsafe { std::alloc::dealloc(self.ptr.as_ptr(), old_layout) }; + Some(dangling_ptr()) + } + // SAFETY: the call to `realloc` is safe if all the following hold (from https://doc.rust-lang.org/stable/std/alloc/trait.GlobalAlloc.html#method.realloc): + // * `old_ptr` must be currently allocated via this allocator (guaranteed by the invariant/contract of `Bytes`) + // * `old_layout` must be the same layout that was used to allocate that block of memory (same) + // * `new_len` must be greater than zero + // * `new_len`, when rounded up to the nearest multiple of `layout.align()`, must not overflow `isize` (guaranteed by the success of `Layout::from_size_align`) + _ => NonNull::new(unsafe { std::alloc::realloc(old_ptr, old_layout, new_len) }), + }; + + if let Some(ptr) = new_ptr { + self.ptr = ptr; + self.len = new_len; + self.deallocation = Deallocation::Standard(new_layout); + return Ok(()); + } + } + } + + Err(()) + } + #[inline] pub(crate) fn deallocation(&self) -> &Deallocation { &self.deallocation diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index c563f73cf5b9..eb8e067db0be 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -88,30 +88,6 @@ pub trait ArrowNativeType: /// Returns `None` if [`Self`] is not an integer or conversion would result /// in truncation/overflow fn to_i64(self) -> Option; - - /// Convert native type from i32. - /// - /// Returns `None` if [`Self`] is not `i32` - #[deprecated(note = "please use `Option::Some` instead")] - fn from_i32(_: i32) -> Option { - None - } - - /// Convert native type from i64. - /// - /// Returns `None` if [`Self`] is not `i64` - #[deprecated(note = "please use `Option::Some` instead")] - fn from_i64(_: i64) -> Option { - None - } - - /// Convert native type from i128. - /// - /// Returns `None` if [`Self`] is not `i128` - #[deprecated(note = "please use `Option::Some` instead")] - fn from_i128(_: i128) -> Option { - None - } } macro_rules! native_integer { @@ -147,23 +123,15 @@ macro_rules! 
native_integer { fn usize_as(i: usize) -> Self { i as _ } - - - $( - #[inline] - fn $from(v: $t) -> Option { - Some(v) - } - )* } }; } native_integer!(i8); native_integer!(i16); -native_integer!(i32, from_i32); -native_integer!(i64, from_i64); -native_integer!(i128, from_i128); +native_integer!(i32); +native_integer!(i64); +native_integer!(i128); native_integer!(u8); native_integer!(u16); native_integer!(u32); diff --git a/arrow-cast/LICENSE.txt b/arrow-cast/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-cast/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-cast/NOTICE.txt b/arrow-cast/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-cast/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-cast/src/cast/decimal.rs b/arrow-cast/src/cast/decimal.rs index d6b2f884f753..ba82ca9040c7 100644 --- a/arrow-cast/src/cast/decimal.rs +++ b/arrow-cast/src/cast/decimal.rs @@ -111,9 +111,13 @@ where O::Native::from_decimal(adjusted) }; - Ok(match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + Ok(if cast_options.safe { + array.unary_opt(|x| f(x).filter(|v| O::is_valid_decimal_precision(*v, output_precision))) + } else { + array.try_unary(|x| { + f(x).ok_or_else(|| error(x)) + .and_then(|v| O::validate_decimal_precision(v, output_precision).map(|_| v)) + })? }) } @@ -137,15 +141,20 @@ where let f = |x| O::Native::from_decimal(x).and_then(|x| x.mul_checked(mul).ok()); - Ok(match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + Ok(if cast_options.safe { + array.unary_opt(|x| f(x).filter(|v| O::is_valid_decimal_precision(*v, output_precision))) + } else { + array.try_unary(|x| { + f(x).ok_or_else(|| error(x)) + .and_then(|v| O::validate_decimal_precision(v, output_precision).map(|_| v)) + })? }) } // Only support one type of decimal cast operations pub(crate) fn cast_decimal_to_decimal_same_type( array: &PrimitiveArray, + input_precision: u8, input_scale: i8, output_precision: u8, output_scale: i8, @@ -155,20 +164,11 @@ where T: DecimalType, T::Native: DecimalCast + ArrowNativeTypeOp, { - let array: PrimitiveArray = match input_scale.cmp(&output_scale) { - Ordering::Equal => { - // the scale doesn't change, the native value don't need to be changed + let array: PrimitiveArray = + if input_scale == output_scale && input_precision <= output_precision { array.clone() - } - Ordering::Greater => convert_to_smaller_scale_decimal::( - array, - input_scale, - output_precision, - output_scale, - cast_options, - )?, - Ordering::Less => { - // input_scale < output_scale + } else if input_scale < output_scale { + // the scale doesn't change, but precision may change and cause overflow convert_to_bigger_or_equal_scale_decimal::( array, input_scale, @@ -176,8 +176,15 @@ where output_scale, cast_options, )? - } - }; + } else { + convert_to_smaller_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )? 
+ }; Ok(Arc::new(array.with_precision_and_scale( output_precision, @@ -323,8 +330,8 @@ where }) } -pub(crate) fn string_to_decimal_cast( - from: &GenericStringArray, +pub(crate) fn generic_string_to_decimal_cast<'a, T, S>( + from: &'a S, precision: u8, scale: i8, cast_options: &CastOptions, @@ -332,6 +339,7 @@ pub(crate) fn string_to_decimal_cast( where T: DecimalType, T::Native: DecimalCast + ArrowNativeTypeOp, + &'a S: StringArrayType<'a>, { if cast_options.safe { let iter = from.iter().map(|v| { @@ -375,6 +383,37 @@ where } } +pub(crate) fn string_to_decimal_cast( + from: &GenericStringArray, + precision: u8, + scale: i8, + cast_options: &CastOptions, +) -> Result, ArrowError> +where + T: DecimalType, + T::Native: DecimalCast + ArrowNativeTypeOp, +{ + generic_string_to_decimal_cast::>( + from, + precision, + scale, + cast_options, + ) +} + +pub(crate) fn string_view_to_decimal_cast( + from: &StringViewArray, + precision: u8, + scale: i8, + cast_options: &CastOptions, +) -> Result, ArrowError> +where + T: DecimalType, + T::Native: DecimalCast + ArrowNativeTypeOp, +{ + generic_string_to_decimal_cast::(from, precision, scale, cast_options) +} + /// Cast Utf8 to decimal pub(crate) fn cast_string_to_decimal( from: &dyn Array, @@ -399,14 +438,30 @@ where ))); } - Ok(Arc::new(string_to_decimal_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - precision, - scale, - cast_options, - )?)) + let result = match from.data_type() { + DataType::Utf8View => string_view_to_decimal_cast::( + from.as_any().downcast_ref::().unwrap(), + precision, + scale, + cast_options, + )?, + DataType::Utf8 | DataType::LargeUtf8 => string_to_decimal_cast::( + from.as_any() + .downcast_ref::>() + .unwrap(), + precision, + scale, + cast_options, + )?, + other => { + return Err(ArrowError::ComputeError(format!( + "Cannot cast {:?} to decimal", + other + ))) + } + }; + + Ok(Arc::new(result)) } pub(crate) fn cast_floating_point_to_decimal128( diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index f7059be170f4..ba470635c6cd 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -182,10 +182,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal128(_, _) | Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric (Decimal128(_, _) | Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, - // decimal to Utf8 - (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, - // Utf8 to decimal - (Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, + // decimal to string + (Decimal128(_, _) | Decimal256(_, _), Utf8View | Utf8 | LargeUtf8) => true, + // string to decimal + (Utf8View | Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, (Struct(from_fields), Struct(to_fields)) => { from_fields.len() == to_fields.len() && from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| { @@ -197,13 +197,18 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Struct(_), _) => false, (_, Struct(_)) => false, (_, Boolean) => { - DataType::is_integer(from_type) || - DataType::is_floating(from_type) + DataType::is_integer(from_type) + || DataType::is_floating(from_type) + || from_type == &Utf8View || from_type == &Utf8 || from_type == &LargeUtf8 } (Boolean, _) => { - DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 + DataType::is_integer(to_type) + || DataType::is_floating(to_type) + || to_type == 
&Utf8View + || to_type == &Utf8 + || to_type == &LargeUtf8 } (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView | Utf8View ) => true, @@ -230,8 +235,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { ) => true, (Utf8 | LargeUtf8, Utf8View) => true, (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true, - (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, + (Utf8View | Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), + (_, Utf8View) => from_type.is_numeric(), (_, Binary | LargeBinary) => from_type.is_integer(), @@ -824,18 +830,20 @@ pub fn cast_with_options( (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => { cast_map_values(array.as_map(), to_type, cast_options, ordered1.to_owned()) } - (Decimal128(_, s1), Decimal128(p2, s2)) => { + (Decimal128(p1, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), + *p1, *s1, *p2, *s2, cast_options, ) } - (Decimal256(_, s1), Decimal256(p2, s2)) => { + (Decimal256(p1, s1), Decimal256(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), + *p1, *s1, *p2, *s2, @@ -917,6 +925,7 @@ pub fn cast_with_options( Float64 => cast_decimal_to_float::(array, |x| { x as f64 / 10_f64.powi(*scale as i32) }), + Utf8View => value_to_string_view(array, cast_options), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -982,6 +991,7 @@ pub fn cast_with_options( Float64 => cast_decimal_to_float::(array, |x| { x.to_f64().unwrap() / 10_f64.powi(*scale as i32) }), + Utf8View => value_to_string_view(array, cast_options), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -1061,7 +1071,7 @@ pub fn cast_with_options( *scale, cast_options, ), - Utf8 => cast_string_to_decimal::( + Utf8View | Utf8 => cast_string_to_decimal::( array, *precision, *scale, @@ -1150,7 +1160,7 @@ pub fn cast_with_options( *scale, cast_options, ), - Utf8 => cast_string_to_decimal::( + Utf8View | Utf8 => cast_string_to_decimal::( array, *precision, *scale, @@ -1197,6 +1207,7 @@ pub fn cast_with_options( Float16 => cast_numeric_to_bool::(array), Float32 => cast_numeric_to_bool::(array), Float64 => cast_numeric_to_bool::(array), + Utf8View => cast_utf8view_to_boolean(array, cast_options), Utf8 => cast_utf8_to_boolean::(array, cast_options), LargeUtf8 => cast_utf8_to_boolean::(array, cast_options), _ => Err(ArrowError::CastError(format!( @@ -1215,6 +1226,7 @@ pub fn cast_with_options( Float16 => cast_bool_to_numeric::(array, cast_options), Float32 => cast_bool_to_numeric::(array, cast_options), Float64 => cast_bool_to_numeric::(array, cast_options), + Utf8View => value_to_string_view(array, cast_options), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( @@ -1462,6 +1474,9 @@ pub fn cast_with_options( (BinaryView, _) => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), + (from_type, Utf8View) if from_type.is_primitive() => { + value_to_string_view(array, cast_options) + } (from_type, LargeUtf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } @@ -2485,12 +2500,11 @@ where #[cfg(test)] mod tests { + use super::*; use 
arrow_buffer::{Buffer, IntervalDayTime, NullBuffer}; use chrono::NaiveDate; use half::f16; - use super::*; - macro_rules! generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { let output = @@ -2682,13 +2696,16 @@ mod tests { // negative test let array = vec![Some(123456), None]; let array = create_decimal_array(array, 10, 0).unwrap(); - let result = cast(&array, &DataType::Decimal128(2, 2)); - assert!(result.is_ok()); - let array = result.unwrap(); - let array: &Decimal128Array = array.as_primitive(); - let err = array.validate_decimal_precision(2); + let result_safe = cast(&array, &DataType::Decimal128(2, 2)); + assert!(result_safe.is_ok()); + let options = CastOptions { + safe: false, + ..Default::default() + }; + + let result_unsafe = cast_with_options(&array, &DataType::Decimal128(2, 2), &options); assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal128 of precision 2. Max is 99", - err.unwrap_err().to_string()); + result_unsafe.unwrap_err().to_string()); } #[test] @@ -3637,7 +3654,7 @@ mod tests { let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast( &array, - &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -3661,7 +3678,7 @@ mod tests { let array = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]); let b = cast( &array, - &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -3689,7 +3706,7 @@ mod tests { let array = array.slice(2, 4); let b = cast( &array, - &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Float64, true))), ) .unwrap(); assert_eq!(4, b.len()); @@ -3708,6 +3725,55 @@ mod tests { assert_eq!(10.0, c.value(3)); } + #[test] + fn test_cast_int_to_utf8view() { + let inputs = vec![ + Arc::new(Int8Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(Int16Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(Int32Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(UInt8Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(UInt16Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(UInt32Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + Arc::new(UInt64Array::from(vec![None, Some(8), Some(9), Some(10)])) as ArrayRef, + ]; + let expected: ArrayRef = Arc::new(StringViewArray::from(vec![ + None, + Some("8"), + Some("9"), + Some("10"), + ])); + + for array in inputs { + assert!(can_cast_types(array.data_type(), &DataType::Utf8View)); + let arr = cast(&array, &DataType::Utf8View).unwrap(); + assert_eq!(expected.as_ref(), arr.as_ref()); + } + } + + #[test] + fn test_cast_float_to_utf8view() { + let inputs = vec![ + Arc::new(Float16Array::from(vec![ + Some(f16::from_f64(1.5)), + Some(f16::from_f64(2.5)), + None, + ])) as ArrayRef, + Arc::new(Float32Array::from(vec![Some(1.5), Some(2.5), None])) as ArrayRef, + Arc::new(Float64Array::from(vec![Some(1.5), Some(2.5), None])) as ArrayRef, + ]; + + let expected: ArrayRef = + Arc::new(StringViewArray::from(vec![Some("1.5"), Some("2.5"), None])); + + for array in 
inputs { + println!("type: {}", array.data_type()); + assert!(can_cast_types(array.data_type(), &DataType::Utf8View)); + let arr = cast(&array, &DataType::Utf8View).unwrap(); + assert_eq!(expected.as_ref(), arr.as_ref()); + } + } + #[test] fn test_cast_utf8_to_i32() { let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); @@ -3720,6 +3786,41 @@ mod tests { assert!(!c.is_valid(4)); } + #[test] + fn test_cast_utf8view_to_i32() { + let array = StringViewArray::from(vec!["5", "6", "seven", "8", "9.1"]); + let b = cast(&array, &DataType::Int32).unwrap(); + let c = b.as_primitive::(); + assert_eq!(5, c.value(0)); + assert_eq!(6, c.value(1)); + assert!(!c.is_valid(2)); + assert_eq!(8, c.value(3)); + assert!(!c.is_valid(4)); + } + + #[test] + fn test_cast_utf8view_to_f32() { + let array = StringViewArray::from(vec!["3", "4.56", "seven", "8.9"]); + let b = cast(&array, &DataType::Float32).unwrap(); + let c = b.as_primitive::(); + assert_eq!(3.0, c.value(0)); + assert_eq!(4.56, c.value(1)); + assert!(!c.is_valid(2)); + assert_eq!(8.9, c.value(3)); + } + + #[test] + fn test_cast_utf8view_to_decimal128() { + let array = StringViewArray::from(vec![None, Some("4"), Some("5.6"), Some("7.89")]); + let arr = Arc::new(array) as ArrayRef; + generate_cast_test_case!( + &arr, + Decimal128Array, + &DataType::Decimal128(4, 2), + vec![None, Some(400_i128), Some(560_i128), Some(789_i128)] + ); + } + #[test] fn test_cast_with_options_utf8_to_i32() { let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); @@ -3751,6 +3852,14 @@ mod tests { assert_eq!(*as_boolean_array(&casted), expected); } + #[test] + fn test_cast_utf8view_to_bool() { + let strings = StringViewArray::from(vec!["true", "false", "invalid", " Y ", ""]); + let casted = cast(&strings, &DataType::Boolean).unwrap(); + let expected = BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]); + assert_eq!(*as_boolean_array(&casted), expected); + } + #[test] fn test_cast_with_options_utf8_to_bool() { let strings = StringArray::from(vec!["true", "false", "invalid", " Y ", ""]); @@ -3782,6 +3891,16 @@ mod tests { assert!(!c.is_valid(2)); } + #[test] + fn test_cast_bool_to_utf8view() { + let array = BooleanArray::from(vec![Some(true), Some(false), None]); + let b = cast(&array, &DataType::Utf8View).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!("true", c.value(0)); + assert_eq!("false", c.value(1)); + assert!(!c.is_valid(2)); + } + #[test] fn test_cast_bool_to_utf8() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); @@ -3975,7 +4094,7 @@ mod tests { // Construct a list array from the above two // [[0,0,0], [-1, -2, -1], [2, 100000000]] - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -3986,7 +4105,7 @@ mod tests { let cast_array = cast( &list_array, - &DataType::List(Arc::new(Field::new("item", DataType::UInt16, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::UInt16, true))), ) .unwrap(); @@ -4026,7 +4145,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 9]); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = 
ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4037,8 +4156,7 @@ mod tests { let actual = cast( &list_array, - &DataType::List(Arc::new(Field::new( - "item", + &DataType::List(Arc::new(Field::new_list_field( DataType::Timestamp(TimeUnit::Microsecond, None), true, ))), @@ -4048,11 +4166,10 @@ mod tests { let expected = cast( &cast( &list_array, - &DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))), ) .unwrap(), - &DataType::List(Arc::new(Field::new( - "item", + &DataType::List(Arc::new(Field::new_list_field( DataType::Timestamp(TimeUnit::Microsecond, None), true, ))), @@ -5146,41 +5263,43 @@ mod tests { assert_eq!("2018-12-25T00:00:00", c.value(1)); } + macro_rules! assert_cast_timestamp_to_string { + ($array:expr, $datatype:expr, $output_array_type: ty, $expected:expr) => {{ + let out = cast(&$array, &$datatype).unwrap(); + let actual = out + .as_any() + .downcast_ref::<$output_array_type>() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!(actual, $expected); + }}; + ($array:expr, $datatype:expr, $output_array_type: ty, $options:expr, $expected:expr) => {{ + let out = cast_with_options(&$array, &$datatype, &$options).unwrap(); + let actual = out + .as_any() + .downcast_ref::<$output_array_type>() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!(actual, $expected); + }}; + } + #[test] fn test_cast_timestamp_to_strings() { // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None let array = TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); - let out = cast(&array, &DataType::Utf8).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19T00:00:03.005"), - Some("2018-12-25T00:00:02.001"), - None - ] - ); - let out = cast(&array, &DataType::LargeUtf8).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19T00:00:03.005"), - Some("2018-12-25T00:00:02.001"), - None - ] - ); + let expected = vec![ + Some("1997-05-19T00:00:03.005"), + Some("2018-12-25T00:00:02.001"), + None, + ]; + + assert_cast_timestamp_to_string!(array, DataType::Utf8View, StringViewArray, expected); + assert_cast_timestamp_to_string!(array, DataType::Utf8, StringArray, expected); + assert_cast_timestamp_to_string!(array, DataType::LargeUtf8, LargeStringArray, expected); } #[test] @@ -5193,73 +5312,65 @@ mod tests { .with_timestamp_format(Some(ts_format)) .with_timestamp_tz_format(Some(ts_format)), }; + // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None let array_without_tz = TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); - let out = cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 00:00:03.005000"), - Some("2018-12-25 00:00:02.001000"), - None - ] + let expected = vec![ + Some("1997-05-19 00:00:03.005000"), + Some("2018-12-25 00:00:02.001000"), + None, + ]; + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::Utf8View, + StringViewArray, + cast_options, + expected ); - let out = - cast_with_options(&array_without_tz, &DataType::LargeUtf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - 
.collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 00:00:03.005000"), - Some("2018-12-25 00:00:02.001000"), - None - ] + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::Utf8, + StringArray, + cast_options, + expected + ); + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::LargeUtf8, + LargeStringArray, + cast_options, + expected ); let array_with_tz = TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]) .with_timezone(tz.to_string()); - let out = cast_with_options(&array_with_tz, &DataType::Utf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 05:45:03.005000"), - Some("2018-12-25 05:45:02.001000"), - None - ] + let expected = vec![ + Some("1997-05-19 05:45:03.005000"), + Some("2018-12-25 05:45:02.001000"), + None, + ]; + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::Utf8View, + StringViewArray, + cast_options, + expected ); - let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 05:45:03.005000"), - Some("2018-12-25 05:45:02.001000"), - None - ] + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::Utf8, + StringArray, + cast_options, + expected + ); + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::LargeUtf8, + LargeStringArray, + cast_options, + expected ); } @@ -7085,12 +7196,12 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to list - let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); cast_from_null_to_other(&data_type); - let data_type = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); cast_from_null_to_other(&data_type); let data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 4); cast_from_null_to_other(&data_type); // Cast null from and to dictionary @@ -7207,11 +7318,11 @@ mod tests { assert_eq!(actual.data_type(), to_array.data_type()); let invalid_target = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Binary, true)), 2); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Binary, true)), 2); assert!(!can_cast_types(from_array.data_type(), &invalid_target)); let invalid_size = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float16, true)), 5); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Float16, true)), 5); assert!(!can_cast_types(from_array.data_type(), &invalid_size)); } @@ -7364,7 +7475,7 @@ mod tests { [(Some([Some(5)]))], 1, )) as ArrayRef; - let to_field_inner = Arc::new(Field::new("item", DataType::Float32, false)); + let to_field_inner = Arc::new(Field::new_list_field(DataType::Float32, false)); let to_field = Arc::new(Field::new( "dummy", DataType::FixedSizeList(to_field_inner.clone(), 1), @@ -7454,7 +7565,7 @@ mod tests { // 4. 
Nulls that are correctly sized (same as target list size) // Non-null case - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let values = vec![ Some(vec![Some(1), Some(2), Some(3)]), Some(vec![Some(4), Some(5), Some(6)]), @@ -7530,7 +7641,7 @@ mod tests { let res = cast_with_options( array.as_ref(), - &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + &DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 3), &CastOptions { safe: false, ..Default::default() @@ -7544,7 +7655,7 @@ mod tests { // too short and truncate lists that are too long. let res = cast( array.as_ref(), - &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + &DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 3), ) .unwrap(); let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( @@ -7566,7 +7677,7 @@ mod tests { ])) as ArrayRef; let res = cast_with_options( array.as_ref(), - &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + &DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 3), &CastOptions { safe: false, ..Default::default() @@ -7591,7 +7702,7 @@ mod tests { )) as ArrayRef; let actual = cast( array.as_ref(), - &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 2), + &DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 2), ) .unwrap(); assert_eq!(expected.as_ref(), actual.as_ref()); @@ -7614,14 +7725,14 @@ mod tests { )) as ArrayRef; let actual = cast( array.as_ref(), - &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2), + &DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2), ) .unwrap(); assert_eq!(expected.as_ref(), actual.as_ref()); let res = cast_with_options( array.as_ref(), - &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int16, true)), 2), + &DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int16, true)), 2), &CastOptions { safe: false, ..Default::default() @@ -7633,7 +7744,7 @@ mod tests { #[test] fn test_cast_list_to_fsl_empty() { - let field = Arc::new(Field::new("item", DataType::Int32, true)); + let field = Arc::new(Field::new_list_field(DataType::Int32, true)); let array = new_empty_array(&DataType::List(field.clone())); let target_type = DataType::FixedSizeList(field.clone(), 3); @@ -7656,7 +7767,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7680,7 +7791,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7699,7 +7810,7 @@ mod tests { .unwrap(); let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 4); let list_data = 
ArrayData::builder(list_data_type) .len(2) .add_child_data(value_data) @@ -7717,7 +7828,7 @@ mod tests { .unwrap(); let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 4); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 4); let list_data = ArrayData::builder(list_data_type) .len(2) .add_child_data(value_data) @@ -7979,7 +8090,7 @@ mod tests { let array1 = make_list_array().slice(1, 2); let array2 = Arc::new(make_list_array()) as ArrayRef; - let dt = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + let dt = DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); let out1 = cast(&array1, &dt).unwrap(); let out2 = cast(&array2, &dt).unwrap(); @@ -7992,7 +8103,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); let value_data = str_array.into_data(); - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -8354,7 +8465,7 @@ mod tests { let input_type = DataType::Decimal128(10, 3); let output_type = DataType::Decimal256(10, 5); assert!(can_cast_types(&input_type, &output_type)); - let array = vec![Some(i128::MAX), Some(i128::MIN)]; + let array = vec![Some(123456), Some(-123456)]; let input_decimal_array = create_decimal_array(array, 10, 3).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; @@ -8364,8 +8475,8 @@ mod tests { Decimal256Array, &output_type, vec![ - Some(i256::from_i128(i128::MAX).mul_wrapping(hundred)), - Some(i256::from_i128(i128::MIN).mul_wrapping(hundred)) + Some(i256::from_i128(123456).mul_wrapping(hundred)), + Some(i256::from_i128(-123456).mul_wrapping(hundred)) ] ); } @@ -9114,7 +9225,31 @@ mod tests { } #[test] - fn test_cast_decimal_to_utf8() { + fn test_cast_decimal_to_string() { + assert!(can_cast_types( + &DataType::Decimal128(10, 4), + &DataType::Utf8View + )); + assert!(can_cast_types( + &DataType::Decimal256(38, 10), + &DataType::Utf8View + )); + + macro_rules! 
assert_decimal_values { + ($array:expr) => { + let c = $array; + assert_eq!("1123.454", c.value(0)); + assert_eq!("2123.456", c.value(1)); + assert_eq!("-3123.453", c.value(2)); + assert_eq!("-3123.456", c.value(3)); + assert_eq!("0.000", c.value(4)); + assert_eq!("0.123", c.value(5)); + assert_eq!("1234.567", c.value(6)); + assert_eq!("-1234.567", c.value(7)); + assert!(c.is_null(8)); + }; + } + fn test_decimal_to_string( output_type: DataType, array: PrimitiveArray, @@ -9122,18 +9257,19 @@ mod tests { let b = cast(&array, &output_type).unwrap(); assert_eq!(b.data_type(), &output_type); - let c = b.as_string::(); - - assert_eq!("1123.454", c.value(0)); - assert_eq!("2123.456", c.value(1)); - assert_eq!("-3123.453", c.value(2)); - assert_eq!("-3123.456", c.value(3)); - assert_eq!("0.000", c.value(4)); - assert_eq!("0.123", c.value(5)); - assert_eq!("1234.567", c.value(6)); - assert_eq!("-1234.567", c.value(7)); - assert!(c.is_null(8)); + match b.data_type() { + DataType::Utf8View => { + let c = b.as_string_view(); + assert_decimal_values!(c); + } + DataType::Utf8 | DataType::LargeUtf8 => { + let c = b.as_string::(); + assert_decimal_values!(c); + } + _ => (), + } } + let array128: Vec> = vec![ Some(1123454), Some(2123456), @@ -9145,22 +9281,33 @@ mod tests { Some(-123456789), None, ]; + let array256: Vec> = array128 + .iter() + .map(|num| num.map(i256::from_i128)) + .collect(); - let array256: Vec> = array128.iter().map(|v| v.map(i256::from_i128)).collect(); - - test_decimal_to_string::( + test_decimal_to_string::( + DataType::Utf8View, + create_decimal_array(array128.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( DataType::Utf8, create_decimal_array(array128.clone(), 7, 3).unwrap(), ); - test_decimal_to_string::( + test_decimal_to_string::( DataType::LargeUtf8, create_decimal_array(array128, 7, 3).unwrap(), ); - test_decimal_to_string::( + + test_decimal_to_string::( + DataType::Utf8View, + create_decimal256_array(array256.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( DataType::Utf8, create_decimal256_array(array256.clone(), 7, 3).unwrap(), ); - test_decimal_to_string::( + test_decimal_to_string::( DataType::LargeUtf8, create_decimal256_array(array256, 7, 3).unwrap(), ); @@ -9793,4 +9940,76 @@ mod tests { "Cast non-nullable to non-nullable struct field returning null should fail", ); } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal128(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. 
Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_lower_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 4); + let output_type = DataType::Decimal128(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_greater_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal128(6, 3); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 1234567890 is too large to store in a Decimal128 of precision 6. Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_diff_type() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal256(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456789 is too large to store in a Decimal256 of precision 6. 
Max is 999999"); + } } diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index 7d0e7e21c859..7f22c4fd64de 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -38,6 +38,30 @@ pub(crate) fn value_to_string( Ok(Arc::new(builder.finish())) } +pub(crate) fn value_to_string_view( + array: &dyn Array, + options: &CastOptions, +) -> Result { + let mut builder = StringViewBuilder::with_capacity(array.len()); + let formatter = ArrayFormatter::try_new(array, &options.format_options)?; + let nulls = array.nulls(); + // buffer to avoid reallocating on each value + // TODO: replace with write to builder after https://github.com/apache/arrow-rs/issues/6373 + let mut buffer = String::new(); + for i in 0..array.len() { + match nulls.map(|x| x.is_null(i)).unwrap_or_default() { + true => builder.append_null(), + false => { + // write to buffer first and then copy into target array + buffer.clear(); + formatter.value(i).write(&mut buffer)?; + builder.append_value(&buffer) + } + } + } + Ok(Arc::new(builder.finish())) +} + /// Parse UTF-8 pub(crate) fn parse_string( array: &dyn Array, @@ -344,19 +368,14 @@ pub(crate) fn cast_binary_to_string( } } -/// Casts Utf8 to Boolean -pub(crate) fn cast_utf8_to_boolean( - from: &dyn Array, +/// Casts string to boolean +fn cast_string_to_boolean<'a, StrArray>( + array: &StrArray, cast_options: &CastOptions, ) -> Result where - OffsetSize: OffsetSizeTrait, + StrArray: StringArrayType<'a>, { - let array = from - .as_any() - .downcast_ref::>() - .unwrap(); - let output_array = array .iter() .map(|value| match value { @@ -378,3 +397,27 @@ where Ok(Arc::new(output_array)) } + +pub(crate) fn cast_utf8_to_boolean( + from: &dyn Array, + cast_options: &CastOptions, +) -> Result +where + OffsetSize: OffsetSizeTrait, +{ + let array = from + .as_any() + .downcast_ref::>() + .unwrap(); + + cast_string_to_boolean(&array, cast_options) +} + +pub(crate) fn cast_utf8view_to_boolean( + from: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let array = from.as_any().downcast_ref::().unwrap(); + + cast_string_to_boolean(&array, cast_options) +} diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 4bd94c13fe8d..f4c4639c1c08 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -497,6 +497,10 @@ parser_primitive!(Int64Type); parser_primitive!(Int32Type); parser_primitive!(Int16Type); parser_primitive!(Int8Type); +parser_primitive!(DurationNanosecondType); +parser_primitive!(DurationMicrosecondType); +parser_primitive!(DurationMillisecondType); +parser_primitive!(DurationSecondType); impl Parser for TimestampNanosecondType { fn parse(string: &str) -> Option { diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 4a3cbda283a5..ad3b952c327d 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -296,7 +296,7 @@ mod tests { fn test_pretty_format_fixed_size_list() { // define a schema. 
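// Usage sketch (illustrative, not part of the patch) for the Utf8View support added
// in arrow-cast above: numeric values can now be rendered to Utf8View and Utf8View
// strings parsed to booleans, both via the public `cast` kernel.
fn utf8view_cast_sketch() {
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::{ArrayRef, BooleanArray, Int32Array, StringViewArray};
    use arrow_cast::cast;
    use arrow_schema::DataType;

    // Int32 -> Utf8View
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
    let views = cast(&ints, &DataType::Utf8View).unwrap();
    assert_eq!(views.as_string_view().value(0), "1");

    // Utf8View -> Boolean (invalid strings become null under the default safe options)
    let flags: ArrayRef = Arc::new(StringViewArray::from(vec!["true", "false", "maybe"]));
    let bools = cast(&flags, &DataType::Boolean).unwrap();
    assert_eq!(
        bools.as_boolean(),
        &BooleanArray::from(vec![Some(true), Some(false), None])
    );
}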
let field_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 3); let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); let keys_builder = Int32Array::builder(3); diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index be213c9363c2..8823924eb55b 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -35,18 +35,16 @@ bench = false [dependencies] arrow-array = { workspace = true } -arrow-buffer = { workspace = true } arrow-cast = { workspace = true } -arrow-data = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1" } lazy_static = { version = "1.4", default-features = false } -lexical-core = { version = "1.0", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] +arrow-buffer = { workspace = true } tempfile = "3.3" futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } diff --git a/arrow-csv/LICENSE.txt b/arrow-csv/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-csv/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-csv/NOTICE.txt b/arrow-csv/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-csv/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index c91b436f6cce..d3d518316397 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -136,7 +136,7 @@ use lazy_static::lazy_static; use regex::{Regex, RegexSet}; use std::fmt::{self, Debug}; use std::fs::File; -use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; +use std::io::{BufRead, BufReader as StdBufReader, Read}; use std::sync::Arc; use crate::map_csv_error; @@ -241,7 +241,7 @@ pub struct Format { } impl Format { - /// Specify whether the CSV file has a header, defaults to `true` + /// Specify whether the CSV file has a header, defaults to `false` /// /// When `true`, the first row of the CSV file is treated as a header row pub fn with_header(mut self, has_header: bool) -> Self { @@ -399,51 +399,6 @@ impl Format { } } -/// Infer the schema of a CSV file by reading through the first n records of the file, -/// with `max_read_records` controlling the maximum number of records to read. -/// -/// If `max_read_records` is not set, the whole file is read to infer its schema. -/// -/// Return inferred schema and number of records used for inference. This function does not change -/// reader cursor offset. -/// -/// The inferred schema will always have each field set as nullable. -#[deprecated(note = "Use Format::infer_schema")] -#[allow(deprecated)] -pub fn infer_file_schema( - mut reader: R, - delimiter: u8, - max_read_records: Option, - has_header: bool, -) -> Result<(Schema, usize), ArrowError> { - let saved_offset = reader.stream_position()?; - let r = infer_reader_schema(&mut reader, delimiter, max_read_records, has_header)?; - // return the reader seek back to the start - reader.seek(SeekFrom::Start(saved_offset))?; - Ok(r) -} - -/// Infer schema of CSV records provided by struct that implements `Read` trait. -/// -/// `max_read_records` controlling the maximum number of records to read. 
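// Sketch of the `Format::infer_schema` call that replaces the deprecated
// `infer_file_schema`/`infer_reader_schema` helpers removed here (illustrative only;
// the CSV content and column names are made up).
fn infer_schema_sketch() -> Result<(), arrow_schema::ArrowError> {
    use std::io::Cursor;

    use arrow_csv::reader::Format;

    let csv = "city,population\nOslo,700000\nBergen,290000\n";
    let format = Format::default().with_header(true).with_delimiter(b',');
    let (schema, records_read) = format.infer_schema(Cursor::new(csv), None)?;
    println!("inferred {schema:?} from {records_read} records");
    Ok(())
}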
If `max_read_records` is -/// not set, all records are read to infer the schema. -/// -/// Return inferred schema and number of records used for inference. -#[deprecated(note = "Use Format::infer_schema")] -pub fn infer_reader_schema( - reader: R, - delimiter: u8, - max_read_records: Option, - has_header: bool, -) -> Result<(Schema, usize), ArrowError> { - let format = Format { - delimiter: Some(delimiter), - header: has_header, - ..Default::default() - }; - format.infer_schema(reader, max_read_records) -} - /// Infer schema from a list of CSV files by reading through first n records /// with `max_read_records` controlling the maximum number of records to read. /// @@ -824,42 +779,66 @@ fn parse( match key_type.as_ref() { DataType::Int8 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::Int16 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::Int32 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::Int64 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::UInt8 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::UInt16 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::UInt32 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), DataType::UInt64 => Ok(Arc::new( rows.iter() - .map(|row| row.get(i)) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::>(), ) as ArrayRef), _ => Err(ArrowError::ParseError(format!( @@ -1101,14 +1080,6 @@ impl ReaderBuilder { } } - /// Set whether the CSV file has headers - #[deprecated(note = "Use with_header")] - #[doc(hidden)] - pub fn has_header(mut self, has_header: bool) -> Self { - self.format.header = has_header; - self - } - /// Set whether the CSV file has a header pub fn with_header(mut self, has_header: bool) -> Self { self.format.header = has_header; @@ -1236,7 +1207,7 @@ impl ReaderBuilder { mod tests { use super::*; - use std::io::{Cursor, Write}; + use std::io::{Cursor, Seek, SeekFrom, Write}; use tempfile::NamedTempFile; use arrow_array::cast::AsArray; @@ -1528,6 +1499,40 @@ mod tests { assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); } + #[test] + fn test_csv_with_nullable_dictionary() { + let offset_type = vec![ + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::UInt8, + DataType::UInt16, + DataType::UInt32, + DataType::UInt64, + ]; + for data_type in offset_type { + let file = File::open("test/data/dictionary_nullable_test.csv").unwrap(); + let dictionary_type = + DataType::Dictionary(Box::new(data_type), Box::new(DataType::Utf8)); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("name", dictionary_type.clone(), true), + ])); + + let mut csv = 
ReaderBuilder::new(schema) + .build(file.try_clone().unwrap()) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + assert_eq!(3, batch.num_rows()); + assert_eq!(2, batch.num_columns()); + + let names = arrow_cast::cast(batch.column(1), &dictionary_type).unwrap(); + assert!(!names.is_null(2)); + assert!(names.is_null(1)); + } + } #[test] fn test_nulls() { let schema = Arc::new(Schema::new(vec![ diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index eae2133a4623..c5a0a0b76d59 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -256,14 +256,6 @@ impl WriterBuilder { Self::default() } - /// Set whether to write headers - #[deprecated(note = "Use Self::with_header")] - #[doc(hidden)] - pub fn has_headers(mut self, has_headers: bool) -> Self { - self.has_header = has_headers; - self - } - /// Set whether to write the CSV file with a header pub fn with_header(mut self, header: bool) -> Self { self.has_header = header; @@ -397,17 +389,6 @@ impl WriterBuilder { self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE) } - /// Use RFC3339 format for date/time/timestamps (default) - #[deprecated(note = "Use WriterBuilder::default()")] - pub fn with_rfc3339(mut self) -> Self { - self.date_format = None; - self.datetime_format = None; - self.time_format = None; - self.timestamp_format = None; - self.timestamp_tz_format = None; - self - } - /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { let mut builder = csv::WriterBuilder::new(); diff --git a/arrow-csv/test/data/dictionary_nullable_test.csv b/arrow-csv/test/data/dictionary_nullable_test.csv new file mode 100644 index 000000000000..c9ada5293b70 --- /dev/null +++ b/arrow-csv/test/data/dictionary_nullable_test.csv @@ -0,0 +1,3 @@ +id,name +1, +2,bob diff --git a/arrow-data/LICENSE.txt b/arrow-data/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-data/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-data/NOTICE.txt b/arrow-data/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-data/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 8af2a91cf159..a35b5e8629e9 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -30,11 +30,6 @@ use std::sync::Arc; use crate::{equal, validate_binary_view, validate_string_view}; -/// A collection of [`Buffer`] -#[doc(hidden)] -#[deprecated(note = "Use [Buffer]")] -pub type Buffers<'a> = &'a [Buffer]; - #[inline] pub(crate) fn contains_nulls( null_bit_buffer: Option<&NullBuffer>, diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 702cb1360c2d..fbb295036a9b 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -43,11 +43,11 @@ base64 = { version = "0.22", default-features = false, features = ["std"] } bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } -paste = { version = "1.0" } +paste = { version = "1.0" , optional = true } prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] } # For Timestamp type prost-types = { version = "0.13.1", default-features = false } -tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"], 
optional = true } tonic = { version = "0.12.3", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies @@ -61,11 +61,10 @@ all-features = true [features] default = [] -flight-sql-experimental = ["arrow-arith", "arrow-data", "arrow-ord", "arrow-row", "arrow-select", "arrow-string", "once_cell"] +flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell", "dep:paste"] tls = ["tonic/tls"] - # Enable CLI tools -cli = ["anyhow", "arrow-array/chrono-tz", "arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["arrow-array/chrono-tz", "arrow-cast/prettyprint", "tonic/tls-webpki-roots", "dep:anyhow", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber"] [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } @@ -75,6 +74,9 @@ http-body = "1.0.0" hyper-util = "0.1" pin-project-lite = "0.2" tempfile = "3.3" +tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } tokio-stream = { version = "0.1", features = ["net"] } tower = { version = "0.5.0", features = ["util"] } uuid = { version = "1.10.0", features = ["v4"] } diff --git a/arrow-flight/LICENSE.txt b/arrow-flight/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-flight/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-flight/NOTICE.txt b/arrow-flight/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-flight/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-flight/README.md b/arrow-flight/README.md index df74bc012a1c..661abfc58691 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -31,14 +31,14 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "53.2.0" +arrow-flight = "53.3.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. This crate provides a Rust implementation of the [Flight.proto](../format/Flight.proto) gRPC protocol and -[examples](https://github.com/apache/arrow-rs/tree/master/arrow-flight/examples) +[examples](https://github.com/apache/arrow-rs/tree/main/arrow-flight/examples) that demonstrate how to build a Flight server implemented with [tonic](https://docs.rs/crate/tonic/latest). 
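A minimal client sketch (illustrative only; it assumes a Flight server is already listening on `localhost:50051` and that `tokio` is available in the consuming project):

```rust
use arrow_flight::flight_service_client::FlightServiceClient;
use arrow_flight::Criteria;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Connect to a running Flight server and list the flights it exposes.
    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;
    let mut stream = client.list_flights(Criteria::default()).await?.into_inner();
    while let Some(info) = stream.message().await? {
        println!("{info:?}");
    }
    Ok(())
}
```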
## Feature Flags diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index c7fe89beb93a..6358227a8912 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,5 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.89", default-features = false } -prost-build = { version = "=0.13.3", default-features = false } +prost-build = { version = "=0.13.4", default-features = false } tonic-build = { version = "=0.12.3", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index ae3475c7c7d7..315b7b3cb6e5 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -535,8 +535,10 @@ fn prepare_field_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -583,7 +585,9 @@ fn prepare_schema_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -650,10 +654,12 @@ struct FlightIpcEncoder { impl FlightIpcEncoder { fn new(options: IpcWriteOptions, error_on_replacement: bool) -> Self { + #[allow(deprecated)] let preserve_dict_id = options.preserve_dict_id(); Self { options, data_gen: IpcDataGenerator::default(), + #[allow(deprecated)] dictionary_tracker: DictionaryTracker::new_with_preserve_dict_id( error_on_replacement, preserve_dict_id, @@ -934,7 +940,7 @@ mod tests { let mut decoder = FlightDataDecoder::new(encoder); let expected_schema = Schema::new(vec![Field::new_list( "dict_list", - Field::new("item", DataType::Utf8, true), + Field::new_list_field(DataType::Utf8, true), true, )]); @@ -1038,7 +1044,7 @@ mod tests { "struct", vec![Field::new_list( "dict_list", - Field::new("item", DataType::Utf8, true), + Field::new_list_field(DataType::Utf8, true), true, )], true, @@ -1218,12 +1224,16 @@ mod tests { let hydrated_struct_fields = vec![Field::new_list( "dict_list", - Field::new("item", DataType::Utf8, true), + Field::new_list_field(DataType::Utf8, true), true, )]; let hydrated_union_fields = vec![ - Field::new_list("dict_list", Field::new("item", DataType::Utf8, true), true), + Field::new_list( + "dict_list", + Field::new_list_field(DataType::Utf8, true), + true, + ), Field::new_struct("struct", hydrated_struct_fields.clone(), true), Field::new("string", DataType::Utf8, true), ]; @@ -1537,6 +1547,7 @@ mod tests { async fn verify_flight_round_trip(mut batches: Vec) { let expected_schema = batches.first().unwrap().schema(); + #[allow(deprecated)] let encoder = FlightDataEncoderBuilder::default() .with_options(IpcWriteOptions::default().with_preserve_dict_id(false)) .with_dictionary_handling(DictionaryHandling::Resend) @@ -1564,6 +1575,7 @@ mod tests { HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), ); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let got = prepare_schema_for_flight(&schema, &mut dictionary_tracker, false); @@ -1582,12 +1594,30 @@ mod tests { hydrate_dictionaries(&batch, batch.schema()).expect("failed to optimize"); } - pub fn make_flight_data( + fn make_flight_data( + 
batch: &RecordBatch, + options: &IpcWriteOptions, + ) -> (Vec, FlightData) { + flight_data_from_arrow_batch(batch, options) + } + + fn flight_data_from_arrow_batch( batch: &RecordBatch, options: &IpcWriteOptions, ) -> (Vec, FlightData) { + let data_gen = IpcDataGenerator::default(); #[allow(deprecated)] - crate::utils::flight_data_from_arrow_batch(batch, options) + let mut dictionary_tracker = + DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); + + let (encoded_dictionaries, encoded_batch) = data_gen + .encoded_batch(batch, &mut dictionary_tracker, options) + .expect("DictionaryTracker configured above to not error on replacement"); + + let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); + let flight_batch = encoded_batch.into(); + + (flight_dictionaries, flight_batch) } #[test] @@ -1741,7 +1771,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 160).await; + verify_encoded_split(batch, 48).await; } #[tokio::test] diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 9f18416c06ec..1dd2700794f3 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -38,6 +38,8 @@ //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] #![warn(missing_docs)] +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; @@ -141,6 +143,7 @@ pub struct IpcMessage(pub Bytes); fn flight_schema_as_encoded_data(arrow_schema: &Schema, options: &IpcWriteOptions) -> EncodedData { let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); data_gen.schema_to_bytes_with_dictionary_tracker(arrow_schema, &mut dict_tracker, options) diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index e45e505b2b61..a6e228737b3f 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -228,8 +228,8 @@ impl FlightSqlServiceClient { .await .map_err(status_to_arrow_error)? .unwrap(); - let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; - let result: DoPutUpdateResult = any.unpack()?.unwrap(); + let result: DoPutUpdateResult = + Message::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; Ok(result.record_count) } @@ -274,8 +274,8 @@ impl FlightSqlServiceClient { .await .map_err(status_to_arrow_error)? .unwrap(); - let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; - let result: DoPutUpdateResult = any.unpack()?.unwrap(); + let result: DoPutUpdateResult = + Message::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; Ok(result.record_count) } @@ -593,8 +593,8 @@ impl PreparedStatement { .await .map_err(status_to_arrow_error)? 
.unwrap(); - let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; - let result: DoPutUpdateResult = any.unpack()?.unwrap(); + let result: DoPutUpdateResult = + Message::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; Ok(result.record_count) } diff --git a/arrow-flight/src/sql/metadata/catalogs.rs b/arrow-flight/src/sql/metadata/catalogs.rs index 327fed81077b..e27c63c3932f 100644 --- a/arrow-flight/src/sql/metadata/catalogs.rs +++ b/arrow-flight/src/sql/metadata/catalogs.rs @@ -68,7 +68,8 @@ impl GetCatalogsBuilder { /// builds a `RecordBatch` with the correct schema for a /// [`CommandGetCatalogs`] response pub fn build(self) -> Result { - let Self { catalogs } = self; + let Self { mut catalogs } = self; + catalogs.sort_unstable(); let batch = RecordBatch::try_new( Arc::clone(&GET_CATALOG_SCHEMA), @@ -98,3 +99,30 @@ static GET_CATALOG_SCHEMA: Lazy = Lazy::new(|| { false, )])) }); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_catalogs_are_sorted() { + let batch = ["a_catalog", "c_catalog", "b_catalog"] + .into_iter() + .fold(GetCatalogsBuilder::new(), |mut builder, catalog| { + builder.append(catalog); + builder + }) + .build() + .unwrap(); + let catalogs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .flatten() + .collect::>(); + assert!(catalogs.is_sorted()); + assert_eq!(catalogs, ["a_catalog", "b_catalog", "c_catalog"]); + } +} diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 2ea30df7fc2f..58b228530942 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -172,7 +172,7 @@ static UNION_TYPE: Lazy = Lazy::new(|| { // treat list as nullable b/c that is what the builders make Field::new( "string_list", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), true, ), Field::new( @@ -184,7 +184,7 @@ static UNION_TYPE: Lazy = Lazy::new(|| { Field::new("keys", DataType::Int32, false), Field::new( "values", - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), true, ), ])), diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index 485bedaebfb0..a3a18ca10888 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -330,7 +330,7 @@ static GET_XDBC_INFO_SCHEMA: Lazy = Lazy::new(|| { Field::new("literal_suffix", DataType::Utf8, true), Field::new( "create_params", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, false))), true, ), Field::new("nullable", DataType::Int32, false), diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 6b9befa63600..8ab8a16dbb50 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -719,7 +719,7 @@ where let record_count = self.do_put_statement_update(command, request).await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.as_any().encode_to_vec().into(), + app_metadata: result.encode_to_vec().into(), })]); Ok(Response::new(Box::pin(output))) } @@ -727,7 +727,7 @@ where let record_count = self.do_put_statement_ingest(command, request).await?; let result = 
DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.as_any().encode_to_vec().into(), + app_metadata: result.encode_to_vec().into(), })]); Ok(Response::new(Box::pin(output))) } @@ -744,7 +744,7 @@ where let record_count = self.do_put_substrait_plan(command, request).await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.as_any().encode_to_vec().into(), + app_metadata: result.encode_to_vec().into(), })]); Ok(Response::new(Box::pin(output))) } @@ -754,7 +754,7 @@ where .await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.as_any().encode_to_vec().into(), + app_metadata: result.encode_to_vec().into(), })]); Ok(Response::new(Box::pin(output))) } diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index f6129ddfe248..428dde73ca6c 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -17,8 +17,7 @@ //! Utilities to assist with reading and writing Arrow data as Flight messages -use crate::{FlightData, IpcMessage, SchemaAsIpc, SchemaResult}; -use bytes::Bytes; +use crate::{FlightData, SchemaAsIpc}; use std::collections::HashMap; use std::sync::Arc; @@ -28,30 +27,6 @@ use arrow_ipc::convert::fb_to_schema; use arrow_ipc::{reader, root_as_message, writer, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema, SchemaRef}; -/// Convert a `RecordBatch` to a vector of `FlightData` representing the bytes of the dictionaries -/// and a `FlightData` representing the bytes of the batch's values -#[deprecated( - since = "30.0.0", - note = "Use IpcDataGenerator directly with DictionaryTracker to avoid re-sending dictionaries" -)] -pub fn flight_data_from_arrow_batch( - batch: &RecordBatch, - options: &IpcWriteOptions, -) -> (Vec, FlightData) { - let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = - writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); - - let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, options) - .expect("DictionaryTracker configured above to not error on replacement"); - - let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); - let flight_batch = encoded_batch.into(); - - (flight_dictionaries, flight_batch) -} - /// Convert a slice of wire protocol `FlightData`s into a vector of `RecordBatch`es pub fn flight_data_to_batches(flight_data: &[FlightData]) -> Result, ArrowError> { let schema = flight_data.first().ok_or_else(|| { @@ -104,41 +79,6 @@ pub fn flight_data_to_arrow_batch( })? 
} -/// Convert a `Schema` to `SchemaResult` by converting to an IPC message -#[deprecated( - since = "4.4.0", - note = "Use From trait, e.g.: SchemaAsIpc::new(schema, options).try_into()" -)] -pub fn flight_schema_from_arrow_schema( - schema: &Schema, - options: &IpcWriteOptions, -) -> Result { - SchemaAsIpc::new(schema, options).try_into() -} - -/// Convert a `Schema` to `FlightData` by converting to an IPC message -#[deprecated( - since = "4.4.0", - note = "Use From trait, e.g.: SchemaAsIpc::new(schema, options).into()" -)] -pub fn flight_data_from_arrow_schema(schema: &Schema, options: &IpcWriteOptions) -> FlightData { - SchemaAsIpc::new(schema, options).into() -} - -/// Convert a `Schema` to bytes in the format expected in `FlightInfo.schema` -#[deprecated( - since = "4.4.0", - note = "Use TryFrom trait, e.g.: SchemaAsIpc::new(schema, options).try_into()" -)] -pub fn ipc_message_from_arrow_schema( - schema: &Schema, - options: &IpcWriteOptions, -) -> Result { - let message = SchemaAsIpc::new(schema, options).try_into()?; - let IpcMessage(vals) = message; - Ok(vals) -} - /// Convert `RecordBatch`es to wire protocol `FlightData`s pub fn batches_to_flight_data( schema: &Schema, @@ -150,6 +90,7 @@ pub fn batches_to_flight_data( let mut flight_data = vec![]; let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-integration-test/LICENSE.txt b/arrow-integration-test/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-integration-test/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-integration-test/NOTICE.txt b/arrow-integration-test/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-integration-test/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 32edc4165938..4b896ed391be 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -252,6 +252,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { _ => data_type, }; + #[allow(deprecated)] let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) @@ -274,17 +275,21 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { }; match field.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": field.name(), - "nullable": field.is_nullable(), - "type": data_type_to_json(value_type), - "children": children, - "dictionary": { - "id": field.dict_id().unwrap(), - "indexType": data_type_to_json(index_type), - "isOrdered": field.dict_is_ordered().unwrap(), - } - }), + DataType::Dictionary(ref index_type, ref value_type) => { + #[allow(deprecated)] + let dict_id = field.dict_id().unwrap(); + serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": dict_id, + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }) + } _ => serde_json::json!({ "name": field.name(), "nullable": field.is_nullable(), diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index ea5b545f2e81..f025009c22de 100644 --- 
a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -787,6 +787,7 @@ pub fn array_from_json( Ok(Arc::new(array)) } DataType::Dictionary(key_type, value_type) => { + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) })?; @@ -930,10 +931,12 @@ pub fn dictionary_array_from_json( let null_buf = create_null_buf(&json_col); // build the key data into a buffer, then construct values separately + #[allow(deprecated)] let key_field = Field::new_dict( "key", dict_key.clone(), field.is_nullable(), + #[allow(deprecated)] field .dict_id() .expect("Dictionary fields must have a dict_id value"), @@ -1192,7 +1195,7 @@ mod tests { Field::new("utf8s", DataType::Utf8, true), Field::new( "lists", - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), true, ), Field::new( @@ -1249,7 +1252,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from_slice_ref([0, 3, 4, 4]); - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index 541a1ec746ac..512f0aed8e54 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -150,7 +150,7 @@ mod tests { Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), Field::new( "c22", - DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))), false, ), Field::new( @@ -189,6 +189,7 @@ mod tests { Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c33", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 7be56d919852..8654b4b92734 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -36,20 +36,17 @@ logging = ["tracing-subscriber"] [dependencies] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } -arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } -async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } -hex = { version = "0.4", default-features = false, features = ["std"] } prost = { version = "0.13", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -tokio = { version = "1.0", default-features = false } +tokio = { version = "1.0", default-features = false, 
features = [ "rt-multi-thread"] } tonic = { version = "0.12", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } -num = { version = "0.4", default-features = false, features = ["std"] } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } [dev-dependencies] +arrow-buffer = { path = "../arrow-buffer", default-features = false } tempfile = { version = "3", default-features = false } diff --git a/arrow-integration-testing/README.md b/arrow-integration-testing/README.md index dcf39c27fbc5..86c79f5030ce 100644 --- a/arrow-integration-testing/README.md +++ b/arrow-integration-testing/README.md @@ -53,7 +53,7 @@ pip install -e dev/archery[integration] ### Build the C++ binaries: -Follow the [C++ Direction](https://github.com/apache/arrow/tree/master/docs/source/developers/cpp) and build the integration test binaries with a command like this: +Follow the [C++ Direction](https://github.com/apache/arrow/tree/main/docs/source/developers/cpp) and build the integration test binaries with a command like this: ``` # build cpp binaries diff --git a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs index 3e027faef91f..661f0a047db4 100644 --- a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs +++ b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::Result; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::StreamWriter; diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index cc3dd2110e36..6a901cc63bab 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; diff --git a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs index 07ac5c7ddd42..8b4bb332781c 100644 --- a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs +++ b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use std::io; use arrow::error::Result; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index b8bbb952837b..0d16fe3b403f 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. 
+// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_client_scenarios; use clap::Parser; type Error = Box; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-server.rs b/arrow-integration-testing/src/bin/flight-test-integration-server.rs index 5310d07d4f8e..94be71309799 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-server.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-server.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_server_scenarios; use clap::Parser; diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index c8289ff446a0..406419028d00 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -29,7 +29,7 @@ use arrow::{ }; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, - utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, Location, SchemaAsIpc, Ticket, + utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, IpcMessage, Location, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, stream, StreamExt}; use tonic::{Request, Streaming}; @@ -72,7 +72,20 @@ async fn upload_data( let (mut upload_tx, upload_rx) = mpsc::channel(10); let options = arrow::ipc::writer::IpcWriteOptions::default(); - let mut schema_flight_data: FlightData = SchemaAsIpc::new(&schema, &options).into(); + #[allow(deprecated)] + let mut dict_tracker = + writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); + let data_gen = writer::IpcDataGenerator::default(); + let data = IpcMessage( + data_gen + .schema_to_bytes_with_dictionary_tracker(&schema, &mut dict_tracker, &options) + .ipc_message + .into(), + ); + let mut schema_flight_data = FlightData { + data_header: data.0, + ..Default::default() + }; // arrow_flight::utils::flight_data_from_arrow_schema(&schema, &options); schema_flight_data.flight_descriptor = Some(descriptor.clone()); upload_tx.send(schema_flight_data).await?; @@ -82,7 +95,14 @@ async fn upload_data( if let Some((counter, first_batch)) = original_data_iter.next() { let metadata = counter.to_string().into_bytes(); // Preload the first batch into the channel before starting the request - send_batch(&mut upload_tx, &metadata, first_batch, &options).await?; + send_batch( + &mut upload_tx, + &metadata, + first_batch, + &options, + &mut dict_tracker, + ) + .await?; let outer = client.do_put(Request::new(upload_rx)).await?; let mut inner = outer.into_inner(); @@ -97,7 +117,14 @@ async fn upload_data( // Stream the rest of the batches for (counter, batch) in original_data_iter { let metadata = counter.to_string().into_bytes(); - send_batch(&mut upload_tx, &metadata, batch, &options).await?; + send_batch( + &mut upload_tx, + &metadata, + batch, + &options, + &mut dict_tracker, + ) + .await?; let r = inner .next() @@ -124,12 +151,12 @@ async fn send_batch( metadata: &[u8], batch: &RecordBatch, options: &writer::IpcWriteOptions, + dictionary_tracker: 
&mut writer::DictionaryTracker, ) -> Result { let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, true); let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, options) + .encoded_batch(batch, dictionary_tracker, options) .expect("DictionaryTracker configured above to not error on replacement"); let dictionary_flight_data: Vec = diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 0f404b2ae289..92989a20393e 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -119,18 +119,32 @@ impl FlightService for FlightServiceImpl { .ok_or_else(|| Status::not_found(format!("Could not find flight. {key}")))?; let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] + let mut dictionary_tracker = + writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); + let data_gen = writer::IpcDataGenerator::default(); + let data = IpcMessage( + data_gen + .schema_to_bytes_with_dictionary_tracker( + &flight.schema, + &mut dictionary_tracker, + &options, + ) + .ipc_message + .into(), + ); + let schema_flight_data = FlightData { + data_header: data.0, + ..Default::default() + }; - let schema = std::iter::once(Ok(SchemaAsIpc::new(&flight.schema, &options).into())); + let schema = std::iter::once(Ok(schema_flight_data)); let batches = flight .chunks .iter() .enumerate() .flat_map(|(counter, batch)| { - let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = - writer::DictionaryTracker::new_with_preserve_dict_id(false, true); - let (encoded_dictionaries, encoded_batch) = data_gen .encoded_batch(batch, &mut dictionary_tracker, &options) .expect("DictionaryTracker configured above to not error on replacement"); diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index c8ce01e9f13b..e669690ef4f5 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -17,6 +17,8 @@ //! 
Common code used in the integration test binaries +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] #![warn(missing_docs)] use serde_json::Value; diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 94b89a55f2fb..cf91b3a3415f 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -36,7 +36,6 @@ bench = false [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } -arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } flatbuffers = { version = "24.3.25", default-features = false } diff --git a/arrow-ipc/LICENSE.txt b/arrow-ipc/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-ipc/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-ipc/NOTICE.txt b/arrow-ipc/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-ipc/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 18f5193bf038..37c5a19439c1 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -165,6 +165,7 @@ pub fn schema_to_fb_offset<'a>( impl From> for Field { fn from(field: crate::Field) -> Field { let arrow_field = if let Some(dictionary) = field.dictionary() { + #[allow(deprecated)] Field::new_dict( field.name().unwrap(), get_data_type(field, true), @@ -519,6 +520,7 @@ pub(crate) fn build_field<'a>( match dictionary_tracker { Some(tracker) => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] tracker.set_dict_id(field), field .dict_is_ordered() @@ -527,6 +529,7 @@ pub(crate) fn build_field<'a>( )), None => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] field .dict_id() .expect("Dictionary type must have a dictionary id"), @@ -1026,10 +1029,14 @@ mod tests { Field::new("utf8_view", DataType::Utf8View, false), Field::new("binary", DataType::Binary, false), Field::new("binary_view", DataType::BinaryView, false), - Field::new_list("list[u8]", Field::new("item", DataType::UInt8, false), true), + Field::new_list( + "list[u8]", + Field::new_list_field(DataType::UInt8, false), + true, + ), Field::new_fixed_size_list( "fixed_size_list[u8]", - Field::new("item", DataType::UInt8, false), + Field::new_list_field(DataType::UInt8, false), 2, true, ), @@ -1139,6 +1146,7 @@ mod tests { ), true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -1146,6 +1154,7 @@ mod tests { 123, true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index dcded32882fc..9ff4da30ed8c 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -196,6 +196,7 @@ fn create_array( let index_node = reader.next_node(field)?; let index_buffers = [reader.next_buffer()?, reader.next_buffer()?]; + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::ParseError(format!("Field {field} does not have dict id")) })?; @@ -617,6 +618,7 @@ fn read_dictionary_impl( } let id = batch.id(); + #[allow(deprecated)] let fields_using_this_dictionary = schema.fields_with_dict_id(id); let first_field = fields_using_this_dictionary.first().ok_or_else(|| { 
ArrowError::InvalidArgumentError(format!("dictionary id {id} not found in schema")) @@ -1407,10 +1409,10 @@ mod tests { fn create_test_projection_schema() -> Schema { // define field types - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let fixed_size_list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, false)), 3); let union_fields = UnionFields::new( vec![0, 1], @@ -1424,7 +1426,7 @@ mod tests { let struct_fields = Fields::from(vec![ Field::new("id", DataType::Int32, false), - Field::new_list("list", Field::new("item", DataType::Int8, true), false), + Field::new_list("list", Field::new_list_field(DataType::Int8, true), false), ]); let struct_data_type = DataType::Struct(struct_fields); @@ -1725,6 +1727,7 @@ mod tests { let mut writer = crate::writer::FileWriter::try_new_with_options( &mut buf, batch.schema_ref(), + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .unwrap(); @@ -1778,7 +1781,7 @@ mod tests { #[test] fn test_roundtrip_struct_empty_fields() { - let nulls = NullBuffer::from(&[true, true, false][..]); + let nulls = NullBuffer::from(&[true, true, false]); let rb = RecordBatch::try_from_iter([( "", Arc::new(StructArray::new_empty_fields(nulls.len(), Some(nulls))) as _, @@ -1869,6 +1872,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, values); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1876,6 +1880,7 @@ mod tests { 1, false, )); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1956,6 +1961,7 @@ mod tests { #[test] fn test_roundtrip_stream_dict_of_list_of_dict() { // list + #[allow(deprecated)] let list_data_type = DataType::List(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1967,6 +1973,7 @@ mod tests { test_roundtrip_stream_dict_of_list_of_dict_impl::(list_data_type, offsets); // large list + #[allow(deprecated)] let list_data_type = DataType::LargeList(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1985,6 +1992,7 @@ mod tests { let dict_array = DictionaryArray::new(keys, Arc::new(values)); let dict_data = dict_array.into_data(); + #[allow(deprecated)] let list_data_type = DataType::FixedSizeList( Arc::new(Field::new_dict( "item", @@ -2075,6 +2083,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone()); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)), @@ -2085,6 +2094,7 @@ mod tests { let value_dict_keys = Int8Array::from_iter_values([0, 3, 0, 1, 2, 0, 1]); let value_dict_array = DictionaryArray::new(value_dict_keys, bin_view_array); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::BinaryView)), @@ -2150,6 +2160,7 @@ mod tests { .unwrap(); let gen = 
IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2187,6 +2198,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2326,6 +2338,7 @@ mod tests { ["a", "b"] .iter() .map(|name| { + #[allow(deprecated)] Field::new_dict( name.to_string(), DataType::Dictionary( @@ -2360,6 +2373,7 @@ mod tests { let mut writer = crate::writer::StreamWriter::try_new_with_options( &mut buf, batch.schema().as_ref(), + #[allow(deprecated)] crate::writer::IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/reader/stream.rs b/arrow-ipc/src/reader/stream.rs index de5f5bdd629f..9b0eea9b6198 100644 --- a/arrow-ipc/src/reader/stream.rs +++ b/arrow-ipc/src/reader/stream.rs @@ -324,6 +324,7 @@ mod tests { "test1", DataType::RunEndEncoded( Arc::new(Field::new("run_ends".to_string(), DataType::Int32, false)), + #[allow(deprecated)] Arc::new(Field::new_dict( "values".to_string(), DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -353,6 +354,7 @@ mod tests { let mut writer = StreamWriter::try_new_with_options( &mut buffer, &schema, + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index e6fc9d81df67..ee5b9a54cc90 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -23,6 +23,7 @@ use std::cmp::min; use std::collections::HashMap; use std::io::{BufWriter, Write}; +use std::mem::size_of; use std::sync::Arc; use flatbuffers::FlatBufferBuilder; @@ -63,7 +64,11 @@ pub struct IpcWriteOptions { /// Flag indicating whether the writer should preserve the dictionary IDs defined in the /// schema or generate unique dictionary IDs internally during encoding. /// - /// Defaults to `true` + /// Defaults to `false` + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] preserve_dict_id: bool, } @@ -107,12 +112,13 @@ impl IpcWriteOptions { | crate::MetadataVersion::V3 => Err(ArrowError::InvalidArgumentError( "Writing IPC metadata version 3 and lower not supported".to_string(), )), + #[allow(deprecated)] crate::MetadataVersion::V4 => Ok(Self { alignment, write_legacy_ipc_format, metadata_version, batch_compression_type: None, - preserve_dict_id: true, + preserve_dict_id: false, }), crate::MetadataVersion::V5 => { if write_legacy_ipc_format { @@ -120,12 +126,13 @@ impl IpcWriteOptions { "Legacy IPC format only supported on metadata version 4".to_string(), )) } else { + #[allow(deprecated)] Ok(Self { alignment, write_legacy_ipc_format, metadata_version, batch_compression_type: None, - preserve_dict_id: true, + preserve_dict_id: false, }) } } @@ -137,7 +144,12 @@ impl IpcWriteOptions { /// Return whether the writer is configured to preserve the dictionary IDs /// defined in the schema + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." 
+ )] pub fn preserve_dict_id(&self) -> bool { + #[allow(deprecated)] self.preserve_dict_id } @@ -148,6 +160,11 @@ impl IpcWriteOptions { /// to the dictionary batches in order to encode them correctly /// /// The default will change to `false` in future releases + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] + #[allow(deprecated)] pub fn with_preserve_dict_id(mut self, preserve_dict_id: bool) -> Self { self.preserve_dict_id = preserve_dict_id; self @@ -156,12 +173,13 @@ impl IpcWriteOptions { impl Default for IpcWriteOptions { fn default() -> Self { + #[allow(deprecated)] Self { alignment: 64, write_legacy_ipc_format: false, metadata_version: crate::MetadataVersion::V5, batch_compression_type: None, - preserve_dict_id: true, + preserve_dict_id: false, } } } @@ -419,6 +437,7 @@ impl IpcDataGenerator { // It's importnat to only take the dict_id at this point, because the dict ID // sequence is assigned depth-first, so we need to first encode children and have // them take their assigned dict IDs before we take the dict ID for this field. + #[allow(deprecated)] let dict_id = dict_id_seq .next() .or_else(|| field.dict_id()) @@ -766,6 +785,10 @@ pub struct DictionaryTracker { written: HashMap, dict_ids: Vec, error_on_replacement: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] preserve_dict_id: bool, } @@ -781,11 +804,12 @@ impl DictionaryTracker { /// the last seen dictionary ID (or using `0` if no other dictionary IDs have been /// seen) pub fn new(error_on_replacement: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), error_on_replacement, - preserve_dict_id: true, + preserve_dict_id: false, } } @@ -794,7 +818,12 @@ impl DictionaryTracker { /// If `error_on_replacement` /// is true, an error will be generated if an update to an /// existing dictionary is attempted. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn new_with_preserve_dict_id(error_on_replacement: bool, preserve_dict_id: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -810,8 +839,14 @@ impl DictionaryTracker { /// /// If `preserve_dict_id` is false, this will return the value of the last `dict_id` assigned incremented by 1 /// or 0 in the case where no dictionary IDs have yet been assigned + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." 
+ )] pub fn set_dict_id(&mut self, field: &Field) -> i64 { + #[allow(deprecated)] let next = if self.preserve_dict_id { + #[allow(deprecated)] field.dict_id().expect("no dict_id in field") } else { self.dict_ids @@ -935,7 +970,9 @@ impl FileWriter { writer.write_all(&super::ARROW_MAGIC)?; writer.write_all(&PADDING[..pad_len])?; // write the schema, set the written bytes to the schema + header + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let encoded_message = data_gen.schema_to_bytes_with_dictionary_tracker( @@ -1012,7 +1049,9 @@ impl FileWriter { let mut fbb = FlatBufferBuilder::new(); let dictionaries = fbb.create_vector(&self.dictionary_blocks); let record_batches = fbb.create_vector(&self.record_blocks); + #[allow(deprecated)] let preserve_dict_id = self.write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let schema = IpcSchemaEncoder::new() @@ -1143,7 +1182,9 @@ impl StreamWriter { write_options: IpcWriteOptions, ) -> Result { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, preserve_dict_id); @@ -1430,7 +1471,13 @@ fn reencode_offsets( let end_offset = offset_slice.last().unwrap(); let offsets = match start_offset.as_usize() { - 0 => offsets.clone(), + 0 => { + let size = size_of::(); + offsets.slice_with_length( + data.offset() * size, + (data.offset() + data.len() + 1) * size, + ) + } _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), }; @@ -2025,6 +2072,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Field::new_dict("dict", array.data_type().clone(), false, 2, false); let union_fields = [(0, Arc::new(dctfield))].into_iter().collect(); @@ -2042,6 +2090,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); @@ -2058,6 +2107,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Arc::new(Field::new_dict( "dict", array.data_type().clone(), @@ -2078,6 +2128,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![struct_array]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); @@ -2517,6 +2568,36 @@ mod tests { ls.finish() } + fn generate_nested_list_data_starting_at_zero() -> GenericListArray { + let mut ls = + GenericListBuilder::::new(GenericListBuilder::::new(UInt32Builder::new())); + + for _i in 0..999 { + ls.values().append(true); + ls.append(true); + } + + for j in 0..10 { + for value in [j, j, j, j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + + for i in 0..9_000 { + for j in 0..10 { + for value in [i + j, i + j, i + j, i + j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + } + 
+ ls.finish() + } + fn generate_map_array_data() -> MapArray { let keys_builder = UInt32Builder::new(); let values_builder = UInt32Builder::new(); @@ -2556,7 +2637,7 @@ mod tests { #[test] fn encode_lists() { - let val_inner = Field::new("item", DataType::UInt32, true); + let val_inner = Field::new_list_field(DataType::UInt32, true); let val_list_field = Field::new("val", DataType::List(Arc::new(val_inner)), false); let schema = Arc::new(Schema::new(vec![val_list_field])); @@ -2568,7 +2649,7 @@ mod tests { #[test] fn encode_empty_list() { - let val_inner = Field::new("item", DataType::UInt32, true); + let val_inner = Field::new_list_field(DataType::UInt32, true); let val_list_field = Field::new("val", DataType::List(Arc::new(val_inner)), false); let schema = Arc::new(Schema::new(vec![val_list_field])); @@ -2583,7 +2664,7 @@ mod tests { #[test] fn encode_large_lists() { - let val_inner = Field::new("item", DataType::UInt32, true); + let val_inner = Field::new_list_field(DataType::UInt32, true); let val_list_field = Field::new("val", DataType::LargeList(Arc::new(val_inner)), false); let schema = Arc::new(Schema::new(vec![val_list_field])); @@ -2597,8 +2678,8 @@ mod tests { #[test] fn encode_nested_lists() { - let inner_int = Arc::new(Field::new("item", DataType::UInt32, true)); - let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true)); + let inner_int = Arc::new(Field::new_list_field(DataType::UInt32, true)); + let inner_list_field = Arc::new(Field::new_list_field(DataType::List(inner_int), true)); let list_field = Field::new("val", DataType::List(inner_list_field), true); let schema = Arc::new(Schema::new(vec![list_field])); @@ -2608,6 +2689,19 @@ mod tests { roundtrip_ensure_sliced_smaller(in_batch, 1000); } + #[test] + fn encode_nested_lists_starting_at_zero() { + let inner_int = Arc::new(Field::new("item", DataType::UInt32, true)); + let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true)); + let list_field = Field::new("val", DataType::List(inner_list_field), true); + let schema = Arc::new(Schema::new(vec![list_field])); + + let values = Arc::new(generate_nested_list_data_starting_at_zero::()); + + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1); + } + #[test] fn encode_map_array() { let keys = Arc::new(Field::new("keys", DataType::UInt32, false)); diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 517bb03d2064..564cb9433b3d 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -48,7 +48,6 @@ chrono = { workspace = true } lexical-core = { version = "1.0", default-features = false} [dev-dependencies] -tempfile = "3.3" flate2 = { version = "1", default-features = false, features = ["rust_backend"] } serde = { version = "1.0", default-features = false, features = ["derive"] } futures = "0.3" diff --git a/arrow-json/LICENSE.txt b/arrow-json/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-json/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-json/NOTICE.txt b/arrow-json/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-json/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index bcacf6f706b8..f857e8813c7e 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -244,13 +244,6 @@ impl ReaderBuilder { Self { 
batch_size, ..self } } - /// Sets if the decoder should coerce primitive values (bool and number) into string - /// when the Schema's column is Utf8 or LargeUtf8. - #[deprecated(note = "Use with_coerce_primitive")] - pub fn coerce_primitive(self, coerce_primitive: bool) -> Self { - self.with_coerce_primitive(coerce_primitive) - } - /// Sets if the decoder should coerce primitive values (bool and number) into string /// when the Schema's column is Utf8 or LargeUtf8. pub fn with_coerce_primitive(self, coerce_primitive: bool) -> Self { @@ -691,6 +684,10 @@ fn make_decoder( DataType::Time32(TimeUnit::Millisecond) => primitive_decoder!(Time32MillisecondType, data_type), DataType::Time64(TimeUnit::Microsecond) => primitive_decoder!(Time64MicrosecondType, data_type), DataType::Time64(TimeUnit::Nanosecond) => primitive_decoder!(Time64NanosecondType, data_type), + DataType::Duration(TimeUnit::Nanosecond) => primitive_decoder!(DurationNanosecondType, data_type), + DataType::Duration(TimeUnit::Microsecond) => primitive_decoder!(DurationMicrosecondType, data_type), + DataType::Duration(TimeUnit::Millisecond) => primitive_decoder!(DurationMillisecondType, data_type), + DataType::Duration(TimeUnit::Second) => primitive_decoder!(DurationSecondType, data_type), DataType::Decimal128(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Decimal256(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Boolean => Ok(Box::::default()), @@ -1330,6 +1327,37 @@ mod tests { test_time::(); } + fn test_duration() { + let buf = r#" + {"a": 1, "b": "2"} + {"a": 3, "b": null} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", T::DATA_TYPE, true), + Field::new("b", T::DATA_TYPE, true), + ])); + + let batches = do_read(buf, 1024, true, false, schema); + assert_eq!(batches.len(), 1); + + let col_a = batches[0].column_by_name("a").unwrap().as_primitive::(); + assert_eq!(col_a.null_count(), 0); + assert_eq!(col_a.values(), &[1, 3].map(T::Native::usize_as)); + + let col2 = batches[0].column_by_name("b").unwrap().as_primitive::(); + assert_eq!(col2.null_count(), 1); + assert_eq!(col2.values(), &[2, 0].map(T::Native::usize_as)); + } + + #[test] + fn test_durations() { + test_duration::(); + test_duration::(); + test_duration::(); + test_duration::(); + } + #[test] fn test_delta_checkpoint() { let json = "{\"protocol\":{\"minReaderVersion\":1,\"minWriterVersion\":2}}"; @@ -1726,12 +1754,12 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + &DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); @@ -1770,7 +1798,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![Field::new( "items", - DataType::List(FieldRef::new(Field::new("item", DataType::Null, true))), + DataType::List(FieldRef::new(Field::new_list_field(DataType::Null, true))), true, )])); @@ -1794,9 +1822,8 @@ mod tests { let schema = Arc::new(Schema::new(vec![Field::new( "items", - DataType::List(FieldRef::new(Field::new( - "item", - DataType::List(FieldRef::new(Field::new("item", DataType::Null, true))), + DataType::List(FieldRef::new(Field::new_list_field( + 
DataType::List(FieldRef::new(Field::new_list_field(DataType::Null, true))), true, ))), true, diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index ace7b0ea5cb6..07eb40106de0 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -77,7 +77,7 @@ impl InferredType { /// Shorthand for building list data type of `ty` fn list_type_of(ty: DataType) -> DataType { - DataType::List(Arc::new(Field::new("item", ty, true))) + DataType::List(Arc::new(Field::new_list_field(ty, true))) } /// Coerce data type during inference diff --git a/arrow-json/src/writer/mod.rs b/arrow-json/src/writer/mod.rs index a37aa5ff8c2c..ee6d83a0a1f0 100644 --- a/arrow-json/src/writer/mod.rs +++ b/arrow-json/src/writer/mod.rs @@ -1771,7 +1771,7 @@ mod tests { #[test] fn test_writer_fixed_size_list() { let size = 3; - let field = FieldRef::new(Field::new("item", DataType::Int32, true)); + let field = FieldRef::new(Field::new_list_field(DataType::Int32, true)); let schema = SchemaRef::new(Schema::new(vec![Field::new( "list", DataType::FixedSizeList(field, size), diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index c9c30074fe6e..8d74d2f97d72 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -39,8 +39,7 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] +half = { version = "2.1", default-features = false, features = ["num-traits"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-ord/LICENSE.txt b/arrow-ord/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-ord/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-ord/NOTICE.txt b/arrow-ord/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-ord/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index f571e26c444c..2727ff996150 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -656,7 +656,10 @@ pub fn compare_byte_view( /// /// # Safety /// The left/right_idx must within range of each array -#[deprecated(note = "Use `GenericByteViewArray::compare_unchecked` instead")] +#[deprecated( + since = "52.2.0", + note = "Use `GenericByteViewArray::compare_unchecked` instead" +)] pub unsafe fn compare_byte_view_unchecked( left: &GenericByteViewArray, left_idx: usize, diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index d60bc3b8de88..bb82f54d4918 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -821,7 +821,7 @@ mod tests { .into_data(); let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 6, 9]); let list_data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 6430c8f0e405..55e397cd8aa4 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -265,7 +265,7 @@ fn compare_struct( Ok(f) } -#[deprecated(note = "Use make_comparator")] +#[deprecated(since = "52.0.0", note = "Use 
make_comparator")] #[doc(hidden)] pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { make_comparator(left, right, SortOptions::default()) @@ -394,7 +394,7 @@ pub fn make_comparator( } #[cfg(test)] -pub mod tests { +mod tests { use super::*; use arrow_array::builder::{Int32Builder, ListBuilder}; use arrow_buffer::{i256, IntervalDayTime, OffsetBuffer}; @@ -849,7 +849,7 @@ pub mod tests { fn test_struct() { let fields = Fields::from(vec![ Field::new("a", DataType::Int32, true), - Field::new_list("b", Field::new("item", DataType::Int32, true), true), + Field::new_list("b", Field::new_list_field(DataType::Int32, true), true), ]); let a = Int32Array::from(vec![Some(1), Some(2), None, None]); diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 8c87eefadbf0..ec1647393239 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -24,7 +24,6 @@ use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; use crate::cmp::distinct; -use crate::sort::SortColumn; /// A computed set of partitions, see [`partition`] #[derive(Debug, Clone)] @@ -160,21 +159,6 @@ fn find_boundaries(v: &dyn Array) -> Result { Ok(distinct(&v1, &v2)?.values().clone()) } -/// Use [`partition`] instead. Given a list of already sorted columns, find -/// partition ranges that would partition lexicographically equal values across -/// columns. -/// -/// The returned vec would be of size k where k is cardinality of the sorted values; Consecutive -/// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last -/// range. -#[deprecated(note = "Use partition")] -pub fn lexicographical_partition_ranges( - columns: &[SortColumn], -) -> Result> + '_, ArrowError> { - let cols: Vec<_> = columns.iter().map(|x| x.values.clone()).collect(); - Ok(partition(&cols)?.ranges().into_iter()) -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 0834f2d13384..03d08df30959 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,4 +34,4 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", features = ["pyarrow"] } -pyo3 = { version = "0.22", features = ["extension-module"] } +pyo3 = { version = "0.23", features = ["extension-module"] } diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index e12c1389e66f..d4908fff0897 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -43,7 +43,7 @@ fn to_py_err(err: ArrowError) -> PyErr { #[pyfunction] fn double(array: &Bound, py: Python) -> PyResult { // import - let array = make_array(ArrayData::from_pyarrow_bound(&array)?); + let array = make_array(ArrayData::from_pyarrow_bound(array)?); // perform some operation let array = array diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3754afb4dbc6..90d99684d265 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -33,12 +33,6 @@ name = "arrow_row" path = "src/lib.rs" bench = false -[target.'cfg(target_arch = "wasm32")'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } - [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } diff --git 
a/arrow-row/LICENSE.txt b/arrow-row/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-row/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-row/NOTICE.txt b/arrow-row/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-row/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 5780bdbfefb9..d0fad12210db 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2317,7 +2317,7 @@ mod tests { let values_len = offsets.last().unwrap().to_usize().unwrap(); let values = values(values_len); let nulls = NullBuffer::from_iter((0..len).map(|_| rng.gen_bool(valid_percent))); - let field = Arc::new(Field::new("item", values.data_type().clone(), true)); + let field = Arc::new(Field::new_list_field(values.data_type().clone(), true)); ListArray::new(field, offsets, values, Some(nulls)) } diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 628d4a683cac..1e1f9fbde0e4 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -47,3 +47,8 @@ features = ["ffi"] [dev-dependencies] serde_json = "1.0" bincode = { version = "1.3.3", default-features = false } +criterion = { version = "0.5", default-features = false } + +[[bench]] +name = "ffi" +harness = false \ No newline at end of file diff --git a/arrow-schema/LICENSE.txt b/arrow-schema/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-schema/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-schema/NOTICE.txt b/arrow-schema/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-schema/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-schema/benches/ffi.rs b/arrow-schema/benches/ffi.rs new file mode 100644 index 000000000000..1285acb883ea --- /dev/null +++ b/arrow-schema/benches/ffi.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_schema::ffi::FFI_ArrowSchema; +use arrow_schema::{DataType, Field}; +use criterion::*; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let fields = vec![ + Arc::new(Field::new("c1", DataType::Utf8, false)), + Arc::new(Field::new("c2", DataType::Utf8, false)), + Arc::new(Field::new("c3", DataType::Utf8, false)), + Arc::new(Field::new("c4", DataType::Utf8, false)), + Arc::new(Field::new("c5", DataType::Utf8, false)), + ]; + let data_type = DataType::Struct(fields.into()); + c.bench_function("ffi_arrow_schema_try_from", |b| { + b.iter(|| FFI_ArrowSchema::try_from(&data_type)); + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index ff5832dfa68c..a6333c804805 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -40,7 +40,7 @@ use crate::{ArrowError, Field, FieldRef, Fields, UnionFields}; /// # use arrow_schema::{DataType, Field}; /// # use std::sync::Arc; /// // create a new list of 32-bit signed integers directly -/// let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); +/// let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); /// // Create the same list type with constructor /// let list_data_type2 = DataType::new_list(DataType::Int32, true); /// assert_eq!(list_data_type, list_data_type2); @@ -837,21 +837,21 @@ mod tests { #[test] fn test_list_datatype_equality() { // tests that list type equality is checked while ignoring list names - let list_a = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_a = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true))); - let list_c = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); - let list_d = DataType::List(Arc::new(Field::new("item", DataType::UInt32, true))); + let list_c = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); + let list_d = DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, true))); assert!(list_a.equals_datatype(&list_b)); assert!(!list_a.equals_datatype(&list_c)); assert!(!list_b.equals_datatype(&list_c)); assert!(!list_a.equals_datatype(&list_d)); let list_e = - DataType::FixedSizeList(Arc::new(Field::new("item", list_a.clone(), false)), 3); + DataType::FixedSizeList(Arc::new(Field::new_list_field(list_a.clone(), false)), 3); let list_f = DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3); let list_g = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::FixedSizeBinary(3), true)), + Arc::new(Field::new_list_field(DataType::FixedSizeBinary(3), true)), 3, ); assert!(list_e.equals_datatype(&list_f)); diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 4378950329f3..bf557d8941dc 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -90,8 +90,8 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let data_type = self.parse_next_type()?; self.expect_token(Token::RParen)?; - Ok(DataType::List(Arc::new(Field::new( - "item", data_type, true, + Ok(DataType::List(Arc::new(Field::new_list_field( + data_type, true, )))) } @@ -100,8 +100,8 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let data_type = self.parse_next_type()?; self.expect_token(Token::RParen)?; - 
Ok(DataType::LargeList(Arc::new(Field::new( - "item", data_type, true, + Ok(DataType::LargeList(Arc::new(Field::new_list_field( + data_type, true, )))) } @@ -113,7 +113,7 @@ impl<'a> Parser<'a> { let data_type = self.parse_next_type()?; self.expect_token(Token::RParen)?; Ok(DataType::FixedSizeList( - Arc::new(Field::new("item", data_type, true)), + Arc::new(Field::new_list_field(data_type, true)), length, )) } diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 70650d769cf6..96c80974982c 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -38,6 +38,7 @@ use crate::{ ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode, }; use bitflags::bitflags; +use std::borrow::Cow; use std::sync::Arc; use std::{ collections::HashMap, @@ -685,57 +686,59 @@ impl TryFrom<&DataType> for FFI_ArrowSchema { } } -fn get_format_string(dtype: &DataType) -> Result { +fn get_format_string(dtype: &DataType) -> Result, ArrowError> { match dtype { - DataType::Null => Ok("n".to_string()), - DataType::Boolean => Ok("b".to_string()), - DataType::Int8 => Ok("c".to_string()), - DataType::UInt8 => Ok("C".to_string()), - DataType::Int16 => Ok("s".to_string()), - DataType::UInt16 => Ok("S".to_string()), - DataType::Int32 => Ok("i".to_string()), - DataType::UInt32 => Ok("I".to_string()), - DataType::Int64 => Ok("l".to_string()), - DataType::UInt64 => Ok("L".to_string()), - DataType::Float16 => Ok("e".to_string()), - DataType::Float32 => Ok("f".to_string()), - DataType::Float64 => Ok("g".to_string()), - DataType::BinaryView => Ok("vz".to_string()), - DataType::Binary => Ok("z".to_string()), - DataType::LargeBinary => Ok("Z".to_string()), - DataType::Utf8View => Ok("vu".to_string()), - DataType::Utf8 => Ok("u".to_string()), - DataType::LargeUtf8 => Ok("U".to_string()), - DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), - DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), - DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), - DataType::Decimal256(precision, scale) => Ok(format!("d:{precision},{scale},256")), - DataType::Date32 => Ok("tdD".to_string()), - DataType::Date64 => Ok("tdm".to_string()), - DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), - DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".to_string()), - DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".to_string()), - DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".to_string()), - DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".to_string()), - DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()), - DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()), - DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()), - DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")), - DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")), - DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")), - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")), - DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()), - DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()), - DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()), - DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()), - DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".to_string()), - DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".to_string()), 
- DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".to_string()), - DataType::List(_) => Ok("+l".to_string()), - DataType::LargeList(_) => Ok("+L".to_string()), - DataType::Struct(_) => Ok("+s".to_string()), - DataType::Map(_, _) => Ok("+m".to_string()), - DataType::RunEndEncoded(_, _) => Ok("+r".to_string()), + DataType::Null => Ok("n".into()), + DataType::Boolean => Ok("b".into()), + DataType::Int8 => Ok("c".into()), + DataType::UInt8 => Ok("C".into()), + DataType::Int16 => Ok("s".into()), + DataType::UInt16 => Ok("S".into()), + DataType::Int32 => Ok("i".into()), + DataType::UInt32 => Ok("I".into()), + DataType::Int64 => Ok("l".into()), + DataType::UInt64 => Ok("L".into()), + DataType::Float16 => Ok("e".into()), + DataType::Float32 => Ok("f".into()), + DataType::Float64 => Ok("g".into()), + DataType::BinaryView => Ok("vz".into()), + DataType::Binary => Ok("z".into()), + DataType::LargeBinary => Ok("Z".into()), + DataType::Utf8View => Ok("vu".into()), + DataType::Utf8 => Ok("u".into()), + DataType::LargeUtf8 => Ok("U".into()), + DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))), + DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))), + DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))), + DataType::Decimal256(precision, scale) => { + Ok(Cow::Owned(format!("d:{precision},{scale},256"))) + } + DataType::Date32 => Ok("tdD".into()), + DataType::Date64 => Ok("tdm".into()), + DataType::Time32(TimeUnit::Second) => Ok("tts".into()), + DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".into()), + DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".into()), + DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".into()), + DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".into()), + DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".into()), + DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".into()), + DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".into()), + DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(Cow::Owned(format!("tss:{tz}"))), + DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(Cow::Owned(format!("tsm:{tz}"))), + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(Cow::Owned(format!("tsu:{tz}"))), + DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(Cow::Owned(format!("tsn:{tz}"))), + DataType::Duration(TimeUnit::Second) => Ok("tDs".into()), + DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".into()), + DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".into()), + DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".into()), + DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".into()), + DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".into()), + DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()), + DataType::List(_) => Ok("+l".into()), + DataType::LargeList(_) => Ok("+L".into()), + DataType::Struct(_) => Ok("+s".into()), + DataType::Map(_, _) => Ok("+m".into()), + DataType::RunEndEncoded(_, _) => Ok("+r".into()), DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), DataType::Union(fields, mode) => { let formats = fields @@ -743,8 +746,8 @@ fn get_format_string(dtype: &DataType) -> Result { .map(|(t, _)| t.to_string()) .collect::>(); match mode { - UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))), - UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))), + UnionMode::Dense => Ok(Cow::Owned(format!("{}:{}", "+ud", 
formats.join(",")))), + UnionMode::Sparse => Ok(Cow::Owned(format!("{}:{}", "+us", formats.join(",")))), } } other => Err(ArrowError::CDataInterface(format!( @@ -920,6 +923,7 @@ mod tests { #[test] fn test_dictionary_ordered() { + #[allow(deprecated)] let schema = Schema::new(vec![Field::new_dict( "dict", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b532ea8616b6..7d47c0ae1dea 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -38,6 +38,10 @@ pub struct Field { name: String, data_type: DataType, nullable: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. @@ -117,8 +121,12 @@ impl Hash for Field { } impl Field { + /// Default list member field name + pub const LIST_FIELD_DEFAULT_NAME: &'static str = "item"; + /// Creates a new field with the given name, type, and nullability pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -144,10 +152,14 @@ impl Field { /// ); /// ``` pub fn new_list_field(data_type: DataType, nullable: bool) -> Self { - Self::new("item", data_type, nullable) + Self::new(Self::LIST_FIELD_DEFAULT_NAME, data_type, nullable) } /// Creates a new field that has additional dictionary information + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter." + )] pub fn new_dict( name: impl Into, data_type: DataType, @@ -155,6 +167,7 @@ impl Field { dict_id: i64, dict_is_ordered: bool, ) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -383,19 +396,30 @@ impl Field { /// Returns a vector containing all (potentially nested) `Field` instances selected by the /// dictionary ID they use #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> { self.fields() .into_iter() .filter(|&field| { - matches!(field.data_type(), DataType::Dictionary(_, _)) && field.dict_id == id + #[allow(deprecated)] + let matching_dict_id = field.dict_id == id; + matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id }) .collect() } /// Returns the dictionary ID, if this is a dictionary type. #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." 
+ )] pub const fn dict_id(&self) -> Option { match self.data_type { + #[allow(deprecated)] DataType::Dictionary(_, _) => Some(self.dict_id), _ => None, } @@ -425,6 +449,7 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { + #[allow(deprecated)] if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError(format!( "Fail to merge schema field '{}' because from dict_id = {} does not match {}", @@ -567,9 +592,11 @@ impl Field { /// * self.metadata is a superset of other.metadata /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { + #[allow(deprecated)] + let matching_dict_id = self.dict_id == other.dict_id; self.name == other.name && self.data_type.contains(&other.data_type) - && self.dict_id == other.dict_id + && matching_dict_id && self.dict_is_ordered == other.dict_is_ordered // self need to be nullable or both of them are not nullable && (self.nullable || !other.nullable) @@ -618,6 +645,7 @@ mod test { fn test_new_dict_with_string() { // Fields should allow owned Strings to support reuse let s = "c1"; + #[allow(deprecated)] Field::new_dict(s, DataType::Int64, false, 4, false); } @@ -735,6 +763,7 @@ mod test { #[test] fn test_fields_with_dict_id() { + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -742,6 +771,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict2", DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()), @@ -778,9 +808,11 @@ mod test { false, ); + #[allow(deprecated)] for field in field.fields_with_dict_id(10) { assert_eq!(dict1, *field); } + #[allow(deprecated)] for field in field.fields_with_dict_id(20) { assert_eq!(dict2, *field); } @@ -795,6 +827,7 @@ mod test { #[test] fn test_field_comparison_case() { // dictionary-encoding properties not used for field comparison + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -802,6 +835,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -813,6 +847,7 @@ mod test { assert_eq!(dict1, dict2); assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2)); + #[allow(deprecated)] let dict1 = Field::new_dict( "dict0", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 5b9ce2a6da61..904b933cd299 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -18,7 +18,7 @@ use std::ops::Deref; use std::sync::Arc; -use crate::{ArrowError, DataType, Field, FieldRef, SchemaBuilder}; +use crate::{ArrowError, DataType, Field, FieldRef}; /// A cheaply cloneable, owned slice of [`FieldRef`] /// @@ -256,33 +256,6 @@ impl Fields { .collect(); Ok(filtered) } - - /// Remove a field by index and return it. - /// - /// # Panic - /// - /// Panics if `index` is out of bounds. 
- /// - /// # Example - /// ``` - /// use arrow_schema::{DataType, Field, Fields}; - /// let mut fields = Fields::from(vec![ - /// Field::new("a", DataType::Boolean, false), - /// Field::new("b", DataType::Int8, false), - /// Field::new("c", DataType::Utf8, false), - /// ]); - /// assert_eq!(fields.len(), 3); - /// assert_eq!(fields.remove(1), Field::new("b", DataType::Int8, false).into()); - /// assert_eq!(fields.len(), 2); - /// ``` - #[deprecated(note = "Use SchemaBuilder::remove")] - #[doc(hidden)] - pub fn remove(&mut self, index: usize) -> FieldRef { - let mut builder = SchemaBuilder::from(Fields::from(&*self.0)); - let field = builder.remove(index); - *self = builder.finish().fields; - field - } } impl Default for Fields { @@ -496,7 +469,12 @@ mod tests { Field::new("floats", DataType::Struct(floats.clone()), true), true, ), - Field::new_fixed_size_list("f", Field::new("item", DataType::Int32, false), 3, false), + Field::new_fixed_size_list( + "f", + Field::new_list_field(DataType::Int32, false), + 3, + false, + ), Field::new_map( "g", "entries", diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index cc3a8a308a83..6c79da53f981 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -389,7 +389,12 @@ impl Schema { /// Returns a vector of immutable references to all [`Field`] instances selected by /// the dictionary ID they use. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { + #[allow(deprecated)] self.fields .iter() .flat_map(|f| f.fields_with_dict_id(dict_id)) @@ -434,33 +439,6 @@ impl Schema { .iter() .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } - - /// Remove field by index and return it. Recommend to use [`SchemaBuilder`] - /// if you are looking to remove multiple columns, as this will save allocations. - /// - /// # Panic - /// - /// Panics if `index` is out of bounds. 
- /// - /// # Example - /// - /// ``` - /// use arrow_schema::{DataType, Field, Schema}; - /// let mut schema = Schema::new(vec![ - /// Field::new("a", DataType::Boolean, false), - /// Field::new("b", DataType::Int8, false), - /// Field::new("c", DataType::Utf8, false), - /// ]); - /// assert_eq!(schema.fields.len(), 3); - /// assert_eq!(schema.remove(1), Field::new("b", DataType::Int8, false).into()); - /// assert_eq!(schema.fields.len(), 2); - /// ``` - #[deprecated(note = "Use SchemaBuilder::remove")] - #[doc(hidden)] - #[allow(deprecated)] - pub fn remove(&mut self, index: usize) -> FieldRef { - self.fields.remove(index) - } } impl fmt::Display for Schema { @@ -665,7 +643,9 @@ mod tests { assert_eq!(first_name.name(), "first_name"); assert_eq!(first_name.data_type(), &DataType::Utf8); assert!(!first_name.is_nullable()); - assert_eq!(first_name.dict_id(), None); + #[allow(deprecated)] + let dict_id = first_name.dict_id(); + assert_eq!(dict_id, None); assert_eq!(first_name.dict_is_ordered(), None); let metadata = first_name.metadata(); @@ -682,7 +662,9 @@ mod tests { interests.data_type(), &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) ); - assert_eq!(interests.dict_id(), Some(123)); + #[allow(deprecated)] + let dict_id = interests.dict_id(); + assert_eq!(dict_id, Some(123)); assert_eq!(interests.dict_is_ordered(), Some(true)); } @@ -718,6 +700,7 @@ mod tests { fn schema_field_with_dict_id() { let schema = person_schema(); + #[allow(deprecated)] let fields_dict_123: Vec<_> = schema .fields_with_dict_id(123) .iter() @@ -725,7 +708,9 @@ mod tests { .collect(); assert_eq!(fields_dict_123, vec!["interests"]); - assert!(schema.fields_with_dict_id(456).is_empty()); + #[allow(deprecated)] + let is_empty = schema.fields_with_dict_id(456).is_empty(); + assert!(is_empty); } fn person_schema() -> Schema { @@ -745,6 +730,7 @@ mod tests { ])), false, ), + #[allow(deprecated)] Field::new_dict( "interests", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-select/LICENSE.txt b/arrow-select/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-select/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-select/NOTICE.txt b/arrow-select/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-select/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 4c6a5c0668f1..c91732848653 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -431,17 +431,17 @@ where R::Native: AddAssign, { let run_ends: &RunEndBuffer = array.run_ends(); - let mut values_filter = BooleanBufferBuilder::new(run_ends.len()); let mut new_run_ends = vec![R::default_value(); run_ends.len()]; let mut start = 0u64; - let mut i = 0; + let mut j = 0; let mut count = R::default_value(); let filter_values = predicate.filter.values(); + let run_ends = run_ends.inner(); - for mut end in run_ends.inner().into_iter().map(|i| (*i).into() as u64) { + let pred: BooleanArray = BooleanBuffer::collect_bool(run_ends.len(), |i| { let mut keep = false; - + let mut end = run_ends[i].into() as u64; let difference = end.saturating_sub(filter_values.len() as u64); end -= difference; @@ -450,23 +450,18 @@ where count += R::Native::from(pred); keep |= pred } - // this is to avoid branching - new_run_ends[i] = count; - i += keep as usize; + new_run_ends[j] = count; + j += keep 
as usize; - values_filter.append(keep); start = end; - } - - new_run_ends.truncate(i); + keep + }) + .into(); - if values_filter.is_empty() { - new_run_ends.clear(); - } + new_run_ends.truncate(j); let values = array.values(); - let pred = BooleanArray::new(values_filter.finish(), None); let values = filter(&values, &pred)?; let run_ends = PrimitiveArray::::new(new_run_ends.into(), None); @@ -522,14 +517,14 @@ fn filter_bits(buffer: &BooleanBuffer, predicate: &FilterPredicate) -> Buffer { unsafe { MutableBuffer::from_trusted_len_iter_bool(bits).into() } } IterationStrategy::SlicesIterator => { - let mut builder = BooleanBufferBuilder::new(bit_util::ceil(predicate.count, 8)); + let mut builder = BooleanBufferBuilder::new(predicate.count); for (start, end) in SlicesIterator::new(&predicate.filter) { builder.append_packed_range(start + offset..end + offset, src) } builder.into() } IterationStrategy::Slices(slices) => { - let mut builder = BooleanBufferBuilder::new(bit_util::ceil(predicate.count, 8)); + let mut builder = BooleanBufferBuilder::new(predicate.count); for (start, end) in slices { builder.append_packed_range(*start + offset..*end + offset, src) } @@ -1336,7 +1331,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8, 8]); let list_data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) @@ -1360,7 +1355,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0i64, 3, 3]); let list_data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, false))); let expected = ArrayData::builder(list_data_type) .len(2) .add_buffer(value_offsets) diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index a0520e969a6b..4a47017b79ab 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -265,6 +265,67 @@ fn interleave_fallback( Ok(make_array(array_data.freeze())) } +/// Interleave rows by index from multiple [`RecordBatch`] instances and return a new [`RecordBatch`]. +/// +/// This function will call [`interleave`] on each array of the [`RecordBatch`] instances and assemble a new [`RecordBatch`]. 
+/// +/// # Example +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{StringArray, Int32Array, RecordBatch, UInt32Array}; +/// # use arrow_schema::{DataType, Field, Schema}; +/// # use arrow_select::interleave::interleave_record_batch; +/// +/// let schema = Arc::new(Schema::new(vec![ +/// Field::new("a", DataType::Int32, true), +/// Field::new("b", DataType::Utf8, true), +/// ])); +/// +/// let batch1 = RecordBatch::try_new( +/// schema.clone(), +/// vec![ +/// Arc::new(Int32Array::from(vec![0, 1, 2])), +/// Arc::new(StringArray::from(vec!["a", "b", "c"])), +/// ], +/// ).unwrap(); +/// +/// let batch2 = RecordBatch::try_new( +/// schema.clone(), +/// vec![ +/// Arc::new(Int32Array::from(vec![3, 4, 5])), +/// Arc::new(StringArray::from(vec!["d", "e", "f"])), +/// ], +/// ).unwrap(); +/// +/// let indices = vec![(0, 1), (1, 2), (0, 0), (1, 1)]; +/// let interleaved = interleave_record_batch(&[&batch1, &batch2], &indices).unwrap(); +/// +/// let expected = RecordBatch::try_new( +/// schema, +/// vec![ +/// Arc::new(Int32Array::from(vec![1, 5, 0, 4])), +/// Arc::new(StringArray::from(vec!["b", "f", "a", "e"])), +/// ], +/// ).unwrap(); +/// assert_eq!(interleaved, expected); +/// ``` +pub fn interleave_record_batch( + record_batches: &[&RecordBatch], + indices: &[(usize, usize)], +) -> Result { + let schema = record_batches[0].schema(); + let columns = (0..schema.fields().len()) + .map(|i| { + let column_values: Vec<&dyn Array> = record_batches + .iter() + .map(|batch| batch.column(i).as_ref()) + .collect(); + interleave(&column_values, indices) + }) + .collect::, _>>()?; + RecordBatch::try_new(schema, columns) +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 07630a49fa11..71a7c77a8f92 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1606,7 +1606,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two let list_data_type = - DataType::$list_data_type(Arc::new(Field::new("item", DataType::Int32, false))); + DataType::$list_data_type(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1672,7 +1672,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two let list_data_type = - DataType::$list_data_type(Arc::new(Field::new("item", DataType::Int32, true))); + DataType::$list_data_type(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1739,7 +1739,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two let list_data_type = - DataType::$list_data_type(Arc::new(Field::new("item", DataType::Int32, true))); + DataType::$list_data_type(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1904,7 +1904,8 @@ mod tests { // Construct offsets let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) 
.add_buffer(value_offsets) @@ -2222,7 +2223,7 @@ mod tests { fn test_take_fixed_size_list_null_indices() { let indices = Int32Array::from_iter([Some(0), None]); let values = Arc::new(Int32Array::from(vec![0, 1, 2, 3])); - let arr_field = Arc::new(Field::new("item", values.data_type().clone(), true)); + let arr_field = Arc::new(Field::new_list_field(values.data_type().clone(), true)); let values = FixedSizeListArray::try_new(arr_field, 2, values, None).unwrap(); let r = take(&values, &indices, None).unwrap(); diff --git a/arrow-string/LICENSE.txt b/arrow-string/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow-string/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow-string/NOTICE.txt b/arrow-string/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow-string/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 6a28d44ea7aa..49fc244e72cc 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -710,7 +710,7 @@ mod tests { .build() .unwrap(); let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, false)), 3); let nulls = NullBuffer::from(vec![true, false, true]); let list_data = ArrayData::builder(list_data_type) .len(3) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 0a5aa77dbb95..e30e09146c6d 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -18,13 +18,16 @@ //! Provide SQL's LIKE operators for Arrow's string arrays use crate::predicate::Predicate; + use arrow_array::cast::AsArray; use arrow_array::*; use arrow_schema::*; use arrow_select::take::take; -use iterator::ArrayIter; + use std::sync::Arc; +pub use arrow_array::StringArrayType; + #[derive(Debug)] enum Op { Like(bool), @@ -150,39 +153,6 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result: ArrayAccessor + Sized { - /// Returns true if all data within this string array is ASCII - fn is_ascii(&self) -> bool; - /// Constructs a new iterator - fn iter(&self) -> ArrayIter; -} - -impl<'a, O: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray { - fn is_ascii(&self) -> bool { - GenericStringArray::::is_ascii(self) - } - - fn iter(&self) -> ArrayIter { - GenericStringArray::::iter(self) - } -} -impl<'a> StringArrayType<'a> for &'a StringViewArray { - fn is_ascii(&self) -> bool { - StringViewArray::is_ascii(self) - } - - fn iter(&self) -> ArrayIter { - StringViewArray::iter(self) - } -} - fn apply<'a, T: StringArrayType<'a> + 'a>( op: Op, l: T, diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index 5ad452a17b12..d14662be7280 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -447,8 +447,7 @@ pub fn regexp_match( if regex.is_none() { return Ok(new_null_array( - &DataType::List(Arc::new(Field::new( - "item", + &DataType::List(Arc::new(Field::new_list_field( array.data_type().clone(), true, ))), diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index bfdafb790f39..fa6a47147521 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -636,7 +636,7 @@ mod tests { let data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(2) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(&values)) .offset(1) 
.null_bit_buffer(Some(Buffer::from(bits_v))) .build() diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a0fd96415a1d..8860cd61c5b3 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,9 +54,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.22.2", default-features = false, optional = true } - -chrono = { workspace = true, optional = true } +pyo3 = { version = "0.23", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] @@ -72,7 +70,7 @@ prettyprint = ["arrow-cast/prettyprint"] # not the core arrow code itself. Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. -test_utils = ["rand", "dep:chrono"] +test_utils = ["dep:rand"] pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive @@ -87,7 +85,6 @@ chrono = { workspace = true } criterion = { version = "0.5", default-features = false } half = { version = "2.1", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } -tempfile = { version = "3", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } [build-dependencies] diff --git a/arrow/LICENSE.txt b/arrow/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/arrow/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/arrow/NOTICE.txt b/arrow/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/arrow/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/arrow/README.md b/arrow/README.md index 557a0b474e4b..a1444005ec00 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -25,7 +25,7 @@ This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. The [API documentation](https://docs.rs/arrow/latest) contains examples and full API. -There are several [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples) to start from as well. +There are several [examples](https://github.com/apache/arrow-rs/tree/main/arrow/examples) to start from as well. The API documentation for most recent, unreleased code is available [here](https://arrow.apache.org/rust/arrow/index.html). 
@@ -57,7 +57,7 @@ The `arrow` crate provides the following features which may be enabled in your ` - `ipc` (default) - support for reading [Arrow IPC Format](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc), also used as the wire protocol in [arrow-flight](https://crates.io/crates/arrow-flight) - `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) - `prettyprint` - support for formatting record batches as textual columns - implementations of some [compute](https://github.com/apache/arrow-rs/tree/master/arrow/src/compute/kernels) + implementations of some [compute](https://github.com/apache/arrow-rs/tree/main/arrow/src/compute/kernels) - `chrono-tz` - support of parsing timezone using [chrono-tz](https://docs.rs/chrono-tz/0.6.0/chrono_tz/) - `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) - `pyarrow` - bindings for pyo3 to call arrow-rs from python diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index ec7990d3d764..5c4fcff13dee 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -250,6 +250,9 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("cast decimal128 to decimal128 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5))) }); + c.bench_function("cast decimal128 to decimal128 512 lower precision", |b| { + b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(6, 5))) + }); c.bench_function("cast decimal128 to decimal256 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal256(50, 5))) }); diff --git a/arrow/benches/concatenate_kernel.rs b/arrow/benches/concatenate_kernel.rs index 0c553f8b3f3c..034f5f2a305c 100644 --- a/arrow/benches/concatenate_kernel.rs +++ b/arrow/benches/concatenate_kernel.rs @@ -86,14 +86,14 @@ fn add_benchmark(c: &mut Criterion) { }); let v1 = FixedSizeListArray::try_new( - Arc::new(Field::new("item", DataType::Int32, true)), + Arc::new(Field::new_list_field(DataType::Int32, true)), 1024, Arc::new(create_primitive_array::(1024 * 1024, 0.0)), None, ) .unwrap(); let v2 = FixedSizeListArray::try_new( - Arc::new(Field::new("item", DataType::Int32, true)), + Arc::new(Field::new_list_field(DataType::Int32, true)), 1024, Arc::new(create_primitive_array::(1024 * 1024, 0.0)), None, diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index 8f3898c51f9d..c698a93fe869 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -102,22 +102,22 @@ fn small_bench_list(c: &mut Criterion) { let schema = Arc::new(Schema::new(vec![ Field::new( "c1", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), true, ), Field::new( "c2", - DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Float64, true))), true, ), Field::new( "c3", - DataType::List(Arc::new(Field::new("item", DataType::UInt32, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::UInt32, true))), true, ), Field::new( "c4", - DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))), true, ), ])); diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index cd952299df47..bb1c6081eaf9 100644 --- a/arrow/benches/lexsort.rs +++ 
b/arrow/benches/lexsort.rs @@ -83,7 +83,7 @@ impl Column { Column::RequiredI32List => { let field = Field::new( "_1", - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))), true, ); create_random_array(&field, size, 0., 1.).unwrap() @@ -91,7 +91,7 @@ impl Column { Column::OptionalI32List => { let field = Field::new( "_1", - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), true, ); create_random_array(&field, size, 0.2, 1.).unwrap() @@ -99,7 +99,7 @@ impl Column { Column::Required4CharStringList => { let field = Field::new( "_1", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, false))), true, ); create_random_array(&field, size, 0., 1.).unwrap() @@ -107,7 +107,7 @@ impl Column { Column::Optional4CharStringList => { let field = Field::new( "_1", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), true, ); create_random_array(&field, size, 0.2, 1.).unwrap() diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index 5c8cd51c55a0..8043ad82fca6 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -76,7 +76,7 @@ fn main() { let array_data = ArrayData::builder(DataType::Utf8) .len(3) .add_buffer(Buffer::from(offsets.to_byte_slice())) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(&values)) .null_bit_buffer(Some(Buffer::from([0b00000101]))) .build() .unwrap(); @@ -97,7 +97,7 @@ fn main() { let value_offsets = Buffer::from([0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 5002e5bf181a..7fc5acdc1b19 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -336,7 +336,7 @@ //! //! If you think you have found an instance where this is possible, please file //! a ticket in our [issue tracker] and it will be triaged and fixed. For more information on -//! arrow's use of unsafe, see [here](https://github.com/apache/arrow-rs/tree/master/arrow#safety). +//! arrow's use of unsafe, see [here](https://github.com/apache/arrow-rs/tree/main/arrow#safety). //! //! # Higher-level Processing //! diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 6effe1c03e01..4ccbd0541d3f 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -111,7 +111,7 @@ impl IntoPyArrow for T { } fn validate_class(expected: &str, value: &Bound) -> PyResult<()> { - let pyarrow = PyModule::import_bound(value.py(), "pyarrow")?; + let pyarrow = PyModule::import(value.py(), "pyarrow")?; let class = pyarrow.getattr(expected)?; if !value.is_instance(&class)? 
{ let expected_module = class.getattr("__module__")?.extract::()?; @@ -177,7 +177,7 @@ impl ToPyArrow for DataType { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; - let module = py.import_bound("pyarrow")?; + let module = py.import("pyarrow")?; let class = module.getattr("DataType")?; let dtype = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(dtype.into()) @@ -213,7 +213,7 @@ impl ToPyArrow for Field { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; - let module = py.import_bound("pyarrow")?; + let module = py.import("pyarrow")?; let class = module.getattr("Field")?; let dtype = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(dtype.into()) @@ -249,7 +249,7 @@ impl ToPyArrow for Schema { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; - let module = py.import_bound("pyarrow")?; + let module = py.import("pyarrow")?; let class = module.getattr("Schema")?; let schema = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(schema.into()) @@ -309,7 +309,7 @@ impl ToPyArrow for ArrayData { let array = FFI_ArrowArray::new(self); let schema = FFI_ArrowSchema::try_from(self.data_type()).map_err(to_py_err)?; - let module = py.import_bound("pyarrow")?; + let module = py.import("pyarrow")?; let class = module.getattr("Array")?; let array = class.call_method1( "_import_from_c", @@ -318,7 +318,7 @@ impl ToPyArrow for ArrayData { addr_of!(schema) as Py_uintptr_t, ), )?; - Ok(array.to_object(py)) + Ok(array.unbind()) } } @@ -335,7 +335,7 @@ impl ToPyArrow for Vec { .iter() .map(|v| v.to_pyarrow(py)) .collect::>>()?; - Ok(values.to_object(py)) + Ok(PyList::new(py, values)?.unbind().into()) } } @@ -451,7 +451,7 @@ impl FromPyArrow for ArrowArrayStreamReader { // make the conversion through PyArrow's private API // this changes the pointer's memory and is thus unsafe. 
// In particular, `_export_to_c` can go out of bounds - let args = PyTuple::new_bound(value.py(), [stream_ptr as Py_uintptr_t]); + let args = PyTuple::new(value.py(), [stream_ptr as Py_uintptr_t])?; value.call_method1("_export_to_c", args)?; let stream_reader = ArrowArrayStreamReader::try_new(stream) @@ -469,9 +469,9 @@ impl IntoPyArrow for Box { let mut stream = FFI_ArrowArrayStream::new(self); let stream_ptr = (&mut stream) as *mut FFI_ArrowArrayStream; - let module = py.import_bound("pyarrow")?; + let module = py.import("pyarrow")?; let class = module.getattr("RecordBatchReader")?; - let args = PyTuple::new_bound(py, [stream_ptr as Py_uintptr_t]); + let args = PyTuple::new(py, [stream_ptr as Py_uintptr_t])?; let reader = class.call_method1("_import_from_c", args)?; Ok(PyObject::from(reader)) @@ -500,11 +500,17 @@ impl<'source, T: FromPyArrow> FromPyObject<'source> for PyArrowType { } } -impl IntoPy for PyArrowType { - fn into_py(self, py: Python) -> PyObject { +impl<'py, T: IntoPyArrow> IntoPyObject<'py> for PyArrowType { + type Target = PyAny; + + type Output = Bound<'py, Self::Target>; + + type Error = PyErr; + + fn into_pyobject(self, py: Python<'py>) -> Result { match self.0.into_pyarrow(py) { - Ok(obj) => obj, - Err(err) => err.to_object(py), + Ok(obj) => Result::Ok(obj.into_bound(py)), + Err(err) => Result::Err(err), } } } diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 56bbdefd522d..5f63812e51c0 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -538,7 +538,7 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new( "b", - DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::LargeUtf8, false))), false, ), Field::new("a", DataType::Int32, false), @@ -569,10 +569,8 @@ mod tests { Field::new("b", DataType::Boolean, true), Field::new( "c", - DataType::LargeList(Arc::new(Field::new( - "item", - DataType::List(Arc::new(Field::new( - "item", + DataType::LargeList(Arc::new(Field::new_list_field( + DataType::List(Arc::new(Field::new_list_field( DataType::FixedSizeBinary(6), true, ))), diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 8f86cbeab717..ef5ca6041700 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -315,7 +315,7 @@ fn make_fixed_size_list_array() -> FixedSizeListArray { // Construct a fixed size list array from the above two let list_data_type = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 2); + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data) @@ -325,11 +325,11 @@ fn make_fixed_size_list_array() -> FixedSizeListArray { } fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { - let values: [u8; 15] = *b"hellotherearrow"; + let values: &[u8; 15] = b"hellotherearrow"; let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) .len(3) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(Buffer::from(values)) .build() .unwrap(); FixedSizeBinaryArray::from(array_data) @@ -348,7 +348,7 @@ fn make_list_array() -> ListArray { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = 
ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -371,7 +371,8 @@ fn make_large_list_array() -> LargeListArray { let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -466,12 +467,12 @@ fn get_all_types() -> Vec { LargeBinary, Utf8, LargeUtf8, - List(Arc::new(Field::new("item", DataType::Int8, true))), - List(Arc::new(Field::new("item", DataType::Utf8, true))), - FixedSizeList(Arc::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Arc::new(Field::new("item", DataType::Utf8, false)), 10), - LargeList(Arc::new(Field::new("item", DataType::Int8, true))), - LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), + List(Arc::new(Field::new_list_field(DataType::Int8, true))), + List(Arc::new(Field::new_list_field(DataType::Utf8, true))), + FixedSizeList(Arc::new(Field::new_list_field(DataType::Int8, true)), 10), + FixedSizeList(Arc::new(Field::new_list_field(DataType::Utf8, false)), 10), + LargeList(Arc::new(Field::new_list_field(DataType::Int8, true))), + LargeList(Arc::new(Field::new_list_field(DataType::Utf8, false))), Struct(Fields::from(vec![ Field::new("f1", DataType::Int32, true), Field::new("f2", DataType::Utf8, true), diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 7ed4dae1ed08..94fb85030bf3 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -409,8 +409,7 @@ fn test_empty_offsets_list_equal() { let values = Int32Array::from(empty); let empty_offsets: [u8; 0] = []; - let a: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( - "item", + let a: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, true, )))) @@ -422,8 +421,7 @@ fn test_empty_offsets_list_equal() { .unwrap() .into(); - let b: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( - "item", + let b: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, true, )))) @@ -437,8 +435,7 @@ fn test_empty_offsets_list_equal() { test_equal(&a, &b, true); - let c: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( - "item", + let c: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, true, )))) @@ -475,8 +472,7 @@ fn test_list_null() { // a list where the nullness of values is determined by the list's bitmap let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); - let c: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( - "item", + let c: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, true, )))) @@ -498,8 +494,7 @@ fn test_list_null() { None, None, ]); - let d: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( - "item", + let d: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, true, )))) diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 08f23c200d52..c6de9f4a3417 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -600,7 +600,7 @@ fn test_list_append() { ]); let list_value_offsets = 
Buffer::from_slice_ref([0i32, 3, 5, 11, 13, 13, 15, 15, 17]); let expected_list_data = ArrayData::try_new( - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))), 8, None, 0, @@ -677,7 +677,7 @@ fn test_list_nulls_append() { let list_value_offsets = Buffer::from_slice_ref([0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); let expected_list_data = ArrayData::try_new( - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))), 12, Some(Buffer::from(&[0b11011011, 0b1110])), 0, @@ -940,7 +940,7 @@ fn test_list_of_strings_append() { ]); let list_value_offsets = Buffer::from_slice_ref([0, 3, 5, 6, 9, 10, 13]); let expected_list_data = ArrayData::try_new( - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), 6, None, 0, @@ -1141,7 +1141,7 @@ fn test_fixed_size_list_append() { Some(12), ]); let expected_fixed_size_list_data = ArrayData::try_new( - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::UInt16, true)), 2), + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::UInt16, true)), 2), 12, Some(Buffer::from(&[0b11011101, 0b101])), 0, diff --git a/arrow/tests/shrink_to_fit.rs b/arrow/tests/shrink_to_fit.rs new file mode 100644 index 000000000000..5d7c2cf98bc9 --- /dev/null +++ b/arrow/tests/shrink_to_fit.rs @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::{ + array::{Array, ArrayRef, ListArray, PrimitiveArray}, + buffer::OffsetBuffer, + datatypes::{Field, UInt8Type}, +}; + +/// Test that `shrink_to_fit` frees memory after concatenating a large number of arrays. +#[test] +fn test_shrink_to_fit_after_concat() { + let array_len = 6_000; + let num_concats = 100; + + let primitive_array: PrimitiveArray = (0..array_len) + .map(|v| (v % 255) as u8) + .collect::>() + .into(); + let primitive_array: ArrayRef = Arc::new(primitive_array); + + let list_array: ArrayRef = Arc::new(ListArray::new( + Field::new_list_field(primitive_array.data_type().clone(), false).into(), + OffsetBuffer::from_lengths([primitive_array.len()]), + primitive_array.clone(), + None, + )); + + // Num bytes allocated globally and by this thread, respectively. + let (concatenated, _bytes_allocated_globally, bytes_allocated_by_this_thread) = + memory_use(|| { + let mut concatenated = concatenate(num_concats, list_array.clone()); + concatenated.shrink_to_fit(); // This is what we're testing! 
+ dbg!(concatenated.data_type()); + concatenated + }); + let expected_len = num_concats * array_len; + assert_eq!(bytes_used(concatenated.clone()), expected_len); + eprintln!("The concatenated array is {expected_len} B long. Amount of memory used by this thread: {bytes_allocated_by_this_thread} B"); + + assert!( + expected_len <= bytes_allocated_by_this_thread, + "We must allocate at least as much space as the concatenated array" + ); + assert!( + bytes_allocated_by_this_thread <= expected_len + expected_len / 100, + "We shouldn't have more than 1% memory overhead. In fact, we are using {bytes_allocated_by_this_thread} B of memory for {expected_len} B of data" + ); +} + +fn concatenate(num_times: usize, array: ArrayRef) -> ArrayRef { + let mut concatenated = array.clone(); + for _ in 0..num_times - 1 { + concatenated = arrow::compute::kernels::concat::concat(&[&*concatenated, &*array]).unwrap(); + } + concatenated +} + +fn bytes_used(array: ArrayRef) -> usize { + let mut array = array; + loop { + match array.data_type() { + arrow::datatypes::DataType::UInt8 => break, + arrow::datatypes::DataType::List(_) => { + let list = array.as_any().downcast_ref::().unwrap(); + array = list.values().clone(); + } + _ => unreachable!(), + } + } + + array.len() +} + +// --- Memory tracking --- + +use std::{ + alloc::Layout, + sync::{ + atomic::{AtomicUsize, Ordering::Relaxed}, + Arc, + }, +}; + +static LIVE_BYTES_GLOBAL: AtomicUsize = AtomicUsize::new(0); + +thread_local! { + static LIVE_BYTES_IN_THREAD: AtomicUsize = const { AtomicUsize::new(0) } ; +} + +pub struct TrackingAllocator { + allocator: std::alloc::System, +} + +#[global_allocator] +pub static GLOBAL_ALLOCATOR: TrackingAllocator = TrackingAllocator { + allocator: std::alloc::System, +}; + +#[allow(unsafe_code)] +// SAFETY: +// We just do book-keeping and then let another allocator do all the actual work. +unsafe impl std::alloc::GlobalAlloc for TrackingAllocator { + #[allow(clippy::let_and_return)] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + // SAFETY: + // Just deferring + let ptr = unsafe { self.allocator.alloc(layout) }; + if !ptr.is_null() { + LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_add(layout.size(), Relaxed)); + LIVE_BYTES_GLOBAL.fetch_add(layout.size(), Relaxed); + } + ptr + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + LIVE_BYTES_IN_THREAD.with(|bytes| bytes.fetch_sub(layout.size(), Relaxed)); + LIVE_BYTES_GLOBAL.fetch_sub(layout.size(), Relaxed); + + // SAFETY: + // Just deferring + unsafe { self.allocator.dealloc(ptr, layout) }; + } + + // No need to override `alloc_zeroed` or `realloc`, + // since they both by default just defer to `alloc` and `dealloc`. +} + +fn live_bytes_local() -> usize { + LIVE_BYTES_IN_THREAD.with(|bytes| bytes.load(Relaxed)) +} + +fn live_bytes_global() -> usize { + LIVE_BYTES_GLOBAL.load(Relaxed) +} + +/// Returns `(num_bytes_allocated, num_bytes_allocated_by_this_thread)`. 
+fn memory_use(run: impl Fn() -> R) -> (R, usize, usize) { + let used_bytes_start_local = live_bytes_local(); + let used_bytes_start_global = live_bytes_global(); + let ret = run(); + let bytes_used_local = live_bytes_local() - used_bytes_start_local; + let bytes_used_global = live_bytes_global() - used_bytes_start_global; + (ret, bytes_used_global, bytes_used_local) +} diff --git a/dev/release/README.md b/dev/release/README.md index d2d9e48bbb6b..6e6817bffb12 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -27,7 +27,7 @@ This file documents the release process for the "Rust Arrow Crates": `arrow`, `a The Rust Arrow Crates are interconnected (e.g. `parquet` has an optional dependency on `arrow`) so we increment and release all of them together. -If any code has been merged to master that has a breaking API change, as defined +If any code has been merged to main that has a breaking API change, as defined in [Rust RFC 1105] he major version number is incremented (e.g. `9.0.2` to `10.0.2`). Otherwise the new minor version incremented (e.g. `9.0.2` to `9.1.0`). @@ -46,19 +46,19 @@ crates.io, the Rust ecosystem's package manager. We create a `CHANGELOG.md` so our users know what has been changed between releases. The CHANGELOG is created automatically using -[update_change_log.sh](https://github.com/apache/arrow-rs/blob/master/dev/release/update_change_log.sh) +[update_change_log.sh](https://github.com/apache/arrow-rs/blob/main/dev/release/update_change_log.sh) This script creates a changelog using github issues and the labels associated with them. ## Prepare CHANGELOG and version: -Now prepare a PR to update `CHANGELOG.md` and versions on `master` to reflect the planned release. +Now prepare a PR to update `CHANGELOG.md` and versions on `main` to reflect the planned release. Do this in the root of this repository. For example [#2323](https://github.com/apache/arrow-rs/pull/2323) ```bash -git checkout master +git checkout main git pull git checkout -b @@ -72,6 +72,8 @@ export ARROW_GITHUB_API_TOKEN= # manually edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog ./dev/release/update_change_log.sh +# commit the initial changes +git commit -a -m 'Create changelog' # run automated script to copy labels to issues based on referenced PRs # (NOTE 1: this must be done by a committer / other who has @@ -80,14 +82,12 @@ export ARROW_GITHUB_API_TOKEN= # NOTE 2: this must be done after creating the initial CHANGELOG file python dev/release/label_issues.py -# review change log / edit issues and labels if needed, rerun -git commit -a -m 'Create changelog' - -# Manually edit ./dev/release/update_change_log.sh to reflect the release version -# Create the changelog +# review change log / edit issues and labels if needed, rerun, repeat as necessary +# note you need to revert changes to CHANGELOG-old.md if you want to rerun the script CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh -# Review change log / edit issues and labels if needed, rerun -git commit -a -m 'Create changelog' + +# Commit the changes +git commit -a -m 'Update changelog' git push ``` @@ -96,7 +96,7 @@ Note that when reviewing the change log, rather than editing the `CHANGELOG.md`, it is preferred to update the issues and their labels (e.g. add `invalid` label to exclude them from release notes) -Merge this PR to `master` prior to the next step. +Merge this PR to `main` prior to the next step.
## Prepare release candidate tarball @@ -115,7 +115,7 @@ Create and push the tag thusly: ```shell git fetch apache -git tag apache/master +git tag apache/main # push tag to apache git push apache ``` diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh index a77ddbe75701..8b92509104c8 100755 --- a/dev/release/create-tarball.sh +++ b/dev/release/create-tarball.sh @@ -109,7 +109,7 @@ The vote will be open for at least 72 hours. [1]: https://github.com/apache/arrow-rs/tree/${release_hash} [2]: ${url} [3]: https://github.com/apache/arrow-rs/blob/${release_hash}/CHANGELOG.md -[4]: https://github.com/apache/arrow-rs/blob/master/dev/release/verify-release-candidate.sh +[4]: https://github.com/apache/arrow-rs/blob/main/dev/release/verify-release-candidate.sh MAIL echo "---------------------------------------------------------" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index ab6460659d73..d00cc498625f 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="53.1.0" -FUTURE_RELEASE="53.2.0" +SINCE_TAG="53.2.0" +FUTURE_RELEASE="53.3.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 86d1392ebf61..bcc8e0b92243 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -23,7 +23,7 @@ license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] -repository = "https://github.com/apache/arrow-rs/tree/master/object_store" +repository = "https://github.com/apache/arrow-rs/tree/main/object_store" rust-version = "1.64.0" [package.metadata.docs.rs] @@ -55,13 +55,14 @@ ring = { version = "0.17", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } tokio = { version = "1.29.0", features = ["sync", "macros", "rt", "time", "io-util"] } md-5 = { version = "0.10.6", default-features = false, optional = true } +httparse = { version = "1.8.0", default-features = false, features = ["std"], optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } [features] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] -azure = ["cloud"] +azure = ["cloud", "httparse"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud", "md-5"] http = ["cloud"] @@ -75,6 +76,10 @@ hyper-util = "0.1" http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" +regex = "1.11.1" +# The "gzip" feature for reqwest is enabled for an integration test. +reqwest = { version = "0.12", features = ["gzip"] } +http = "1.1.0" [[test]] name = "get_range_file" diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md index 4077dcad9653..912ff4cd8bac 100644 --- a/object_store/dev/release/README.md +++ b/object_store/dev/release/README.md @@ -27,7 +27,7 @@ This file documents the release process for the `object_store` crate. At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. As we are still in an early phase, we use the 0.x version scheme. 
If any code has -been merged to master that has a breaking API change, as defined in [Rust RFC 1105] +been merged to main that has a breaking API change, as defined in [Rust RFC 1105] the minor version number is incremented changed (e.g. `0.3.0` to `0.4.0`). Otherwise the patch version is incremented (e.g. `0.3.0` to `0.3.1`). @@ -45,14 +45,14 @@ crates.io, the Rust ecosystem's package manager. We create a `CHANGELOG.md` so our users know what has been changed between releases. The CHANGELOG is created automatically using -[update_change_log.sh](https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/update_change_log.sh) +[update_change_log.sh](https://github.com/apache/arrow-rs/blob/main/object_store/dev/release/update_change_log.sh) This script creates a changelog using github issues and the labels associated with them. ## Prepare CHANGELOG and version: -Now prepare a PR to update `CHANGELOG.md` and versions on `master` to reflect the planned release. +Now prepare a PR to update `CHANGELOG.md` and versions on `main` to reflect the planned release. Note this process is done in the `object_store` directory. See [#6227] for an example @@ -62,7 +62,7 @@ Note this process is done in the `object_store` directory. See [#6227] for an e # NOTE: Run commands in object_store sub directory (not main repo checkout) # cd object_store -git checkout master +git checkout main git pull git checkout -b @@ -82,7 +82,7 @@ export CHANGELOG_GITHUB_TOKEN= # Commit changes git commit -a -m 'Create changelog' -# push changes to fork and create a PR to master +# push changes to fork and create a PR to main git push ``` @@ -90,7 +90,7 @@ Note that when reviewing the change log, rather than editing the `CHANGELOG.md`, it is preferred to update the issues and their labels (e.g. add `invalid` label to exclude them from release notes) -Merge this PR to `master` prior to the next step. +Merge this PR to `main` prior to the next step. ## Prepare release candidate tarball @@ -109,7 +109,7 @@ Create and push the tag thusly: ```shell git fetch apache -git tag apache/master +git tag apache/main # push tag to apache git push apache ``` @@ -170,7 +170,7 @@ The vote will be open for at least 72 hours. [1]: https://github.com/apache/arrow-rs/tree/b945b15de9085f5961a478d4f35b0c5c3427e248 [2]: https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-0.11.1-rc1 [3]: https://github.com/apache/arrow-rs/blob/b945b15de9085f5961a478d4f35b0c5c3427e248/object_store/CHANGELOG.md -[4]: https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/verify-release-candidate.sh +[4]: https://github.com/apache/arrow-rs/blob/main/object_store/dev/release/verify-release-candidate.sh ``` For the release to become "official" it needs at least three Apache Arrow PMC members to vote +1 on it. diff --git a/object_store/dev/release/create-tarball.sh b/object_store/dev/release/create-tarball.sh index bbffde89b043..efc26fd0ef0f 100755 --- a/object_store/dev/release/create-tarball.sh +++ b/object_store/dev/release/create-tarball.sh @@ -101,7 +101,7 @@ The vote will be open for at least 72 hours. 
[1]: https://github.com/apache/arrow-rs/tree/${release_hash} [2]: ${url} [3]: https://github.com/apache/arrow-rs/blob/${release_hash}/object_store/CHANGELOG.md -[4]: https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/verify-release-candidate.sh +[4]: https://github.com/apache/arrow-rs/blob/main/object_store/dev/release/verify-release-candidate.sh MAIL echo "---------------------------------------------------------" diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index eb79f5e6dc28..840245a7b5d4 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -170,6 +170,8 @@ pub struct AmazonS3Builder { encryption_bucket_key_enabled: Option>, /// base64-encoded 256-bit customer encryption key for SSE-C. encryption_customer_key_base64: Option, + /// When set to true, charge requester for bucket operations + request_payer: ConfigValue, } /// Configuration keys for [`AmazonS3Builder`] @@ -330,6 +332,13 @@ pub enum AmazonS3ConfigKey { /// - `s3_express` S3Express, + /// Enable Support for S3 Requester Pays + /// + /// Supported keys: + /// - `aws_request_payer` + /// - `request_payer` + RequestPayer, + /// Client options Client(ClientConfigKey), @@ -358,6 +367,7 @@ impl AsRef for AmazonS3ConfigKey { Self::CopyIfNotExists => "aws_copy_if_not_exists", Self::ConditionalPut => "aws_conditional_put", Self::DisableTagging => "aws_disable_tagging", + Self::RequestPayer => "aws_request_payer", Self::Client(opt) => opt.as_ref(), Self::Encryption(opt) => opt.as_ref(), } @@ -389,6 +399,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_copy_if_not_exists" | "copy_if_not_exists" => Ok(Self::CopyIfNotExists), "aws_conditional_put" | "conditional_put" => Ok(Self::ConditionalPut), "aws_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), + "aws_request_payer" | "request_payer" => Ok(Self::RequestPayer), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), "aws_server_side_encryption" => Ok(Self::Encryption( @@ -510,6 +521,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ConditionalPut => { self.conditional_put = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::RequestPayer => { + self.request_payer = ConfigValue::Deferred(value.into()) + } AmazonS3ConfigKey::Encryption(key) => match key { S3EncryptionConfigKey::ServerSideEncryption => { self.encryption_type = Some(ConfigValue::Deferred(value.into())) @@ -567,6 +581,7 @@ impl AmazonS3Builder { self.conditional_put.as_ref().map(ToString::to_string) } AmazonS3ConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), + AmazonS3ConfigKey::RequestPayer => Some(self.request_payer.to_string()), AmazonS3ConfigKey::Encryption(key) => match key { S3EncryptionConfigKey::ServerSideEncryption => { self.encryption_type.as_ref().map(ToString::to_string) @@ -845,6 +860,14 @@ impl AmazonS3Builder { self } + /// Set whether to charge requester for bucket operations. + /// + /// + pub fn with_request_payer(mut self, enabled: bool) -> Self { + self.request_payer = ConfigValue::Parsed(enabled); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
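As a usage note for the requester-pays support introduced in the builder above, the following is a minimal sketch of configuring it (assumes the `aws` feature of `object_store`; the bucket name and region are placeholders, and credentials are assumed to come from the environment or instance metadata):

```rust
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

fn requester_pays_store() -> object_store::Result<()> {
    // Programmatic form using the new builder method...
    let _s3 = AmazonS3Builder::new()
        .with_bucket_name("example-bucket") // placeholder
        .with_region("us-east-1")           // placeholder
        .with_request_payer(true)
        .build()?;

    // ...or the equivalent string-keyed configuration, matching the new
    // `aws_request_payer` / `request_payer` keys.
    let _s3 = AmazonS3Builder::new()
        .with_bucket_name("example-bucket")
        .with_region("us-east-1")
        .with_config(AmazonS3ConfigKey::RequestPayer, "true")
        .build()?;

    Ok(())
}
```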
pub fn build(mut self) -> Result { @@ -996,6 +1019,7 @@ impl AmazonS3Builder { copy_if_not_exists, conditional_put: put_precondition, encryption_headers, + request_payer: self.request_payer.get()?, }; let client = Arc::new(S3Client::new(config)?); diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 895308f5880e..81015e82b39c 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -29,7 +29,7 @@ use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::s3::{ CompleteMultipartUpload, CompleteMultipartUploadResult, CopyPartResult, - InitiateMultipartUploadResult, ListResponse, + InitiateMultipartUploadResult, ListResponse, PartMetadata, }; use crate::client::GetOptionsExt; use crate::multipart::PartId; @@ -62,6 +62,7 @@ use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; const SHA256_CHECKSUM: &str = "x-amz-checksum-sha256"; const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; +const ALGORITHM: &str = "x-amz-checksum-algorithm"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] @@ -202,6 +203,7 @@ pub(crate) struct S3Config { pub checksum: Option, pub copy_if_not_exists: Option, pub conditional_put: Option, + pub request_payer: bool, pub(super) encryption_headers: S3EncryptionHeaders, } @@ -245,11 +247,12 @@ struct SessionCredential<'a> { config: &'a S3Config, } -impl<'a> SessionCredential<'a> { +impl SessionCredential<'_> { fn authorizer(&self) -> Option> { let mut authorizer = AwsAuthorizer::new(self.credential.as_deref()?, "s3", &self.config.region) - .with_sign_payload(self.config.sign_payload); + .with_sign_payload(self.config.sign_payload) + .with_request_payer(self.config.request_payer); if self.session_token { let token = HeaderName::from_static("x-amz-s3session-token"); @@ -288,6 +291,7 @@ pub(crate) struct Request<'a> { payload: Option, use_session_creds: bool, idempotent: bool, + retry_on_conflict: bool, retry_error_body: bool, } @@ -315,6 +319,13 @@ impl<'a> Request<'a> { Self { idempotent, ..self } } + pub(crate) fn retry_on_conflict(self, retry_on_conflict: bool) -> Self { + Self { + retry_on_conflict, + ..self + } + } + pub(crate) fn retry_error_body(self, retry_error_body: bool) -> Self { Self { retry_error_body, @@ -380,10 +391,9 @@ impl<'a> Request<'a> { let payload_sha256 = sha256.finish(); if let Some(Checksum::SHA256) = self.config.checksum { - self.builder = self.builder.header( - "x-amz-checksum-sha256", - BASE64_STANDARD.encode(payload_sha256), - ); + self.builder = self + .builder + .header(SHA256_CHECKSUM, BASE64_STANDARD.encode(payload_sha256)); } self.payload_sha256 = Some(payload_sha256); } @@ -410,6 +420,7 @@ impl<'a> Request<'a> { self.builder .with_aws_sigv4(credential.authorizer(), sha) .retryable(&self.config.retry_config) + .retry_on_conflict(self.retry_on_conflict) .idempotent(self.idempotent) .retry_error_body(self.retry_error_body) .payload(self.payload) @@ -446,6 +457,7 @@ impl S3Client { config: &self.config, use_session_creds: true, idempotent: false, + retry_on_conflict: false, retry_error_body: false, } } @@ -605,8 +617,15 @@ impl S3Client { location: &Path, opts: PutMultipartOpts, ) -> Result { - let response = self - .request(Method::POST, location) + let mut request = self.request(Method::POST, location); + if let Some(algorithm) = self.config.checksum { + match algorithm { + Checksum::SHA256 => { + request = request.header(ALGORITHM, "SHA256"); + } + } + } + let response = request 
.query(&[("uploads", "")]) .with_encryption_headers() .with_attributes(opts.attributes) @@ -657,8 +676,13 @@ impl S3Client { request = request.with_encryption_headers(); } let response = request.send().await?; + let checksum_sha256 = response + .headers() + .get(SHA256_CHECKSUM) + .and_then(|v| v.to_str().ok()) + .map(|v| v.to_string()); - let content_id = match is_copy { + let e_tag = match is_copy { false => get_etag(response.headers()).context(MetadataSnafu)?, true => { let response = response @@ -670,6 +694,17 @@ impl S3Client { response.e_tag } }; + + let content_id = if self.config.checksum == Some(Checksum::SHA256) { + let meta = PartMetadata { + e_tag, + checksum_sha256, + }; + quick_xml::se::to_string(&meta).unwrap() + } else { + e_tag + }; + Ok(PartId { content_id }) } diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 33972c6fa14a..ee2f8e2ec953 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -101,11 +101,14 @@ pub struct AwsAuthorizer<'a> { region: &'a str, token_header: Option, sign_payload: bool, + request_payer: bool, } static DATE_HEADER: HeaderName = HeaderName::from_static("x-amz-date"); static HASH_HEADER: HeaderName = HeaderName::from_static("x-amz-content-sha256"); static TOKEN_HEADER: HeaderName = HeaderName::from_static("x-amz-security-token"); +static REQUEST_PAYER_HEADER: HeaderName = HeaderName::from_static("x-amz-request-payer"); +static REQUEST_PAYER_HEADER_VALUE: HeaderValue = HeaderValue::from_static("requester"); const ALGORITHM: &str = "AWS4-HMAC-SHA256"; impl<'a> AwsAuthorizer<'a> { @@ -118,6 +121,7 @@ impl<'a> AwsAuthorizer<'a> { date: None, sign_payload: true, token_header: None, + request_payer: false, } } @@ -134,6 +138,14 @@ impl<'a> AwsAuthorizer<'a> { self } + /// Set whether to include requester pays headers + /// + /// + pub fn with_request_payer(mut self, request_payer: bool) -> Self { + self.request_payer = request_payer; + self + } + /// Authorize `request` with an optional pre-calculated SHA256 digest by attaching /// the relevant [AWS SigV4] headers /// @@ -180,6 +192,15 @@ impl<'a> AwsAuthorizer<'a> { let header_digest = HeaderValue::from_str(&digest).unwrap(); request.headers_mut().insert(&HASH_HEADER, header_digest); + if self.request_payer { + // For DELETE, GET, HEAD, POST, and PUT requests, include x-amz-request-payer : + // requester in the header + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ObjectsinRequesterPaysBuckets.html + request + .headers_mut() + .insert(&REQUEST_PAYER_HEADER, REQUEST_PAYER_HEADER_VALUE.clone()); + } + let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); let scope = self.scope(date); @@ -226,6 +247,13 @@ impl<'a> AwsAuthorizer<'a> { .append_pair("X-Amz-Expires", &expires_in.as_secs().to_string()) .append_pair("X-Amz-SignedHeaders", "host"); + if self.request_payer { + // For signed URLs, include x-amz-request-payer=requester in the request + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ObjectsinRequesterPaysBuckets.html + url.query_pairs_mut() + .append_pair("x-amz-request-payer", "requester"); + } + // For S3, you must include the X-Amz-Security-Token query parameter in the URL if // using credentials sourced from the STS service. 
if let Some(ref token) = self.credential.token { @@ -763,12 +791,53 @@ mod tests { region: "us-east-1", sign_payload: true, token_header: None, + request_payer: false, }; signer.authorize(&mut request, None); assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } + #[test] + fn test_sign_with_signed_payload_request_payer() { + let client = Client::new(); + + // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + // method = 'GET' + // service = 'ec2' + // host = 'ec2.amazonaws.com' + // region = 'us-east-1' + // endpoint = 'https://ec2.amazonaws.com' + // request_parameters = '' + let date = DateTime::parse_from_rfc3339("2022-08-06T18:01:34Z") + .unwrap() + .with_timezone(&Utc); + + let mut request = client + .request(Method::GET, "https://ec2.amazon.com/") + .build() + .unwrap(); + + let signer = AwsAuthorizer { + date: Some(date), + credential: &credential, + service: "ec2", + region: "us-east-1", + sign_payload: true, + token_header: None, + request_payer: true, + }; + + signer.authorize(&mut request, None); + assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-request-payer, Signature=7030625a9e9b57ed2a40e63d749f4a4b7714b6e15004cab026152f870dd8565d") + } + #[test] fn test_sign_with_unsigned_payload() { let client = Client::new(); @@ -802,6 +871,7 @@ mod tests { region: "us-east-1", token_header: None, sign_payload: false, + request_payer: false, }; authorizer.authorize(&mut request, None); @@ -828,6 +898,7 @@ mod tests { region: "us-east-1", token_header: None, sign_payload: false, + request_payer: false, }; let mut url = Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); @@ -848,6 +919,48 @@ mod tests { ); } + #[test] + fn signed_get_url_request_payer() { + // Values from https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + let date = DateTime::parse_from_rfc3339("2013-05-24T00:00:00Z") + .unwrap() + .with_timezone(&Utc); + + let authorizer = AwsAuthorizer { + date: Some(date), + credential: &credential, + service: "s3", + region: "us-east-1", + token_header: None, + sign_payload: false, + request_payer: true, + }; + + let mut url = Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); + authorizer.sign(Method::GET, &mut url, Duration::from_secs(86400)); + + assert_eq!( + url, + Url::parse( + "https://examplebucket.s3.amazonaws.com/test.txt?\ + X-Amz-Algorithm=AWS4-HMAC-SHA256&\ + X-Amz-Credential=AKIAIOSFODNN7EXAMPLE%2F20130524%2Fus-east-1%2Fs3%2Faws4_request&\ + X-Amz-Date=20130524T000000Z&\ + X-Amz-Expires=86400&\ + X-Amz-SignedHeaders=host&\ + x-amz-request-payer=requester&\ + X-Amz-Signature=9ad7c781cc30121f199b47d35ed3528473e4375b63c5d91cd87c927803e4e00a" + ) + .unwrap() + ); + } + #[test] fn test_sign_port() { let client = Client::new(); @@ -880,6 +993,7 @@ 
mod tests { region: "us-east-1", token_header: None, sign_payload: true, + request_payer: false, }; authorizer.authorize(&mut request, None); diff --git a/object_store/src/aws/dynamo.rs b/object_store/src/aws/dynamo.rs index ece3b8a357c6..6283e76c1f87 100644 --- a/object_store/src/aws/dynamo.rs +++ b/object_store/src/aws/dynamo.rs @@ -471,7 +471,7 @@ enum ReturnValues { /// This provides cheap, ordered serialization of maps struct Map<'a, K, V>(&'a [(K, V)]); -impl<'a, K: Serialize, V: Serialize> Serialize for Map<'a, K, V> { +impl Serialize for Map<'_, K, V> { fn serialize(&self, serializer: S) -> Result where S: Serializer, diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index b238d90eb6d7..7f449c49963c 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -136,7 +136,8 @@ impl Signer for AmazonS3 { /// ``` async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { let credential = self.credentials().get_credential().await?; - let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config.region); + let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config.region) + .with_request_payer(self.client.config.request_payer); let path_url = self.path_url(path); let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { @@ -169,10 +170,7 @@ impl ObjectStore for AmazonS3 { match (opts.mode, &self.client.config.conditional_put) { (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), - ( - PutMode::Create, - Some(S3ConditionalPut::ETagMatch | S3ConditionalPut::ETagPutIfNotExists), - ) => { + (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { match request.header(&IF_NONE_MATCH, "*").do_put().await { // Technically If-None-Match should return NotModified but some stores, // such as R2, instead return PreconditionFailed @@ -196,9 +194,26 @@ impl ObjectStore for AmazonS3 { source: "ETag required for conditional put".to_string().into(), })?; match put { - S3ConditionalPut::ETagPutIfNotExists => Err(Error::NotImplemented), S3ConditionalPut::ETagMatch => { - request.header(&IF_MATCH, etag.as_str()).do_put().await + match request + .header(&IF_MATCH, etag.as_str()) + // Real S3 will occasionally report 409 Conflict + // if there are concurrent `If-Match` requests + // in flight, so we need to be prepared to retry + // 409 responses. + .retry_on_conflict(true) + .do_put() + .await + { + // Real S3 reports NotFound rather than PreconditionFailed when the + // object doesn't exist. Convert to PreconditionFailed for + // consistency with R2. This also matches what the HTTP spec + // says the behavior should be. 
+ Err(Error::NotFound { path, source }) => { + Err(Error::Precondition { path, source }) + } + r => r, + } } S3ConditionalPut::Dynamo(d) => { d.conditional_op(&self.client, location, Some(&etag), move || { @@ -478,6 +493,66 @@ mod tests { const NON_EXISTENT_NAME: &str = "nonexistentname"; + #[tokio::test] + async fn write_multipart_file_with_signature() { + maybe_skip_integration!(); + + let store = AmazonS3Builder::from_env() + .with_checksum_algorithm(Checksum::SHA256) + .build() + .unwrap(); + + let str = "test.bin"; + let path = Path::parse(str).unwrap(); + let opts = PutMultipartOpts::default(); + let mut upload = store.put_multipart_opts(&path, opts).await.unwrap(); + + upload + .put_part(PutPayload::from(vec![0u8; 10_000_000])) + .await + .unwrap(); + upload + .put_part(PutPayload::from(vec![0u8; 5_000_000])) + .await + .unwrap(); + + let res = upload.complete().await.unwrap(); + assert!(res.e_tag.is_some(), "Should have valid etag"); + + store.delete(&path).await.unwrap(); + } + + #[tokio::test] + async fn write_multipart_file_with_signature_object_lock() { + maybe_skip_integration!(); + + let bucket = "test-object-lock"; + let store = AmazonS3Builder::from_env() + .with_bucket_name(bucket) + .with_checksum_algorithm(Checksum::SHA256) + .build() + .unwrap(); + + let str = "test.bin"; + let path = Path::parse(str).unwrap(); + let opts = PutMultipartOpts::default(); + let mut upload = store.put_multipart_opts(&path, opts).await.unwrap(); + + upload + .put_part(PutPayload::from(vec![0u8; 10_000_000])) + .await + .unwrap(); + upload + .put_part(PutPayload::from(vec![0u8; 5_000_000])) + .await + .unwrap(); + + let res = upload.complete().await.unwrap(); + assert!(res.e_tag.is_some(), "Should have valid etag"); + + store.delete(&path).await.unwrap(); + } + #[tokio::test] async fn s3_test() { maybe_skip_integration!(); @@ -486,6 +561,7 @@ mod tests { let integration = config.build().unwrap(); let config = &integration.client.config; let test_not_exists = config.copy_if_not_exists.is_some(); + let test_conditional_put = config.conditional_put.is_some(); put_get_delete_list(&integration).await; get_opts(&integration).await; @@ -494,6 +570,7 @@ mod tests { rename_and_copy(&integration).await; stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; signing(&integration).await; s3_encryption(&integration).await; put_get_attributes(&integration).await; @@ -516,9 +593,8 @@ mod tests { if test_not_exists { copy_if_not_exists(&integration).await; } - if let Some(conditional_put) = &config.conditional_put { - let supports_update = !matches!(conditional_put, S3ConditionalPut::ETagPutIfNotExists); - put_opts(&integration, supports_update).await; + if test_conditional_put { + put_opts(&integration, true).await; } // run integration test with unsigned payload enabled diff --git a/object_store/src/aws/precondition.rs b/object_store/src/aws/precondition.rs index e5058052790d..b261ad0dbfb1 100644 --- a/object_store/src/aws/precondition.rs +++ b/object_store/src/aws/precondition.rs @@ -138,17 +138,6 @@ pub enum S3ConditionalPut { /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions ETagMatch, - /// Like `ETagMatch`, but with support for `PutMode::Create` and not - /// `PutMode::Option`. - /// - /// This is the limited form of conditional put supported by Amazon S3 - /// as of August 2024 ([announcement]). - /// - /// Encoded as `etag-put-if-not-exists` ignoring whitespace. 
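To make the conditional-put behaviour above concrete, here is a minimal compare-and-swap sketch against the public `ObjectStore` API. It is illustrative only: it assumes a store configured with conditional put support (for example the S3 `etag` mode shown in this change) and an object already present at `path`.

```rust
use object_store::{
    path::Path, Error, ObjectStore, PutMode, PutOptions, PutPayload, UpdateVersion,
};

async fn cas_update(store: &dyn ObjectStore, path: &Path) -> object_store::Result<()> {
    loop {
        // Read the current object to learn its ETag / version.
        let current = store.get(path).await?;
        let version = UpdateVersion {
            e_tag: current.meta.e_tag.clone(),
            version: current.meta.version.clone(),
        };
        let opts = PutOptions::from(PutMode::Update(version));
        match store
            .put_opts(path, PutPayload::from_static(b"updated"), opts)
            .await
        {
            Ok(_) => return Ok(()),
            // Another writer changed the object in between; re-read and retry.
            Err(Error::Precondition { .. }) => continue,
            Err(e) => return Err(e),
        }
    }
}
```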
- /// - /// [announcement]: https://aws.amazon.com/about-aws/whats-new/2024/08/amazon-s3-conditional-writes/ - ETagPutIfNotExists, - /// The name of a DynamoDB table to use for coordination /// /// Encoded as either `dynamo:` or `dynamo::` @@ -164,7 +153,6 @@ impl std::fmt::Display for S3ConditionalPut { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ETagMatch => write!(f, "etag"), - Self::ETagPutIfNotExists => write!(f, "etag-put-if-not-exists"), Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), } } @@ -174,7 +162,6 @@ impl S3ConditionalPut { fn from_str(s: &str) -> Option { match s.trim() { "etag" => Some(Self::ETagMatch), - "etag-put-if-not-exists" => Some(Self::ETagPutIfNotExists), trimmed => match trimmed.split_once(':')? { ("dynamo", s) => Some(Self::Dynamo(DynamoCommit::from_str(s)?)), _ => None, diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index 1c4589ba1ec6..08c9a232393d 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -240,6 +240,14 @@ pub enum AzureConfigKey { /// - `authority_id` AuthorityId, + /// Authority host used in oauth flows + /// + /// Supported keys: + /// - `azure_storage_authority_host` + /// - `azure_authority_host` + /// - `authority_host` + AuthorityHost, + /// Shared access signature. /// /// The signature is expected to be percent-encoded, much like they are provided @@ -383,6 +391,7 @@ impl AsRef for AzureConfigKey { Self::ClientId => "azure_storage_client_id", Self::ClientSecret => "azure_storage_client_secret", Self::AuthorityId => "azure_storage_tenant_id", + Self::AuthorityHost => "azure_storage_authority_host", Self::SasKey => "azure_storage_sas_key", Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", @@ -427,6 +436,9 @@ impl FromStr for AzureConfigKey { | "azure_authority_id" | "tenant_id" | "authority_id" => Ok(Self::AuthorityId), + "azure_storage_authority_host" | "azure_authority_host" | "authority_host" => { + Ok(Self::AuthorityHost) + } "azure_storage_sas_key" | "azure_storage_sas_token" | "sas_key" | "sas_token" => { Ok(Self::SasKey) } @@ -556,6 +568,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::ClientId => self.client_id = Some(value.into()), AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), + AzureConfigKey::AuthorityHost => self.authority_host = Some(value.into()), AzureConfigKey::SasKey => self.sas_key = Some(value.into()), AzureConfigKey::Token => self.bearer_token = Some(value.into()), AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), @@ -602,6 +615,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::ClientId => self.client_id.clone(), AzureConfigKey::ClientSecret => self.client_secret.clone(), AzureConfigKey::AuthorityId => self.tenant_id.clone(), + AzureConfigKey::AuthorityHost => self.authority_host.clone(), AzureConfigKey::SasKey => self.sas_key.clone(), AzureConfigKey::Token => self.bearer_token.clone(), AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index e78f8db7a8c8..69ff39526bef 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -31,13 +31,14 @@ use crate::{ PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, RetryConfig, TagSet, }; use async_trait::async_trait; -use base64::prelude::BASE64_STANDARD; +use 
base64::prelude::{BASE64_STANDARD, BASE64_STANDARD_NO_PAD}; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use hyper::http::HeaderName; +use rand::Rng as _; use reqwest::{ - header::{HeaderValue, CONTENT_LENGTH, IF_MATCH, IF_NONE_MATCH}, + header::{HeaderMap, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE, IF_MATCH, IF_NONE_MATCH}, Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; @@ -79,6 +80,34 @@ pub(crate) enum Error { path: String, }, + #[snafu(display("Error performing bulk delete request: {}", source))] + BulkDeleteRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error receiving bulk delete request body: {}", source))] + BulkDeleteRequestBody { source: reqwest::Error }, + + #[snafu(display( + "Bulk delete request failed due to invalid input: {} (code: {})", + reason, + code + ))] + BulkDeleteRequestInvalidInput { code: String, reason: String }, + + #[snafu(display("Got invalid bulk delete response: {}", reason))] + InvalidBulkDeleteResponse { reason: String }, + + #[snafu(display( + "Bulk delete request failed for key {}: {} (code: {})", + path, + reason, + code + ))] + DeleteFailed { + path: String, + code: String, + reason: String, + }, + #[snafu(display("Error performing list request: {}", source))] ListRequest { source: crate::client::retry::Error }, @@ -247,6 +276,223 @@ impl<'a> PutRequest<'a> { } } +#[inline] +fn extend(dst: &mut Vec, data: &[u8]) { + dst.extend_from_slice(data); +} + +// Write header names as title case. The header name is assumed to be ASCII. +// We need it because Azure is not always treating headers as case insensitive. +fn title_case(dst: &mut Vec, name: &[u8]) { + dst.reserve(name.len()); + + // Ensure first character is uppercased + let mut prev = b'-'; + for &(mut c) in name { + if prev == b'-' { + c.make_ascii_uppercase(); + } + dst.push(c); + prev = c; + } +} + +fn write_headers(headers: &HeaderMap, dst: &mut Vec) { + for (name, value) in headers { + // We need special case handling here otherwise Azure returns 400 + // due to `Content-Id` instead of `Content-ID` + if name == "content-id" { + extend(dst, b"Content-ID"); + } else { + title_case(dst, name.as_str().as_bytes()); + } + extend(dst, b": "); + extend(dst, value.as_bytes()); + extend(dst, b"\r\n"); + } +} + +// https://docs.oasis-open.org/odata/odata/v4.0/errata02/os/complete/part1-protocol/odata-v4.0-errata02-os-part1-protocol-complete.html#_Toc406398359 +fn serialize_part_delete_request( + dst: &mut Vec, + boundary: &str, + idx: usize, + request: reqwest::Request, + relative_url: String, +) { + // Encode start marker for part + extend(dst, b"--"); + extend(dst, boundary.as_bytes()); + extend(dst, b"\r\n"); + + // Encode part headers + let mut part_headers = HeaderMap::new(); + part_headers.insert(CONTENT_TYPE, HeaderValue::from_static("application/http")); + part_headers.insert( + "Content-Transfer-Encoding", + HeaderValue::from_static("binary"), + ); + // Azure returns 400 if we send `Content-Id` instead of `Content-ID` + part_headers.insert("Content-ID", HeaderValue::from(idx)); + write_headers(&part_headers, dst); + extend(dst, b"\r\n"); + + // Encode the subrequest request-line + extend(dst, b"DELETE "); + extend(dst, format!("/{} ", relative_url).as_bytes()); + extend(dst, b"HTTP/1.1"); + extend(dst, b"\r\n"); + + // Encode subrequest headers + write_headers(request.headers(), dst); + extend(dst, b"\r\n"); + extend(dst, b"\r\n"); +} + +fn parse_multipart_response_boundary(response: &Response) 
-> Result { + let invalid_response = |msg: &str| Error::InvalidBulkDeleteResponse { + reason: msg.to_string(), + }; + + let content_type = response + .headers() + .get(CONTENT_TYPE) + .ok_or_else(|| invalid_response("missing Content-Type"))?; + + let boundary = content_type + .as_ref() + .strip_prefix(b"multipart/mixed; boundary=") + .ok_or_else(|| invalid_response("invalid Content-Type value"))? + .to_vec(); + + let boundary = + String::from_utf8(boundary).map_err(|_| invalid_response("invalid multipart boundary"))?; + + Ok(boundary) +} + +fn invalid_response(msg: &str) -> Error { + Error::InvalidBulkDeleteResponse { + reason: msg.to_string(), + } +} + +#[derive(Debug)] +struct MultipartField { + headers: HeaderMap, + content: Bytes, +} + +fn parse_multipart_body_fields(body: Bytes, boundary: &[u8]) -> Result> { + let start_marker = [b"--", boundary, b"\r\n"].concat(); + let next_marker = &start_marker[..start_marker.len() - 2]; + let end_marker = [b"--", boundary, b"--\r\n"].concat(); + + // There should be at most 256 responses per batch + let mut fields = Vec::with_capacity(256); + let mut remaining: &[u8] = body.as_ref(); + loop { + remaining = remaining + .strip_prefix(start_marker.as_slice()) + .ok_or_else(|| invalid_response("missing start marker for field"))?; + + // The documentation only mentions two headers for fields, we leave some extra margin + let mut scratch = [httparse::EMPTY_HEADER; 10]; + let mut headers = HeaderMap::new(); + match httparse::parse_headers(remaining, &mut scratch) { + Ok(httparse::Status::Complete((pos, headers_slice))) => { + remaining = &remaining[pos..]; + for header in headers_slice { + headers.insert( + HeaderName::from_bytes(header.name.as_bytes()).expect("valid"), + HeaderValue::from_bytes(header.value).expect("valid"), + ); + } + } + _ => return Err(invalid_response("unable to parse field headers").into()), + }; + + let next_pos = remaining + .windows(next_marker.len()) + .position(|window| window == next_marker) + .ok_or_else(|| invalid_response("early EOF while seeking to next boundary"))?; + + fields.push(MultipartField { + headers, + content: body.slice_ref(&remaining[..next_pos]), + }); + + remaining = &remaining[next_pos..]; + + // Support missing final CRLF + if remaining == end_marker || remaining == &end_marker[..end_marker.len() - 2] { + break; + } + } + Ok(fields) +} + +async fn parse_blob_batch_delete_body( + batch_body: Bytes, + boundary: String, + paths: &[Path], +) -> Result>> { + let mut results: Vec> = paths.iter().cloned().map(Ok).collect(); + + for field in parse_multipart_body_fields(batch_body, boundary.as_bytes())? 
{ + let id = field + .headers + .get("content-id") + .and_then(|v| std::str::from_utf8(v.as_bytes()).ok()) + .and_then(|v| v.parse::().ok()); + + // Parse part response headers + // Documentation mentions 5 headers and states that other standard HTTP headers + // may be provided, in order to not incurr in more complexity to support an arbitrary + // amount of headers we chose a conservative amount and error otherwise + // https://learn.microsoft.com/en-us/rest/api/storageservices/delete-blob?tabs=microsoft-entra-id#response-headers + let mut headers = [httparse::EMPTY_HEADER; 48]; + let mut part_response = httparse::Response::new(&mut headers); + match part_response.parse(&field.content) { + Ok(httparse::Status::Complete(_)) => {} + _ => return Err(invalid_response("unable to parse response").into()), + }; + + match (id, part_response.code) { + (Some(_id), Some(code)) if (200..300).contains(&code) => {} + (Some(id), Some(404)) => { + results[id] = Err(crate::Error::NotFound { + path: paths[id].as_ref().to_string(), + source: Error::DeleteFailed { + path: paths[id].as_ref().to_string(), + code: 404.to_string(), + reason: part_response.reason.unwrap_or_default().to_string(), + } + .into(), + }); + } + (Some(id), Some(code)) => { + results[id] = Err(Error::DeleteFailed { + path: paths[id].as_ref().to_string(), + code: code.to_string(), + reason: part_response.reason.unwrap_or_default().to_string(), + } + .into()); + } + (None, Some(code)) => { + return Err(Error::BulkDeleteRequestInvalidInput { + code: code.to_string(), + reason: part_response.reason.unwrap_or_default().to_string(), + } + .into()) + } + _ => return Err(invalid_response("missing part response status code").into()), + } + } + + Ok(results) +} + #[derive(Debug)] pub(crate) struct AzureClient { config: AzureConfig, @@ -311,10 +557,11 @@ impl AzureClient { pub(crate) async fn put_block( &self, path: &Path, - part_idx: usize, + _part_idx: usize, payload: PutPayload, ) -> Result { - let content_id = format!("{part_idx:20}"); + let part_idx = u128::from_be_bytes(rand::thread_rng().gen()); + let content_id = format!("{part_idx:032x}"); let block_id = BASE64_STANDARD.encode(&content_id); self.put_request(path, payload) @@ -380,6 +627,86 @@ impl AzureClient { Ok(()) } + fn build_bulk_delete_body( + &self, + boundary: &str, + paths: &[Path], + credential: &Option>, + ) -> Vec { + let mut body_bytes = Vec::with_capacity(paths.len() * 2048); + + for (idx, path) in paths.iter().enumerate() { + let url = self.config.path_url(path); + + // Build subrequest with proper authorization + let request = self + .client + .request(Method::DELETE, url) + .header(CONTENT_LENGTH, HeaderValue::from(0)) + // Each subrequest must be authorized individually [1] and we use + // the CredentialExt for this. 
+ // [1]: https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch?tabs=microsoft-entra-id#request-body + .with_azure_authorization(credential, &self.config.account) + .build() + .unwrap(); + + // Url for part requests must be relative and without base + let relative_url = self.config.service.make_relative(request.url()).unwrap(); + + serialize_part_delete_request(&mut body_bytes, boundary, idx, request, relative_url) + } + + // Encode end marker + extend(&mut body_bytes, b"--"); + extend(&mut body_bytes, boundary.as_bytes()); + extend(&mut body_bytes, b"--"); + extend(&mut body_bytes, b"\r\n"); + body_bytes + } + + pub(crate) async fn bulk_delete_request(&self, paths: Vec) -> Result>> { + if paths.is_empty() { + return Ok(Vec::new()); + } + + let credential = self.get_credential().await?; + + // https://www.ietf.org/rfc/rfc2046 + let random_bytes = rand::random::<[u8; 16]>(); // 128 bits + let boundary = format!("batch_{}", BASE64_STANDARD_NO_PAD.encode(random_bytes)); + + let body_bytes = self.build_bulk_delete_body(&boundary, &paths, &credential); + + // Send multipart request + let url = self.config.path_url(&Path::from("/")); + let batch_response = self + .client + .request(Method::POST, url) + .query(&[("restype", "container"), ("comp", "batch")]) + .header( + CONTENT_TYPE, + HeaderValue::from_str(format!("multipart/mixed; boundary={}", boundary).as_str()) + .unwrap(), + ) + .header(CONTENT_LENGTH, HeaderValue::from(body_bytes.len())) + .body(body_bytes) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(BulkDeleteRequestSnafu {})?; + + let boundary = parse_multipart_response_boundary(&batch_response)?; + + let batch_body = batch_response + .bytes() + .await + .context(BulkDeleteRequestBodySnafu {})?; + + let results = parse_blob_batch_delete_body(batch_body, boundary, &paths).await?; + + Ok(results) + } + /// Make an Azure Copy request pub(crate) async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; @@ -814,8 +1141,10 @@ pub(crate) struct UserDelegationKey { #[cfg(test)] mod tests { use bytes::Bytes; + use regex::bytes::Regex; use super::*; + use crate::StaticCredentialProvider; #[test] fn deserde_azure() { @@ -1005,4 +1334,159 @@ mod tests { let _delegated_key_response_internal: UserDelegationKey = quick_xml::de::from_str(S).unwrap(); } + + #[tokio::test] + async fn test_build_bulk_delete_body() { + let credential_provider = Arc::new(StaticCredentialProvider::new( + AzureCredential::BearerToken("static-token".to_string()), + )); + + let config = AzureConfig { + account: "testaccount".to_string(), + container: "testcontainer".to_string(), + credentials: credential_provider, + service: "http://example.com".try_into().unwrap(), + retry_config: Default::default(), + is_emulator: false, + skip_signature: false, + disable_tagging: false, + client_options: Default::default(), + }; + + let client = AzureClient::new(config).unwrap(); + + let credential = client.get_credential().await.unwrap(); + let paths = &[Path::from("a"), Path::from("b"), Path::from("c")]; + + let boundary = "batch_statictestboundary".to_string(); + + let body_bytes = client.build_bulk_delete_body(&boundary, paths, &credential); + + // Replace Date header value with a static date + let re = Regex::new("Date:[^\r]+").unwrap(); + let body_bytes = re + .replace_all(&body_bytes, b"Date: Tue, 05 Nov 2024 15:01:15 GMT") + .to_vec(); + + let expected_body = 
b"--batch_statictestboundary\r +Content-Type: application/http\r +Content-Transfer-Encoding: binary\r +Content-ID: 0\r +\r +DELETE /testcontainer/a HTTP/1.1\r +Content-Length: 0\r +Date: Tue, 05 Nov 2024 15:01:15 GMT\r +X-Ms-Version: 2023-11-03\r +Authorization: Bearer static-token\r +\r +\r +--batch_statictestboundary\r +Content-Type: application/http\r +Content-Transfer-Encoding: binary\r +Content-ID: 1\r +\r +DELETE /testcontainer/b HTTP/1.1\r +Content-Length: 0\r +Date: Tue, 05 Nov 2024 15:01:15 GMT\r +X-Ms-Version: 2023-11-03\r +Authorization: Bearer static-token\r +\r +\r +--batch_statictestboundary\r +Content-Type: application/http\r +Content-Transfer-Encoding: binary\r +Content-ID: 2\r +\r +DELETE /testcontainer/c HTTP/1.1\r +Content-Length: 0\r +Date: Tue, 05 Nov 2024 15:01:15 GMT\r +X-Ms-Version: 2023-11-03\r +Authorization: Bearer static-token\r +\r +\r +--batch_statictestboundary--\r\n" + .to_vec(); + + assert_eq!(expected_body, body_bytes); + } + + #[tokio::test] + async fn test_parse_blob_batch_delete_body() { + let response_body = b"--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed\r +Content-Type: application/http\r +Content-ID: 0\r +\r +HTTP/1.1 202 Accepted\r +x-ms-delete-type-permanent: true\r +x-ms-request-id: 778fdc83-801e-0000-62ff-0334671e284f\r +x-ms-version: 2018-11-09\r +\r +--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed\r +Content-Type: application/http\r +Content-ID: 1\r +\r +HTTP/1.1 202 Accepted\r +x-ms-delete-type-permanent: true\r +x-ms-request-id: 778fdc83-801e-0000-62ff-0334671e2851\r +x-ms-version: 2018-11-09\r +\r +--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed\r +Content-Type: application/http\r +Content-ID: 2\r +\r +HTTP/1.1 404 The specified blob does not exist.\r +x-ms-error-code: BlobNotFound\r +x-ms-request-id: 778fdc83-801e-0000-62ff-0334671e2852\r +x-ms-version: 2018-11-09\r +Content-Length: 216\r +Content-Type: application/xml\r +\r + +BlobNotFoundThe specified blob does not exist. +RequestId:778fdc83-801e-0000-62ff-0334671e2852 +Time:2018-06-14T16:46:54.6040685Z\r +--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed--\r\n"; + + let response: reqwest::Response = http::Response::builder() + .status(202) + .header("Transfer-Encoding", "chunked") + .header( + "Content-Type", + "multipart/mixed; boundary=batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed", + ) + .header("x-ms-request-id", "778fdc83-801e-0000-62ff-033467000000") + .header("x-ms-version", "2018-11-09") + .body(Bytes::from(response_body.as_slice())) + .unwrap() + .into(); + + let boundary = parse_multipart_response_boundary(&response).unwrap(); + let body = response.bytes().await.unwrap(); + + let paths = &[Path::from("a"), Path::from("b"), Path::from("c")]; + + let results = parse_blob_batch_delete_body(body, boundary, paths) + .await + .unwrap(); + + assert!(results[0].is_ok()); + assert_eq!(&paths[0], results[0].as_ref().unwrap()); + + assert!(results[1].is_ok()); + assert_eq!(&paths[1], results[1].as_ref().unwrap()); + + assert!(results[2].is_err()); + let err = results[2].as_ref().unwrap_err(); + let crate::Error::NotFound { source, .. 
} = err else { + unreachable!("must be not found") + }; + let Some(Error::DeleteFailed { path, code, reason }) = source.downcast_ref::() + else { + unreachable!("must be client error") + }; + + assert_eq!(paths[2].as_ref(), path); + assert_eq!("404", code); + assert_eq!("The specified blob does not exist.", reason); + } } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index f89a184f9523..81b6667bc058 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -30,7 +30,7 @@ use crate::{ PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, UploadPart, }; use async_trait::async_trait; -use futures::stream::BoxStream; +use futures::stream::{BoxStream, StreamExt, TryStreamExt}; use reqwest::Method; use std::fmt::Debug; use std::sync::Arc; @@ -119,6 +119,26 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + locations + .try_chunks(256) + .map(move |locations| async { + // Early return the error. We ignore the paths that have already been + // collected into the chunk. + let locations = locations.map_err(|e| e.1)?; + self.client + .bulk_delete_request(locations) + .await + .map(futures::stream::iter) + }) + .buffered(20) + .try_flatten() + .boxed() + } + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { self.client.list(prefix) } @@ -294,6 +314,7 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, false).await; signing(&integration).await; let validate = !integration.client.config().disable_tagging; diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 76d1c1f22f58..1b7ce5aa7a78 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -671,6 +671,10 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } + // Reqwest will remove the `Content-Length` header if it is configured to + // transparently decompress the body via the non-default `gzip` feature. + builder = builder.no_gzip(); + builder .https_only(!self.allow_http.get()?) .build() diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index 601bffdec158..a8a8e58de4d0 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -200,6 +200,7 @@ pub(crate) struct RetryableRequest { sensitive: bool, idempotent: Option, + retry_on_conflict: bool, payload: Option, retry_error_body: bool, @@ -217,6 +218,15 @@ impl RetryableRequest { } } + /// Set whether this request should be retried on a 409 Conflict response. 
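As a usage note for the Azure bulk-delete path wired up through `delete_stream` above, here is a minimal sketch of deleting many paths through the public streaming API. It is illustrative only; `store` can be any `ObjectStore`, and the Azure implementation now services it with batch requests of up to 256 paths, 20 in flight at a time.

```rust
use futures::{stream, StreamExt, TryStreamExt};
use object_store::{path::Path, ObjectStore};

async fn delete_many(store: &dyn ObjectStore, paths: Vec<Path>) -> object_store::Result<()> {
    // Feed the paths in as a stream; failed deletes surface as per-path errors.
    let locations = stream::iter(paths.into_iter().map(Ok)).boxed();
    let deleted: Vec<Path> = store.delete_stream(locations).try_collect().await?;
    println!("deleted {} objects", deleted.len());
    Ok(())
}
```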
+ #[cfg(feature = "aws")] + pub(crate) fn retry_on_conflict(self, retry_on_conflict: bool) -> Self { + Self { + retry_on_conflict, + ..self + } + } + /// Set whether this request contains sensitive data /// /// This will avoid printing out the URL in error messages @@ -340,7 +350,8 @@ impl RetryableRequest { let status = r.status(); if retries == max_retries || now.elapsed() > retry_timeout - || !status.is_server_error() + || !(status.is_server_error() + || (self.retry_on_conflict && status == StatusCode::CONFLICT)) { return Err(match status.is_client_error() { true => match r.text().await { @@ -467,6 +478,7 @@ impl RetryExt for reqwest::RequestBuilder { idempotent: None, payload: None, sensitive: false, + retry_on_conflict: false, retry_error_body: false, } } diff --git a/object_store/src/client/s3.rs b/object_store/src/client/s3.rs index dba752cb1251..7fe956b2376e 100644 --- a/object_store/src/client/s3.rs +++ b/object_store/src/client/s3.rs @@ -106,14 +106,32 @@ pub(crate) struct CompleteMultipartUpload { pub part: Vec, } +#[derive(Serialize, Deserialize)] +pub(crate) struct PartMetadata { + pub e_tag: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_sha256: Option, +} + impl From> for CompleteMultipartUpload { fn from(value: Vec) -> Self { let part = value .into_iter() .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, + .map(|(part_idx, part)| { + let md = match quick_xml::de::from_str::(&part.content_id) { + Ok(md) => md, + // fallback to old way + Err(_) => PartMetadata { + e_tag: part.content_id.clone(), + checksum_sha256: None, + }, + }; + MultipartPart { + e_tag: md.e_tag, + part_number: part_idx + 1, + checksum_sha256: md.checksum_sha256, + } }) .collect(); Self { part } @@ -126,6 +144,9 @@ pub(crate) struct MultipartPart { pub e_tag: String, #[serde(rename = "PartNumber")] pub part_number: usize, + #[serde(rename = "ChecksumSHA256")] + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_sha256: Option, } #[derive(Debug, Deserialize)] diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 039ec46b68c2..5199135ba6b0 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -297,6 +297,7 @@ mod test { // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; put_opts(&integration, true).await; diff --git a/object_store/src/integration.rs b/object_store/src/integration.rs index 30177878306f..20e95fddc478 100644 --- a/object_store/src/integration.rs +++ b/object_store/src/integration.rs @@ -24,6 +24,8 @@ //! //! They are intended solely for testing purposes. 
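Before the integration-test additions below, it is worth spelling out how the `PartMetadata` change above carries the optional SHA-256 checksum through the string slot that previously held only the part ETag. A minimal round-trip sketch, assuming `quick_xml` with its serde support enabled (the `serialize` feature, which is what the diff itself relies on) and `serde` with derive:

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct PartMetadata {
    e_tag: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    checksum_sha256: Option<String>,
}

fn main() {
    let meta = PartMetadata {
        e_tag: "\"9b2cf535f27731c974343645a3985328\"".to_string(),
        checksum_sha256: Some("base64-encoded-digest==".to_string()),
    };

    // Encode the metadata into the string previously holding only the ETag...
    let content_id = quick_xml::se::to_string(&meta).unwrap();

    // ...and decode it again when building CompleteMultipartUpload; anything that
    // fails to parse is treated as a plain ETag, preserving the old behaviour.
    let decoded: PartMetadata = quick_xml::de::from_str(&content_id).unwrap();
    assert_eq!(decoded, meta);
}
```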
+use core::str; + use crate::multipart::MultipartStore; use crate::path::Path; use crate::{ @@ -1109,3 +1111,88 @@ async fn delete_fixtures(storage: &DynObjectStore) { .await .unwrap(); } + +/// Tests a race condition where 2 threads are performing multipart writes to the same path +pub async fn multipart_race_condition(storage: &dyn ObjectStore, last_writer_wins: bool) { + let path = Path::from("test_multipart_race_condition"); + + let mut multipart_upload_1 = storage.put_multipart(&path).await.unwrap(); + let mut multipart_upload_2 = storage.put_multipart(&path).await.unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 0)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 0)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 1)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 1)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 2)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 2)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 3)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 3)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 4)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 4)).into()) + .await + .unwrap(); + + multipart_upload_1.complete().await.unwrap(); + + if last_writer_wins { + multipart_upload_2.complete().await.unwrap(); + } else { + let err = multipart_upload_2.complete().await.unwrap_err(); + + assert!(matches!(err, crate::Error::Generic { .. 
}), "{err}"); + } + + let get_result = storage.get(&path).await.unwrap(); + let bytes = get_result.bytes().await.unwrap(); + let string_contents = str::from_utf8(&bytes).unwrap(); + + if last_writer_wins { + assert!(string_contents.starts_with( + format!( + "2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } else { + assert!(string_contents.starts_with( + format!( + "1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } +} diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 11324b1e5b92..78fce9c26224 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1004,7 +1004,7 @@ fn get_inode(metadata: &Metadata) -> u64 { #[cfg(not(unix))] /// On platforms where an inode isn't available, fallback to just relying on size and mtime -fn get_inode(metadata: &Metadata) -> u64 { +fn get_inode(_metadata: &Metadata) -> u64 { 0 } @@ -1060,7 +1060,10 @@ mod tests { use std::fs; use futures::TryStreamExt; - use tempfile::{NamedTempFile, TempDir}; + use tempfile::TempDir; + + #[cfg(target_family = "unix")] + use tempfile::NamedTempFile; use crate::integration::*; @@ -1248,6 +1251,7 @@ mod tests { fs.list_with_delimiter(None).await.unwrap(); } + #[cfg(target_family = "unix")] async fn check_list(integration: &LocalFileSystem, prefix: Option<&Path>, expected: &[&str]) { let result: Vec<_> = integration.list(prefix).try_collect().await.unwrap(); diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index b458bdddfbf5..a467e3b88a26 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -468,12 +468,6 @@ impl InMemory { Self { storage } } - /// Creates a clone of the store - #[deprecated(note = "Use fork() instead")] - pub async fn clone(&self) -> Self { - self.fork() - } - async fn entry(&self, location: &Path) -> Result { let storage = self.storage.read(); let value = storage diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index df7097cbe9db..de2e1a75c955 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -126,7 +126,7 @@ impl From for PathPart<'static> { } } -impl<'a> AsRef for PathPart<'a> { +impl AsRef for PathPart<'_> { fn as_ref(&self) -> &str { self.raw.as_ref() } diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 9b10fea5e0bb..227887d78fd7 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -26,10 +26,6 @@ use crate::{ PutOptions, PutPayload, PutResult, Result, }; -#[doc(hidden)] -#[deprecated(note = "Use PrefixStore")] -pub type PrefixObjectStore = PrefixStore; - /// Store wrapper that applies a constant prefix to all paths handled by the store. #[derive(Debug, Clone)] pub struct PrefixStore { diff --git a/object_store/tests/http.rs b/object_store/tests/http.rs new file mode 100644 index 000000000000..a9b3145bb660 --- /dev/null +++ b/object_store/tests/http.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the HTTP store implementation + +#[cfg(feature = "http")] +use object_store::{http::HttpBuilder, path::Path, GetOptions, GetRange, ObjectStore}; + +/// Tests that even when reqwest has the `gzip` feature enabled, the HTTP store +/// does not error on a missing `Content-Length` header. +#[tokio::test] +#[cfg(feature = "http")] +async fn test_http_store_gzip() { + let http_store = HttpBuilder::new() + .with_url("https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main") + .build() + .unwrap(); + + let _ = http_store + .get_opts( + &Path::parse("LICENSE.txt").unwrap(), + GetOptions { + range: Some(GetRange::Bounded(0..100)), + ..Default::default() + }, + ) + .await + .unwrap(); +} diff --git a/parquet-testing b/parquet-testing index 550368ca77b9..4439a223a315 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 550368ca77b97231efead39251a96bd6f8f08c6e +Subproject commit 4439a223a315cf874746d3b5da25e6a6b2a2b16e diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 4064baba0947..19f890710778 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -67,7 +67,7 @@ hashbrown = { version = "0.15", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } -sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] } +sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } [dev-dependencies] diff --git a/parquet/LICENSE.txt b/parquet/LICENSE.txt new file mode 120000 index 000000000000..4ab43736a839 --- /dev/null +++ b/parquet/LICENSE.txt @@ -0,0 +1 @@ +../LICENSE.txt \ No newline at end of file diff --git a/parquet/NOTICE.txt b/parquet/NOTICE.txt new file mode 120000 index 000000000000..eb9f24e040b5 --- /dev/null +++ b/parquet/NOTICE.txt @@ -0,0 +1 @@ +../NOTICE.txt \ No newline at end of file diff --git a/parquet/README.md b/parquet/README.md index a0441ee6026d..e9f52ff279d5 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -59,7 +59,7 @@ The `parquet` crate provides the following features which may be enabled in your - `lz4` (default) - support for parquet using `lz4` compression - `zstd` (default) - support for parquet using `zstd` compression - `snap` (default) - support for parquet using `snappy` compression -- `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/master/parquet/src/bin) +- `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/main/parquet/src/bin) - `crc` - enables functionality to automatically verify checksums of each page (if present) when decoding - `experimental` - Experimental APIs which may change, even between minor releases diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index c424d000694a..e5165fee212c 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -680,7 +680,7 @@ fn create_string_list_reader( 
column_desc: ColumnDescPtr, ) -> Box { let items = create_byte_array_reader(page_iterator, column_desc); - let field = Field::new("item", DataType::Utf8, true); + let field = Field::new_list_field(DataType::Utf8, true); let data_type = DataType::List(Arc::new(field)); Box::new(ListArrayReader::::new(items, data_type, 2, 1, true)) } diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index cf39ee66f31a..bfa333db722c 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -189,17 +189,17 @@ fn create_list_primitive_bench_batch( let fields = vec![ Field::new( "_1", - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), true, ), Field::new( "_2", - DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))), true, ), Field::new( "_3", - DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, true))), + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Utf8, true))), true, ), ]; @@ -220,17 +220,17 @@ fn create_list_primitive_bench_batch_non_null( let fields = vec![ Field::new( "_1", - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))), false, ), Field::new( "_2", - DataType::List(Arc::new(Field::new("item", DataType::Boolean, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, false))), false, ), Field::new( "_3", - DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Utf8, false))), false, ), ]; @@ -274,10 +274,8 @@ fn _create_nested_bench_batch( ), Field::new( "_2", - DataType::LargeList(Arc::new(Field::new( - "item", - DataType::List(Arc::new(Field::new( - "item", + DataType::LargeList(Arc::new(Field::new_list_field( + DataType::List(Arc::new(Field::new_list_field( DataType::Struct(Fields::from(vec![ Field::new( "_1", diff --git a/parquet/examples/async_read_parquet.rs b/parquet/examples/async_read_parquet.rs index e59cad8055cb..0a2e9ba994dd 100644 --- a/parquet/examples/async_read_parquet.rs +++ b/parquet/examples/async_read_parquet.rs @@ -45,7 +45,7 @@ async fn main() -> Result<()> { builder = builder.with_projection(mask); // Highlight: set `RowFilter`, it'll push down filter predicates to skip IO and decode. - // For more specific usage: please refer to https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/src/physical_plan/file_format/parquet/row_filter.rs. + // For more specific usage: please refer to https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs. 
let scalar = Int32Array::from(vec![1]); let filter = ArrowPredicateFn::new( ProjectionMask::roots(file_metadata.schema_descr(), [0]), diff --git a/parquet/examples/write_parquet.rs b/parquet/examples/write_parquet.rs index 1b51d40c8134..ebdd9527b6f1 100644 --- a/parquet/examples/write_parquet.rs +++ b/parquet/examples/write_parquet.rs @@ -28,7 +28,7 @@ use parquet::arrow::ArrowWriter as ParquetWriter; use parquet::basic::Encoding; use parquet::errors::Result; use parquet::file::properties::{BloomFilterPosition, WriterProperties}; -use sysinfo::{MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System}; +use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System}; #[derive(ValueEnum, Clone)] enum BloomFilterPositionArg { @@ -97,8 +97,7 @@ fn main() -> Result<()> { let file = File::create(args.path).unwrap(); let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?; - let mut system = - System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything())); + let mut system = System::new_with_specifics(RefreshKind::everything()); eprintln!( "{} Writing {} batches of {} rows. RSS = {}", now(), diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 4be07ed68f1d..6b437be943d4 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -508,8 +508,7 @@ mod tests { ); // [[], [1], [2, 3], null, [4], null, [6, 7, 8]] - let data = ArrayDataBuilder::new(ArrowType::List(Arc::new(Field::new( - "item", + let data = ArrayDataBuilder::new(ArrowType::List(Arc::new(Field::new_list_field( decimals.data_type().clone(), false, )))) diff --git a/parquet/src/arrow/array_reader/fixed_size_list_array.rs b/parquet/src/arrow/array_reader/fixed_size_list_array.rs index 75099d018fc9..43a9037d4a74 100644 --- a/parquet/src/arrow/array_reader/fixed_size_list_array.rs +++ b/parquet/src/arrow/array_reader/fixed_size_list_array.rs @@ -277,7 +277,7 @@ mod tests { let mut list_array_reader = FixedSizeListArrayReader::new( Box::new(item_array_reader), 3, - ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 3), + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(ArrowType::Int32, true)), 3), 2, 1, true, @@ -323,7 +323,7 @@ mod tests { let mut list_array_reader = FixedSizeListArrayReader::new( Box::new(item_array_reader), 2, - ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 2), + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(ArrowType::Int32, true)), 2), 1, 1, false, @@ -347,9 +347,9 @@ mod tests { // [[null, null]], // ] let l2_type = - ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 2); + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(ArrowType::Int32, true)), 2); let l1_type = - ArrowType::FixedSizeList(Arc::new(Field::new("item", l2_type.clone(), false)), 1); + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(l2_type.clone(), false)), 1); let array = PrimitiveArray::::from(vec![ None, @@ -436,7 +436,7 @@ mod tests { let mut list_array_reader = FixedSizeListArrayReader::new( Box::new(item_array_reader), 0, - ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 0), + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(ArrowType::Int32, true)), 0), 2, 1, true, @@ -481,9 +481,9 @@ mod tests { None, ])); - let inner_type = 
ArrowType::List(Arc::new(Field::new("item", ArrowType::Int32, true))); + let inner_type = ArrowType::List(Arc::new(Field::new_list_field(ArrowType::Int32, true))); let list_type = - ArrowType::FixedSizeList(Arc::new(Field::new("item", inner_type.clone(), true)), 2); + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(inner_type.clone(), true)), 2); let item_array_reader = InMemoryArrayReader::new( ArrowType::Int32, @@ -534,7 +534,10 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new( "list", - ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 4), + ArrowType::FixedSizeList( + Arc::new(Field::new_list_field(ArrowType::Int32, true)), + 4, + ), true, ), Field::new("primitive", ArrowType::Int32, true), @@ -599,7 +602,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![Field::new( "list", - ArrowType::FixedSizeList(Arc::new(Field::new("item", ArrowType::Int32, true)), 4), + ArrowType::FixedSizeList(Arc::new(Field::new_list_field(ArrowType::Int32, true)), 4), true, )])); diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index ebff3286bed5..6e583ed00c19 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -265,7 +265,7 @@ mod tests { data_type: ArrowType, item_nullable: bool, ) -> ArrowType { - let field = Arc::new(Field::new("item", data_type, item_nullable)); + let field = Arc::new(Field::new_list_field(data_type, item_nullable)); GenericListArray::::DATA_TYPE_CONSTRUCTOR(field) } diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 010e9c2eed3f..a952e00e12ef 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -208,10 +208,10 @@ where // As there is not always a 1:1 mapping between Arrow and Parquet, there // are datatypes which we must convert explicitly. // These are: - // - date64: we should cast int32 to date32, then date32 to date64. - // - decimal: cast in32 to decimal, int64 to decimal + // - date64: cast int32 to date32, then date32 to date64. + // - decimal: cast int32 to decimal, int64 to decimal let array = match target_type { - ArrowType::Date64 => { + ArrowType::Date64 if *(array.data_type()) == ArrowType::Int32 => { // this is cheap as it internally reinterprets the data let a = arrow_cast::cast(&array, &ArrowType::Date32)?; arrow_cast::cast(&a, target_type)? 
@@ -305,9 +305,9 @@ mod tests { use crate::util::test_common::rand_gen::make_pages; use crate::util::InMemoryPageIterator; use arrow::datatypes::ArrowPrimitiveType; - use arrow_array::{Array, PrimitiveArray}; + use arrow_array::{Array, Date32Array, PrimitiveArray}; - use arrow::datatypes::DataType::Decimal128; + use arrow::datatypes::DataType::{Date32, Decimal128}; use rand::distributions::uniform::SampleUniform; use std::collections::VecDeque; @@ -783,4 +783,54 @@ mod tests { assert_ne!(array, &data_decimal_array) } } + + #[test] + fn test_primitive_array_reader_date32_type() { + // parquet `INT32` to date + let message_type = " + message test_schema { + REQUIRED INT32 date1 (DATE); + } + "; + let schema = parse_message_type(message_type) + .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) + .unwrap(); + let column_desc = schema.column(0); + + // create the array reader + { + let mut data = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chunks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + -99999999, + 99999999, + &mut Vec::new(), + &mut Vec::new(), + &mut data, + &mut page_lists, + true, + 2, + ); + let page_iterator = InMemoryPageIterator::new(page_lists); + + let mut array_reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); + + // read data from the reader + // the data type is date + let array = array_reader.next_batch(50).unwrap(); + assert_eq!(array.data_type(), &Date32); + let array = array.as_any().downcast_ref::().unwrap(); + let data_date_array = data[0..50] + .iter() + .copied() + .map(Some) + .collect::(); + assert_eq!(array, &data_date_array); + } + } } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index d3709c03e99a..378884a1c430 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -932,12 +932,12 @@ mod tests { use arrow_array::builder::*; use arrow_array::cast::AsArray; use arrow_array::types::{ - Decimal128Type, Decimal256Type, DecimalType, Float16Type, Float32Type, Float64Type, - Time32MillisecondType, Time64MicrosecondType, + Date32Type, Date64Type, Decimal128Type, Decimal256Type, DecimalType, Float16Type, + Float32Type, Float64Type, Time32MillisecondType, Time64MicrosecondType, }; use arrow_array::*; use arrow_buffer::{i256, ArrowNativeType, Buffer, IntervalDayTime}; - use arrow_data::ArrayDataBuilder; + use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ ArrowError, DataType as ArrowDataType, Field, Fields, Schema, SchemaRef, TimeUnit, }; @@ -1272,6 +1272,117 @@ mod tests { Ok(()) } + #[test] + fn test_date32_roundtrip() -> Result<()> { + use arrow_array::Date32Array; + + let schema = Arc::new(Schema::new(vec![Field::new( + "date32", + ArrowDataType::Date32, + false, + )])); + + let mut buf = Vec::with_capacity(1024); + + let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), None)?; + + let original = RecordBatch::try_new( + schema, + vec![Arc::new(Date32Array::from(vec![ + -1_000_000, -100_000, -10_000, -1_000, 0, 1_000, 10_000, 100_000, 1_000_000, + ]))], + )?; + + writer.write(&original)?; + writer.close()?; + + let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buf), 1024)?; + let ret = reader.next().unwrap()?; + assert_eq!(ret, original); + + // Ensure can be downcast to the correct type + ret.column(0).as_primitive::(); + + Ok(()) + } + + #[test] + fn test_date64_roundtrip() -> Result<()> { + use arrow_array::Date64Array; + + let schema = Arc::new(Schema::new(vec![ + 
Field::new("small-date64", ArrowDataType::Date64, false), + Field::new("big-date64", ArrowDataType::Date64, false), + Field::new("invalid-date64", ArrowDataType::Date64, false), + ])); + + let mut default_buf = Vec::with_capacity(1024); + let mut coerce_buf = Vec::with_capacity(1024); + + let coerce_props = WriterProperties::builder().set_coerce_types(true).build(); + + let mut default_writer = ArrowWriter::try_new(&mut default_buf, schema.clone(), None)?; + let mut coerce_writer = + ArrowWriter::try_new(&mut coerce_buf, schema.clone(), Some(coerce_props))?; + + static NUM_MILLISECONDS_IN_DAY: i64 = 1000 * 60 * 60 * 24; + + let original = RecordBatch::try_new( + schema, + vec![ + // small-date64 + Arc::new(Date64Array::from(vec![ + -1_000_000 * NUM_MILLISECONDS_IN_DAY, + -1_000 * NUM_MILLISECONDS_IN_DAY, + 0, + 1_000 * NUM_MILLISECONDS_IN_DAY, + 1_000_000 * NUM_MILLISECONDS_IN_DAY, + ])), + // big-date64 + Arc::new(Date64Array::from(vec![ + -10_000_000_000 * NUM_MILLISECONDS_IN_DAY, + -1_000_000_000 * NUM_MILLISECONDS_IN_DAY, + 0, + 1_000_000_000 * NUM_MILLISECONDS_IN_DAY, + 10_000_000_000 * NUM_MILLISECONDS_IN_DAY, + ])), + // invalid-date64 + Arc::new(Date64Array::from(vec![ + -1_000_000 * NUM_MILLISECONDS_IN_DAY + 1, + -1_000 * NUM_MILLISECONDS_IN_DAY + 1, + 1, + 1_000 * NUM_MILLISECONDS_IN_DAY + 1, + 1_000_000 * NUM_MILLISECONDS_IN_DAY + 1, + ])), + ], + )?; + + default_writer.write(&original)?; + coerce_writer.write(&original)?; + + default_writer.close()?; + coerce_writer.close()?; + + let mut default_reader = ParquetRecordBatchReader::try_new(Bytes::from(default_buf), 1024)?; + let mut coerce_reader = ParquetRecordBatchReader::try_new(Bytes::from(coerce_buf), 1024)?; + + let default_ret = default_reader.next().unwrap()?; + let coerce_ret = coerce_reader.next().unwrap()?; + + // Roundtrip should be successful when default writer used + assert_eq!(default_ret, original); + + // Only small-date64 should roundtrip successfully when coerce_types writer is used + assert_eq!(coerce_ret.column(0), original.column(0)); + assert_ne!(coerce_ret.column(1), original.column(1)); + assert_ne!(coerce_ret.column(2), original.column(2)); + + // Ensure both can be downcast to the correct type + default_ret.column(0).as_primitive::(); + coerce_ret.column(0).as_primitive::(); + + Ok(()) + } struct RandFixedLenGen {} impl RandGen for RandFixedLenGen { @@ -1542,8 +1653,7 @@ mod tests { let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); // [[], [1], [2, 3], null, [4], null, [6, 7, 8]] - let data = ArrayDataBuilder::new(ArrowDataType::List(Arc::new(Field::new( - "item", + let data = ArrayDataBuilder::new(ArrowDataType::List(Arc::new(Field::new_list_field( decimals.data_type().clone(), false, )))) @@ -2874,7 +2984,7 @@ mod tests { let arrow_field = Field::new( "emptylist", - ArrowDataType::List(Arc::new(Field::new("item", ArrowDataType::Null, true))), + ArrowDataType::List(Arc::new(Field::new_list_field(ArrowDataType::Null, true))), true, ); @@ -3346,7 +3456,7 @@ mod tests { fn test_row_group_batch(row_group_size: usize, batch_size: usize) { let schema = Arc::new(Schema::new(vec![Field::new( "list", - ArrowDataType::List(Arc::new(Field::new("item", ArrowDataType::Int32, true))), + ArrowDataType::List(Arc::new(Field::new_list_field(ArrowDataType::Int32, true))), true, )])); @@ -3584,9 +3694,7 @@ mod tests { .unwrap(); // Although `Vec>` of each row group is empty, // we should read the file successfully. 
- // FIXME: this test will fail when metadata parsing returns `None` for missing page - // indexes. https://github.com/apache/arrow-rs/issues/6447 - assert!(builder.metadata().offset_index().unwrap()[0].is_empty()); + assert!(builder.metadata().offset_index().is_none()); let reader = builder.build().unwrap(); let batches = reader.collect::, _>>().unwrap(); assert_eq!(batches.len(), 1); @@ -3905,7 +4013,7 @@ mod tests { fn test_list_selection() { let schema = Arc::new(Schema::new(vec![Field::new_list( "list", - Field::new("item", ArrowDataType::Utf8, true), + Field::new_list_field(ArrowDataType::Utf8, true), false, )])); let mut buf = Vec::with_capacity(1024); @@ -3961,7 +4069,11 @@ mod tests { let mut rng = thread_rng(); let schema = Arc::new(Schema::new(vec![Field::new_list( "list", - Field::new_list("item", Field::new("item", ArrowDataType::Int32, true), true), + Field::new_list( + Field::LIST_FIELD_DEFAULT_NAME, + Field::new_list_field(ArrowDataType::Int32, true), + true, + ), true, )])); let mut buf = Vec::with_capacity(1024); @@ -4065,4 +4177,93 @@ mod tests { } } } + + #[test] + fn test_read_old_nested_list() { + use arrow::datatypes::DataType; + use arrow::datatypes::ToByteSlice; + + let testdata = arrow::util::test_util::parquet_test_data(); + // message my_record { + // REQUIRED group a (LIST) { + // REPEATED group array (LIST) { + // REPEATED INT32 array; + // } + // } + // } + // should be read as list> + let path = format!("{testdata}/old_list_structure.parquet"); + let test_file = File::open(path).unwrap(); + + // create expected ListArray + let a_values = Int32Array::from(vec![1, 2, 3, 4]); + + // Construct a buffer for value offsets, for the nested array: [[1, 2], [3, 4]] + let a_value_offsets = arrow::buffer::Buffer::from([0, 2, 4].to_byte_slice()); + + // Construct a list array from the above two + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( + "array", + DataType::Int32, + false, + )))) + .len(2) + .add_buffer(a_value_offsets) + .add_child_data(a_values.into_data()) + .build() + .unwrap(); + let a = ListArray::from(a_list_data); + + let builder = ParquetRecordBatchReaderBuilder::try_new(test_file).unwrap(); + let mut reader = builder.build().unwrap(); + let out = reader.next().unwrap().unwrap(); + assert_eq!(out.num_rows(), 1); + assert_eq!(out.num_columns(), 1); + // grab first column + let c0 = out.column(0); + let c0arr = c0.as_any().downcast_ref::().unwrap(); + // get first row: [[1, 2], [3, 4]] + let r0 = c0arr.value(0); + let r0arr = r0.as_any().downcast_ref::().unwrap(); + assert_eq!(r0arr, &a); + } + + #[test] + fn test_map_no_value() { + // File schema: + // message schema { + // required group my_map (MAP) { + // repeated group key_value { + // required int32 key; + // optional int32 value; + // } + // } + // required group my_map_no_v (MAP) { + // repeated group key_value { + // required int32 key; + // } + // } + // required group my_list (LIST) { + // repeated group list { + // required int32 element; + // } + // } + // } + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/map_no_value.parquet"); + let file = File::open(path).unwrap(); + + let mut reader = ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap() + .build() + .unwrap(); + let out = reader.next().unwrap().unwrap(); + assert_eq!(out.num_rows(), 3); + assert_eq!(out.num_columns(), 3); + // my_map_no_v and my_list columns should now be equivalent + let c0 = out.column(1).as_list::(); + let c1 = out.column(2).as_list::(); + 
assert_eq!(c0.len(), c1.len()); + c0.iter().zip(c1.iter()).for_each(|(l, r)| assert_eq!(l, r)); + } } diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 8a7511be2afe..09f8ec7cc274 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -21,6 +21,7 @@ /// `arrow-rs/parquet/tests/arrow_reader/statistics.rs`. use crate::arrow::buffer::bit_util::sign_extend_be; use crate::arrow::parquet_column; +use crate::basic::Type as PhysicalType; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData}; @@ -318,7 +319,7 @@ make_decimal_stats_iterator!( /// data_type: The data type of the statistics (e.g. `DataType::Int32`) /// iterator: The iterator of [`ParquetStatistics`] to extract the statistics from. macro_rules! get_statistics { - ($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => { + ($stat_type_prefix: ident, $data_type: ident, $iterator: ident, $physical_type: ident) => { paste! { match $data_type { DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter( @@ -370,10 +371,11 @@ macro_rules! get_statistics { DataType::Date32 => Ok(Arc::new(Date32Array::from_iter( [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()), ))), - DataType::Date64 => Ok(Arc::new(Date64Array::from_iter( + DataType::Date64 if $physical_type == Some(PhysicalType::INT32) => Ok(Arc::new(Date64Array::from_iter( [<$stat_type_prefix Int32StatsIterator>]::new($iterator) - .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)), - ))), + .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000))))), + DataType::Date64 if $physical_type == Some(PhysicalType::INT64) => Ok(Arc::new(Date64Array::from_iter( + [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()),))), DataType::Timestamp(unit, timezone) =>{ let iter = [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()); Ok(match unit { @@ -487,7 +489,7 @@ macro_rules! get_statistics { Ok(Arc::new(arr)) }, DataType::Dictionary(_, value_type) => { - [<$stat_type_prefix:lower _ statistics>](value_type, $iterator) + [<$stat_type_prefix:lower _ statistics>](value_type, $iterator, $physical_type) }, DataType::Utf8View => { let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); @@ -524,6 +526,7 @@ macro_rules! get_statistics { DataType::Map(_,_) | DataType::Duration(_) | DataType::Interval(_) | + DataType::Date64 | // required to cover $physical_type match guard DataType::Null | DataType::List(_) | DataType::ListView(_) | @@ -790,7 +793,7 @@ get_decimal_page_stats_iterator!( ); macro_rules! get_data_page_statistics { - ($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => { + ($stat_type_prefix: ident, $data_type: ident, $iterator: ident, $physical_type: ident) => { paste! { match $data_type { DataType::Boolean => { @@ -929,7 +932,7 @@ macro_rules! get_data_page_statistics { Ok(Arc::new(builder.finish())) }, DataType::Dictionary(_, value_type) => { - [<$stat_type_prefix:lower _ page_statistics>](value_type, $iterator) + [<$stat_type_prefix:lower _ page_statistics>](value_type, $iterator, $physical_type) }, DataType::Timestamp(unit, timezone) => { let iter = [<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten(); @@ -941,7 +944,7 @@ macro_rules! 
get_data_page_statistics { }) }, DataType::Date32 => Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))), - DataType::Date64 => Ok( + DataType::Date64 if $physical_type == Some(PhysicalType::INT32)=> Ok( Arc::new( Date64Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator) .map(|x| { @@ -954,6 +957,7 @@ macro_rules! get_data_page_statistics { ) ) ), + DataType::Date64 if $physical_type == Some(PhysicalType::INT64) => Ok(Arc::new(Date64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))), DataType::Decimal128(precision, scale) => Ok(Arc::new( Decimal128Array::from_iter([<$stat_type_prefix Decimal128DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)), DataType::Decimal256(precision, scale) => Ok(Arc::new( @@ -1040,6 +1044,7 @@ macro_rules! get_data_page_statistics { } Ok(Arc::new(builder.finish())) }, + DataType::Date64 | // required to cover $physical_type match guard DataType::Null | DataType::Duration(_) | DataType::Interval(_) | @@ -1067,8 +1072,9 @@ macro_rules! get_data_page_statistics { fn min_statistics<'a, I: Iterator>>( data_type: &DataType, iterator: I, + physical_type: Option, ) -> Result { - get_statistics!(Min, data_type, iterator) + get_statistics!(Min, data_type, iterator, physical_type) } /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`] @@ -1077,26 +1083,35 @@ fn min_statistics<'a, I: Iterator>>( fn max_statistics<'a, I: Iterator>>( data_type: &DataType, iterator: I, + physical_type: Option, ) -> Result { - get_statistics!(Max, data_type, iterator) + get_statistics!(Max, data_type, iterator, physical_type) } /// Extracts the min statistics from an iterator /// of parquet page [`Index`]'es to an [`ArrayRef`] -pub(crate) fn min_page_statistics<'a, I>(data_type: &DataType, iterator: I) -> Result +pub(crate) fn min_page_statistics<'a, I>( + data_type: &DataType, + iterator: I, + physical_type: Option, +) -> Result where I: Iterator, { - get_data_page_statistics!(Min, data_type, iterator) + get_data_page_statistics!(Min, data_type, iterator, physical_type) } /// Extracts the max statistics from an iterator /// of parquet page [`Index`]'es to an [`ArrayRef`] -pub(crate) fn max_page_statistics<'a, I>(data_type: &DataType, iterator: I) -> Result +pub(crate) fn max_page_statistics<'a, I>( + data_type: &DataType, + iterator: I, + physical_type: Option, +) -> Result where I: Iterator, { - get_data_page_statistics!(Max, data_type, iterator) + get_data_page_statistics!(Max, data_type, iterator, physical_type) } /// Extracts the null count statistics from an iterator @@ -1177,6 +1192,8 @@ pub struct StatisticsConverter<'a> { arrow_field: &'a Field, /// treat missing null_counts as 0 nulls missing_null_counts_as_zero: bool, + /// The physical type of the matched column in the Parquet schema + physical_type: Option, } impl<'a> StatisticsConverter<'a> { @@ -1304,6 +1321,7 @@ impl<'a> StatisticsConverter<'a> { parquet_column_index: parquet_index, arrow_field, missing_null_counts_as_zero: true, + physical_type: parquet_index.map(|idx| parquet_schema.column(idx).physical_type()), }) } @@ -1346,7 +1364,7 @@ impl<'a> StatisticsConverter<'a> { /// // get the minimum value for the column "foo" in the parquet file /// let min_values: ArrayRef = converter /// .row_group_mins(metadata.row_groups().iter()) - /// .unwrap(); + /// .unwrap(); /// // if "foo" is a Float64 value, the 
returned array will contain Float64 values /// assert_eq!(min_values, Arc::new(Float64Array::from(vec![Some(1.0), Some(2.0)])) as _); /// ``` @@ -1363,7 +1381,7 @@ impl<'a> StatisticsConverter<'a> { let iter = metadatas .into_iter() .map(|x| x.column(parquet_index).statistics()); - min_statistics(data_type, iter) + min_statistics(data_type, iter, self.physical_type) } /// Extract the maximum values from row group statistics in [`RowGroupMetaData`] @@ -1382,7 +1400,7 @@ impl<'a> StatisticsConverter<'a> { let iter = metadatas .into_iter() .map(|x| x.column(parquet_index).statistics()); - max_statistics(data_type, iter) + max_statistics(data_type, iter, self.physical_type) } /// Extract the null counts from row group statistics in [`RowGroupMetaData`] @@ -1490,7 +1508,7 @@ impl<'a> StatisticsConverter<'a> { (*num_data_pages, column_page_index_per_row_group_per_column) }); - min_page_statistics(data_type, iter) + min_page_statistics(data_type, iter, self.physical_type) } /// Extract the maximum values from Data Page statistics. @@ -1521,7 +1539,7 @@ impl<'a> StatisticsConverter<'a> { (*num_data_pages, column_page_index_per_row_group_per_column) }); - max_page_statistics(data_type, iter) + max_page_statistics(data_type, iter, self.physical_type) } /// Returns a [`UInt64Array`] with null counts for each data page. diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 3e828bbddd17..e4662b8f316c 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -632,7 +632,7 @@ mod tests { // based on the example at https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html // [[a, b, c], [d, e, f, g]], [[h], [i,j]] - let leaf_type = Field::new("item", DataType::Int32, false); + let leaf_type = Field::new_list_field(DataType::Int32, false); let inner_type = DataType::List(Arc::new(leaf_type)); let inner_field = Field::new("l2", inner_type.clone(), false); let outer_type = DataType::List(Arc::new(inner_field)); @@ -676,7 +676,7 @@ mod tests { fn test_calculate_one_level_1() { // This test calculates the levels for a non-null primitive array let array = Arc::new(Int32Array::from_iter(0..10)) as ArrayRef; - let field = Field::new("item", DataType::Int32, false); + let field = Field::new_list_field(DataType::Int32, false); let levels = calculate_array_levels(&array, &field).unwrap(); assert_eq!(levels.len(), 1); @@ -702,7 +702,7 @@ mod tests { Some(0), None, ])) as ArrayRef; - let field = Field::new("item", DataType::Int32, true); + let field = Field::new_list_field(DataType::Int32, true); let levels = calculate_array_levels(&array, &field).unwrap(); assert_eq!(levels.len(), 1); @@ -720,7 +720,7 @@ mod tests { #[test] fn test_calculate_array_levels_1() { - let leaf_field = Field::new("item", DataType::Int32, false); + let leaf_field = Field::new_list_field(DataType::Int32, false); let list_type = DataType::List(Arc::new(leaf_field)); // if all array values are defined (e.g. 
batch>) @@ -1046,7 +1046,7 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from_iter([0_i32, 1, 3, 3, 6, 10]); - let a_list_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let a_list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let a_list_data = ArrayData::builder(a_list_type.clone()) .len(5) .add_buffer(a_value_offsets) @@ -1059,7 +1059,7 @@ mod tests { let a = ListArray::from(a_list_data); - let item_field = Field::new("item", a_list_type, true); + let item_field = Field::new_list_field(a_list_type, true); let mut builder = levels(&item_field, a); builder.write(2..4); let levels = builder.finish(); @@ -1334,7 +1334,7 @@ mod tests { // define schema let int_field = Field::new("a", DataType::Int32, true); let fields = Fields::from([Arc::new(int_field)]); - let item_field = Field::new("item", DataType::Struct(fields.clone()), true); + let item_field = Field::new_list_field(DataType::Struct(fields.clone()), true); let list_field = Field::new("list", DataType::List(Arc::new(item_field)), true); let int_builder = Int32Builder::with_capacity(10); @@ -1568,7 +1568,7 @@ mod tests { let a = builder.finish(); let values = a.values().clone(); - let item_field = Field::new("item", a.data_type().clone(), true); + let item_field = Field::new_list_field(a.data_type().clone(), true); let mut builder = levels(&item_field, a); builder.write(1..4); let levels = builder.finish(); @@ -1594,7 +1594,7 @@ mod tests { let field_a = Field::new("a", DataType::Int32, true); let field_b = Field::new("b", DataType::Int64, false); let fields = Fields::from([Arc::new(field_a), Arc::new(field_b)]); - let item_field = Field::new("item", DataType::Struct(fields.clone()), true); + let item_field = Field::new_list_field(DataType::Struct(fields.clone()), true); let list_field = Field::new( "list", DataType::FixedSizeList(Arc::new(item_field), 2), @@ -1758,7 +1758,7 @@ mod tests { let array = builder.finish(); let values = array.values().clone(); - let item_field = Field::new("item", array.data_type().clone(), true); + let item_field = Field::new_list_field(array.data_type().clone(), true); let mut builder = levels(&item_field, array); builder.write(0..3); let levels = builder.finish(); @@ -1797,7 +1797,7 @@ mod tests { let a = builder.finish(); let values = a.values().as_list::().values().clone(); - let item_field = Field::new("item", a.data_type().clone(), true); + let item_field = Field::new_list_field(a.data_type().clone(), true); let mut builder = levels(&item_field, a); builder.write(0..4); let levels = builder.finish(); @@ -1827,7 +1827,7 @@ mod tests { // [NULL, NULL, 3, 0] let dict = DictionaryArray::new(keys, Arc::new(values)); - let item_field = Field::new("item", dict.data_type().clone(), true); + let item_field = Field::new_list_field(dict.data_type().clone(), true); let mut builder = levels(&item_field, dict.clone()); builder.write(0..4); @@ -1846,7 +1846,7 @@ mod tests { #[test] fn mismatched_types() { let array = Arc::new(Int32Array::from_iter(0..10)) as ArrayRef; - let field = Field::new("item", DataType::Float64, false); + let field = Field::new_list_field(DataType::Float64, false); let err = LevelInfoBuilder::try_new(&field, Default::default(), &array) .unwrap_err() diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 99d54eef3bb5..871b140768cb 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ 
b/parquet/src/arrow/arrow_writer/mod.rs @@ -30,12 +30,10 @@ use arrow_array::types::*; use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter}; use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef}; -use super::schema::{ - add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, - arrow_to_parquet_schema_with_root, decimal_length_from_precision, -}; +use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision}; use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder; +use crate::arrow::ArrowSchemaConverter; use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; use crate::column::writer::encoder::ColumnValueEncoder; use crate::column::writer::{ @@ -180,11 +178,12 @@ impl ArrowWriter { arrow_schema: SchemaRef, options: ArrowWriterOptions, ) -> Result { - let schema = match options.schema_root { - Some(s) => arrow_to_parquet_schema_with_root(&arrow_schema, &s)?, - None => arrow_to_parquet_schema(&arrow_schema)?, - }; let mut props = options.properties; + let mut converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types()); + if let Some(schema_root) = &options.schema_root { + converter = converter.schema_root(schema_root); + } + let schema = converter.convert(&arrow_schema)?; if !options.skip_arrow_metadata { // add serialized arrow schema add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props); @@ -390,9 +389,9 @@ impl ArrowWriterOptions { } /// Set the name of the root parquet schema element (defaults to `"arrow_schema"`) - pub fn with_schema_root(self, name: String) -> Self { + pub fn with_schema_root(self, schema_root: String) -> Self { Self { - schema_root: Some(name), + schema_root: Some(schema_root), ..self } } @@ -538,7 +537,7 @@ impl ArrowColumnChunk { /// # use std::sync::Arc; /// # use arrow_array::*; /// # use arrow_schema::*; -/// # use parquet::arrow::arrow_to_parquet_schema; +/// # use parquet::arrow::ArrowSchemaConverter; /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers}; /// # use parquet::file::properties::WriterProperties; /// # use parquet::file::writer::SerializedFileWriter; @@ -549,8 +548,11 @@ impl ArrowColumnChunk { /// ])); /// /// // Compute the parquet schema -/// let parquet_schema = arrow_to_parquet_schema(schema.as_ref()).unwrap(); /// let props = Arc::new(WriterProperties::default()); +/// let parquet_schema = ArrowSchemaConverter::new() +/// .with_coerce_types(props.coerce_types()) +/// .convert(&schema) +/// .unwrap(); /// /// // Create writers for each of the leaf columns /// let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap(); @@ -858,6 +860,12 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result { match column.data_type() { + ArrowDataType::Date64 => { + let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; + + let array = array.as_primitive::(); + write_primitive(typed, array.values(), levels) + } ArrowDataType::Int64 => { let array = column.as_primitive::(); write_primitive(typed, array.values(), levels) @@ -1082,6 +1090,7 @@ mod tests { use arrow::datatypes::ToByteSlice; use arrow::datatypes::{DataType, Schema}; use arrow::error::Result as ArrowResult; + use arrow::util::data_gen::create_random_array; use arrow::util::pretty::pretty_format_batches; use arrow::{array::*, buffer::Buffer}; use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer}; @@ -1194,7 +1203,7 @@ mod tests { // define schema let schema = 
Schema::new(vec![Field::new( "a", - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))), true, )]); @@ -1206,8 +1215,7 @@ mod tests { let a_value_offsets = arrow::buffer::Buffer::from([0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( - "item", + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, false, )))) @@ -1234,7 +1242,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))), false, )]); @@ -1246,8 +1254,7 @@ mod tests { let a_value_offsets = arrow::buffer::Buffer::from([0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( - "item", + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, false, )))) @@ -1365,12 +1372,12 @@ mod tests { let struct_field_f = Arc::new(Field::new("f", DataType::Float32, true)); let struct_field_g = Arc::new(Field::new_list( "g", - Field::new("item", DataType::Int16, true), + Field::new_list_field(DataType::Int16, true), false, )); let struct_field_h = Arc::new(Field::new_list( "h", - Field::new("item", DataType::Int16, false), + Field::new_list_field(DataType::Int16, false), true, )); let struct_field_e = Arc::new(Field::new_struct( @@ -1743,7 +1750,7 @@ mod tests { "Expected a dictionary page" ); - let offset_indexes = read_offset_indexes(&file, column).unwrap(); + let offset_indexes = read_offset_indexes(&file, column).unwrap().unwrap(); let page_locations = offset_indexes[0].page_locations.clone(); @@ -2377,7 +2384,7 @@ mod tests { #[test] fn null_list_single_column() { - let null_field = Field::new("item", DataType::Null, true); + let null_field = Field::new_list_field(DataType::Null, true); let list_field = Field::new("emptylist", DataType::List(Arc::new(null_field)), true); let schema = Schema::new(vec![list_field]); @@ -2385,8 +2392,7 @@ mod tests { // Build [[], null, [null, null]] let a_values = NullArray::new(2); let a_value_offsets = arrow::buffer::Buffer::from([0, 0, 0, 2].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( - "item", + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new_list_field( DataType::Null, true, )))) @@ -2415,8 +2421,7 @@ mod tests { fn list_single_column() { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from([0, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( - "item", + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new_list_field( DataType::Int32, false, )))) @@ -2489,6 +2494,56 @@ mod tests { one_column_roundtrip(values, false); } + #[test] + fn list_and_map_coerced_names() { + // Create map and list with non-Parquet naming + let list_field = + Field::new_list("my_list", Field::new("item", DataType::Int32, false), false); + let map_field = Field::new_map( + "my_map", + "entries", + Field::new("keys", DataType::Int32, false), + Field::new("values", DataType::Int32, true), + false, + true, + ); + + let list_array = create_random_array(&list_field, 
100, 0.0, 0.0).unwrap(); + let map_array = create_random_array(&map_field, 100, 0.0, 0.0).unwrap(); + + let arrow_schema = Arc::new(Schema::new(vec![list_field, map_field])); + + // Write data to Parquet but coerce names to match spec + let props = Some(WriterProperties::builder().set_coerce_types(true).build()); + let file = tempfile::tempfile().unwrap(); + let mut writer = + ArrowWriter::try_new(file.try_clone().unwrap(), arrow_schema.clone(), props).unwrap(); + + let batch = RecordBatch::try_new(arrow_schema, vec![list_array, map_array]).unwrap(); + writer.write(&batch).unwrap(); + let file_metadata = writer.close().unwrap(); + + // Coerced name of "item" should be "element" + assert_eq!(file_metadata.schema[3].name, "element"); + // Coerced name of "entries" should be "key_value" + assert_eq!(file_metadata.schema[5].name, "key_value"); + // Coerced name of "keys" should be "key" + assert_eq!(file_metadata.schema[6].name, "key"); + // Coerced name of "values" should be "value" + assert_eq!(file_metadata.schema[7].name, "value"); + + // Double check schema after reading from the file + let reader = SerializedFileReader::new(file).unwrap(); + let file_schema = reader.metadata().file_metadata().schema(); + let fields = file_schema.get_fields(); + let list_field = &fields[0].get_fields()[0]; + assert_eq!(list_field.get_fields()[0].name(), "element"); + let map_field = &fields[1].get_fields()[0]; + assert_eq!(map_field.name(), "key_value"); + assert_eq!(map_field.get_fields()[0].name(), "key"); + assert_eq!(map_field.get_fields()[1].name(), "value"); + } + #[test] fn fallback_flush_data_page() { //tests if the Fallback::flush_data_page clears all buffers correctly @@ -2534,6 +2589,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -2555,6 +2611,7 @@ mod tests { #[test] fn arrow_writer_primitive_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), @@ -2577,6 +2634,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary_unsigned_index() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index b19f9830a7c9..526818845b5c 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -119,7 +119,7 @@ impl MetadataLoader { return Err(ParquetError::EOF(format!( "file size of {} is less than footer + metadata {}", file_size, - length + 8 + length + FOOTER_SIZE ))); } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 8b315cc9f784..c408456df147 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -158,7 +158,8 @@ pub trait AsyncFileReader: Send { fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; } -impl AsyncFileReader for Box { +/// This allows Box to be used as an AsyncFileReader, +impl AsyncFileReader for Box { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { self.as_mut().get_bytes(range) } @@ -927,7 +928,6 @@ mod tests { use 
crate::arrow::schema::parquet_to_arrow_schema_and_fields; use crate::arrow::ArrowWriter; use crate::file::metadata::ParquetMetaDataReader; - use crate::file::page_index::index_reader; use crate::file::properties::WriterProperties; use arrow::compute::kernels::cmp::eq; use arrow::error::Result as ArrowResult; @@ -1565,12 +1565,11 @@ mod tests { let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = ParquetMetaDataReader::new() + .with_page_indexes(true) .parse_and_finish(&data) .unwrap(); - let offset_index = - index_reader::read_offset_indexes(&data, metadata.row_group(0).columns()) - .expect("reading offset index"); + let offset_index = metadata.offset_index().expect("reading offset index")[0].clone(); let mut metadata_builder = metadata.into_builder(); let mut row_groups = metadata_builder.take_row_groups(); @@ -1870,7 +1869,7 @@ mod tests { async fn test_nested_skip() { let schema = Arc::new(Schema::new(vec![ Field::new("col_1", DataType::UInt64, false), - Field::new_list("col_2", Field::new("item", DataType::Utf8, true), true), + Field::new_list("col_2", Field::new_list_field(DataType::Utf8, true), true), ])); // Default writer properties diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index 8155b57d9ac6..c04d5710a971 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -89,7 +89,7 @@ pub trait AsyncFileWriter: Send { fn complete(&mut self) -> BoxFuture<'_, Result<()>>; } -impl AsyncFileWriter for Box { +impl AsyncFileWriter for Box { fn write(&mut self, bs: Bytes) -> BoxFuture<'_, Result<()>> { self.as_mut().write(bs) } diff --git a/parquet/src/arrow/buffer/view_buffer.rs b/parquet/src/arrow/buffer/view_buffer.rs index 2256f4877d68..fd7d6c213f04 100644 --- a/parquet/src/arrow/buffer/view_buffer.rs +++ b/parquet/src/arrow/buffer/view_buffer.rs @@ -130,7 +130,7 @@ mod tests { #[test] fn test_view_buffer_append_view() { let mut buffer = ViewBuffer::default(); - let string_buffer = Buffer::from(&b"0123456789long string to test string view"[..]); + let string_buffer = Buffer::from(b"0123456789long string to test string view"); let block_id = buffer.append_block(string_buffer); unsafe { @@ -157,7 +157,7 @@ mod tests { #[test] fn test_view_buffer_pad_null() { let mut buffer = ViewBuffer::default(); - let string_buffer = Buffer::from(&b"0123456789long string to test string view"[..]); + let string_buffer = Buffer::from(b"0123456789long string to test string view"); let block_id = buffer.append_block(string_buffer); unsafe { diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 2d09cd19203f..d77436bc1ff7 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -116,9 +116,13 @@ pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::SchemaDescriptor; use arrow_schema::{FieldRef, Schema}; +// continue to export deprecated methods until they are removed +#[allow(deprecated)] +pub use self::schema::arrow_to_parquet_schema; + pub use self::schema::{ - arrow_to_parquet_schema, parquet_to_arrow_field_levels, parquet_to_arrow_schema, - parquet_to_arrow_schema_by_columns, FieldLevels, + parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, + ArrowSchemaConverter, FieldLevels, }; /// Schema metadata key used to store serialized Arrow IPC schema diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index e487feabb848..16d46bd852dc 100644 --- a/parquet/src/arrow/schema/complex.rs +++ 
b/parquet/src/arrow/schema/complex.rs @@ -271,8 +271,13 @@ impl Visitor { return Err(arrow_err!("Child of map field must be repeated")); } + // According to the specification the values are optional (#1642). + // In this case, return the keys as a list. + if map_key_value.get_fields().len() == 1 { + return self.visit_list(map_type, context); + } + if map_key_value.get_fields().len() != 2 { - // According to the specification the values are optional (#1642) return Err(arrow_err!( "Child of map field must have two children, found {}", map_key_value.get_fields().len() @@ -448,15 +453,21 @@ impl Visitor { }; } + // test to see if the repeated field is a struct or one-tuple let items = repeated_field.get_fields(); if items.len() != 1 - || repeated_field.name() == "array" - || repeated_field.name() == format!("{}_tuple", list_type.name()) + || (!repeated_field.is_list() + && !repeated_field.has_single_repeated_child() + && (repeated_field.name() == "array" + || repeated_field.name() == format!("{}_tuple", list_type.name()))) { - // If the repeated field is a group with multiple fields, then its type is the element type and elements are required. + // If the repeated field is a group with multiple fields, then its type is the element + // type and elements are required. // - // If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name - // with _tuple appended then the repeated type is the element type and elements are required. + // If the repeated field is a group with one field and is named either array or uses + // the LIST-annotated group's name with _tuple appended then the repeated type is the + // element type and elements are required. But this rule only applies if the + // repeated field is not annotated, and the single child field is not `repeated`. let context = VisitorContext { rep_level: context.rep_level, def_level, @@ -541,8 +552,11 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<& match arrow_hint { Some(hint) => { // If the inferred type is a dictionary, preserve dictionary metadata + #[allow(deprecated)] let field = match (&data_type, hint.dict_id(), hint.dict_is_ordered()) { - (DataType::Dictionary(_, _), Some(id), Some(ordered)) => { + (DataType::Dictionary(_, _), Some(id), Some(ordered)) => + { + #[allow(deprecated)] Field::new_dict(name, data_type, nullable, id, ordered) } _ => Field::new(name, data_type, nullable), diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 3ed3bd24e0a8..5d3d7b2a6541 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -15,13 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Provides API for converting parquet schema to arrow schema and vice versa. -//! -//! The main interfaces for converting parquet schema to arrow schema are -//! `parquet_to_arrow_schema`, `parquet_to_arrow_schema_by_columns` and -//! `parquet_to_arrow_field`. -//! -//! The interfaces for converting arrow schema to parquet schema is coming. +//! 
Converting Parquet schema <--> Arrow schema: [`ArrowSchemaConverter`] and [parquet_to_arrow_schema] use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -178,6 +172,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { /// Encodes the Arrow schema into the IPC format, and base64 encodes it fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(true, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); @@ -225,23 +220,134 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut } } +/// Converter for Arrow schema to Parquet schema +/// +/// Example: +/// ``` +/// # use std::sync::Arc; +/// # use arrow_schema::{Field, Schema, DataType}; +/// # use parquet::arrow::ArrowSchemaConverter; +/// use parquet::schema::types::{SchemaDescriptor, Type}; +/// use parquet::basic; // note there are two `Type`s in the following example +/// // create an Arrow Schema +/// let arrow_schema = Schema::new(vec![ +/// Field::new("a", DataType::Int64, true), +/// Field::new("b", DataType::Date32, true), +/// ]); +/// // convert the Arrow schema to a Parquet schema +/// let parquet_schema = ArrowSchemaConverter::new() +/// .convert(&arrow_schema) +/// .unwrap(); +/// +/// let expected_parquet_schema = SchemaDescriptor::new( +/// Arc::new( +/// Type::group_type_builder("arrow_schema") +/// .with_fields(vec![ +/// Arc::new( +/// Type::primitive_type_builder("a", basic::Type::INT64) +/// .build().unwrap() +/// ), +/// Arc::new( +/// Type::primitive_type_builder("b", basic::Type::INT32) +/// .with_converted_type(basic::ConvertedType::DATE) +/// .with_logical_type(Some(basic::LogicalType::Date)) +/// .build().unwrap() +/// ), +/// ]) +/// .build().unwrap() +/// ) +/// ); +/// assert_eq!(parquet_schema, expected_parquet_schema); +/// ``` +#[derive(Debug)] +pub struct ArrowSchemaConverter<'a> { + /// Name of the root schema in Parquet + schema_root: &'a str, + /// Should we coerce Arrow types to compatible Parquet types? + /// + /// See docs on [Self::with_coerce_types]` + coerce_types: bool, +} + +impl Default for ArrowSchemaConverter<'_> { + fn default() -> Self { + Self::new() + } +} + +impl<'a> ArrowSchemaConverter<'a> { + /// Create a new converter + pub fn new() -> Self { + Self { + schema_root: "arrow_schema", + coerce_types: false, + } + } + + /// Should Arrow types be coerced into Parquet native types (default `false`). + /// + /// Setting this option to `true` will result in Parquet files that can be + /// read by more readers, but may lose precision for Arrow types such as + /// [`DataType::Date64`] which have no direct [corresponding Parquet type]. + /// + /// By default, this converter does not coerce to native Parquet types. Enabling type + /// coercion allows for meaningful representations that do not require + /// downstream readers to consider the embedded Arrow schema, and can allow + /// for greater compatibility with other Parquet implementations. However, + /// type coercion also prevents data from being losslessly round-tripped. + /// + /// # Discussion + /// + /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no + /// corresponding Parquet logical type. Thus, they can not be losslessly + /// round-tripped when stored using the appropriate Parquet logical type. 
+ /// For example, some Date64 values may be truncated when stored with + /// parquet's native 32 bit date type. + /// + /// For [`List`] and [`Map`] types, some Parquet readers expect certain + /// schema elements to have specific names (earlier versions of the spec + /// were somewhat ambiguous on this point). Type coercion will use the names + /// prescribed by the Parquet specification, potentially losing naming + /// metadata from the Arrow schema. + /// + /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + /// [corresponding Parquet type]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date + /// + pub fn with_coerce_types(mut self, coerce_types: bool) -> Self { + self.coerce_types = coerce_types; + self + } + + /// Set the root schema element name (defaults to `"arrow_schema"`). + pub fn schema_root(mut self, schema_root: &'a str) -> Self { + self.schema_root = schema_root; + self + } + + /// Convert the specified Arrow [`Schema`] to the desired Parquet [`SchemaDescriptor`] + /// + /// See example in [`ArrowSchemaConverter`] + pub fn convert(&self, schema: &Schema) -> Result { + let fields = schema + .fields() + .iter() + .map(|field| arrow_to_parquet_type(field, self.coerce_types).map(Arc::new)) + .collect::>()?; + let group = Type::group_type_builder(self.schema_root) + .with_fields(fields) + .build()?; + Ok(SchemaDescriptor::new(Arc::new(group))) + } +} + /// Convert arrow schema to parquet schema /// /// The name of the root schema element defaults to `"arrow_schema"`, this can be -/// overridden with [`arrow_to_parquet_schema_with_root`] +/// overridden with [`ArrowSchemaConverter`] +#[deprecated(since = "54.0.0", note = "Use `ArrowSchemaConverter` instead")] pub fn arrow_to_parquet_schema(schema: &Schema) -> Result { - arrow_to_parquet_schema_with_root(schema, "arrow_schema") -} - -/// Convert arrow schema to parquet schema specifying the name of the root schema element -pub fn arrow_to_parquet_schema_with_root(schema: &Schema, root: &str) -> Result { - let fields = schema - .fields() - .iter() - .map(|field| arrow_to_parquet_type(field).map(Arc::new)) - .collect::>()?; - let group = Type::group_type_builder(root).with_fields(fields).build()?; - Ok(SchemaDescriptor::new(Arc::new(group))) + ArrowSchemaConverter::new().convert(schema) } fn parse_key_value_metadata( @@ -298,7 +404,12 @@ pub fn decimal_length_from_precision(precision: u8) -> usize { } /// Convert an arrow field to a parquet `Type` -fn arrow_to_parquet_type(field: &Field) -> Result { +fn arrow_to_parquet_type(field: &Field, coerce_types: bool) -> Result { + const PARQUET_LIST_ELEMENT_NAME: &str = "element"; + const PARQUET_MAP_STRUCT_NAME: &str = "key_value"; + const PARQUET_KEY_FIELD_NAME: &str = "key"; + const PARQUET_VALUE_FIELD_NAME: &str = "value"; + let name = field.name().as_str(); let repetition = if field.is_nullable() { Repetition::OPTIONAL @@ -415,12 +526,20 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .build(), - // date64 is cast to date32 (#1666) - DataType::Date64 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(Some(LogicalType::Date)) - .with_repetition(repetition) - .with_id(id) - .build(), + DataType::Date64 => { + if coerce_types { + Type::primitive_type_builder(name, PhysicalType::INT32) + .with_logical_type(Some(LogicalType::Date)) + .with_repetition(repetition) + 
.with_id(id) + .build() + } else { + Type::primitive_type_builder(name, PhysicalType::INT64) + .with_repetition(repetition) + .with_id(id) + .build() + } + } DataType::Time32(TimeUnit::Second) => { // Cannot represent seconds in LogicalType Type::primitive_type_builder(name, PhysicalType::INT32) @@ -515,10 +634,18 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_id(id) .build(), DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { + let field_ref = if coerce_types && f.name() != PARQUET_LIST_ELEMENT_NAME { + // Ensure proper naming per the Parquet specification + let ff = f.as_ref().clone().with_name(PARQUET_LIST_ELEMENT_NAME); + Arc::new(arrow_to_parquet_type(&ff, coerce_types)?) + } else { + Arc::new(arrow_to_parquet_type(f, coerce_types)?) + }; + Type::group_type_builder(name) .with_fields(vec![Arc::new( Type::group_type_builder("list") - .with_fields(vec![Arc::new(arrow_to_parquet_type(f)?)]) + .with_fields(vec![field_ref]) .with_repetition(Repetition::REPEATED) .build()?, )]) @@ -537,7 +664,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { // recursively convert children to types/nodes let fields = fields .iter() - .map(|f| arrow_to_parquet_type(f).map(Arc::new)) + .map(|f| arrow_to_parquet_type(f, coerce_types).map(Arc::new)) .collect::>()?; Type::group_type_builder(name) .with_fields(fields) @@ -547,13 +674,29 @@ fn arrow_to_parquet_type(field: &Field) -> Result { } DataType::Map(field, _) => { if let DataType::Struct(struct_fields) = field.data_type() { + // If coercing then set inner struct name to "key_value" + let map_struct_name = if coerce_types { + PARQUET_MAP_STRUCT_NAME + } else { + field.name() + }; + + // If coercing then ensure struct fields are named "key" and "value" + let fix_map_field = |name: &str, fld: &Arc| -> Result> { + if coerce_types && fld.name() != name { + let f = fld.as_ref().clone().with_name(name); + Ok(Arc::new(arrow_to_parquet_type(&f, coerce_types)?)) + } else { + Ok(Arc::new(arrow_to_parquet_type(fld, coerce_types)?)) + } + }; + let key_field = fix_map_field(PARQUET_KEY_FIELD_NAME, &struct_fields[0])?; + let val_field = fix_map_field(PARQUET_VALUE_FIELD_NAME, &struct_fields[1])?; + Type::group_type_builder(name) .with_fields(vec![Arc::new( - Type::group_type_builder(field.name()) - .with_fields(vec![ - Arc::new(arrow_to_parquet_type(&struct_fields[0])?), - Arc::new(arrow_to_parquet_type(&struct_fields[1])?), - ]) + Type::group_type_builder(map_struct_name) + .with_fields(vec![key_field, val_field]) .with_repetition(Repetition::REPEATED) .build()?, )]) @@ -571,7 +714,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { DataType::Dictionary(_, ref value) => { // Dictionary encoding not handled at the schema level let dict_field = field.clone().with_data_type(value.as_ref().clone()); - arrow_to_parquet_type(&dict_field) + arrow_to_parquet_type(&dict_field, coerce_types) } DataType::RunEndEncoded(_, _) => Err(arrow_err!( "Converting RunEndEncodedType to parquet not supported", @@ -1408,6 +1551,81 @@ mod tests { assert_eq!(arrow_fields, converted_arrow_fields); } + #[test] + fn test_coerced_map_list() { + // Create Arrow schema with non-Parquet naming + let arrow_fields = vec![ + Field::new_list( + "my_list", + Field::new("item", DataType::Boolean, true), + false, + ), + Field::new_map( + "my_map", + "entries", + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Int32, true), + false, + true, + ), + ]; + let arrow_schema = Schema::new(arrow_fields); + + // Create Parquet schema 
with coerced names + let message_type = " + message parquet_schema { + REQUIRED GROUP my_list (LIST) { + REPEATED GROUP list { + OPTIONAL BOOLEAN element; + } + } + OPTIONAL GROUP my_map (MAP) { + REPEATED GROUP key_value { + REQUIRED BINARY key (STRING); + OPTIONAL INT32 value; + } + } + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema) + .unwrap(); + assert_eq!( + parquet_schema.columns().len(), + converted_arrow_schema.columns().len() + ); + + // Create Parquet schema without coerced names + let message_type = " + message parquet_schema { + REQUIRED GROUP my_list (LIST) { + REPEATED GROUP list { + OPTIONAL BOOLEAN item; + } + } + OPTIONAL GROUP my_map (MAP) { + REPEATED GROUP entries { + REQUIRED BINARY keys (STRING); + OPTIONAL INT32 values; + } + } + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(false) + .convert(&arrow_schema) + .unwrap(); + assert_eq!( + parquet_schema.columns().len(), + converted_arrow_schema.columns().len() + ); + } + #[test] fn test_field_to_column_desc() { let message_type = " @@ -1557,7 +1775,7 @@ mod tests { Field::new("decimal256", DataType::Decimal256(39, 2), false), ]; let arrow_schema = Schema::new(arrow_fields); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new().convert(&arrow_schema).unwrap(); assert_eq!( parquet_schema.columns().len(), @@ -1594,9 +1812,10 @@ mod tests { false, )]; let arrow_schema = Schema::new(arrow_fields); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema); - assert!(converted_arrow_schema.is_err()); converted_arrow_schema.unwrap(); } @@ -1665,7 +1884,7 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new_list( "c21", - Field::new("item", DataType::Boolean, true) + Field::new_list_field(DataType::Boolean, true) .with_metadata(meta(&[("Key", "Bar"), (PARQUET_FIELD_ID_META_KEY, "5")])), false, ) @@ -1673,7 +1892,7 @@ mod tests { Field::new( "c22", DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Boolean, true)), + Arc::new(Field::new_list_field(DataType::Boolean, true)), 5, ), false, @@ -1682,8 +1901,7 @@ mod tests { "c23", Field::new_large_list( "inner", - Field::new( - "item", + Field::new_list_field( DataType::Struct( vec![ Field::new("a", DataType::Int16, true), @@ -1714,6 +1932,7 @@ mod tests { // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c31", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -1728,8 +1947,7 @@ mod tests { "c34", Field::new_list( "inner", - Field::new( - "item", + Field::new_list_field( DataType::Struct( vec![ Field::new("a", DataType::Int16, true), @@ -1762,7 +1980,7 @@ mod tests { .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "8")])), Field::new_list( "my_value", - Field::new("item", DataType::Utf8, 
true) + Field::new_list_field(DataType::Utf8, true) .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "10")])), true, ) @@ -1777,7 +1995,7 @@ mod tests { Field::new("my_key", DataType::Utf8, false), Field::new_list( "my_value", - Field::new("item", DataType::Utf8, true) + Field::new_list_field(DataType::Utf8, true) .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "11")])), true, ), @@ -1868,7 +2086,9 @@ mod tests { // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?; - let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?; + let parq_schema_descr = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema)?; let parq_fields = parq_schema_descr.root_schema().get_fields(); assert_eq!(parq_fields.len(), 2); assert_eq!(parq_fields[0].get_basic_info().id(), 1); diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 1926b87623bf..97e8c22f1b2f 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -302,6 +302,7 @@ pub enum Encoding { /// /// The RLE/bit-packing hybrid is more cpu and memory efficient and should be used instead. #[deprecated( + since = "51.0.0", note = "Please see documentation for compatibility issues and use the RLE/bit-packing hybrid encoding instead" )] BIT_PACKED, diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs index ad0f7ae0df7d..5a1ec94d5502 100644 --- a/parquet/src/bin/parquet-rewrite.rs +++ b/parquet/src/bin/parquet-rewrite.rs @@ -199,6 +199,10 @@ struct Args { /// Sets writer version. #[clap(long)] writer_version: Option, + + /// Sets whether to coerce Arrow types to match Parquet specification + #[clap(long)] + coerce_types: Option, } fn main() { @@ -238,6 +242,7 @@ fn main() { if let Some(value) = args.dictionary_page_size_limit { writer_properties_builder = writer_properties_builder.set_dictionary_page_size_limit(value); } + #[allow(deprecated)] if let Some(value) = args.max_statistics_size { writer_properties_builder = writer_properties_builder.set_max_statistics_size(value); } @@ -262,6 +267,9 @@ fn main() { if let Some(value) = args.writer_version { writer_properties_builder = writer_properties_builder.set_writer_version(value.into()); } + if let Some(value) = args.coerce_types { + writer_properties_builder = writer_properties_builder.set_coerce_types(value); + } let writer_properties = writer_properties_builder.build(); let mut parquet_writer = ArrowWriter::try_new( File::create(&args.output).expect("Unable to open output file"), diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 2b43b4c3e45c..953dc057d7a3 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -185,31 +185,6 @@ where } } - /// Reads a batch of values of at most `batch_size`, returning a tuple containing the - /// actual number of non-null values read, followed by the corresponding number of levels, - /// i.e, the total number of values including nulls, empty lists, etc... - /// - /// If the max definition level is 0, `def_levels` will be ignored, otherwise it will be - /// populated with the number of levels read, with an error returned if it is `None`. - /// - /// If the max repetition level is 0, `rep_levels` will be ignored, otherwise it will be - /// populated with the number of levels read, with an error returned if it is `None`. - /// - /// `values` will be contiguously populated with the non-null values. 
Note that if the column - /// is not required, this may be less than either `batch_size` or the number of levels read - #[deprecated(note = "Use read_records")] - pub fn read_batch( - &mut self, - batch_size: usize, - def_levels: Option<&mut D::Buffer>, - rep_levels: Option<&mut R::Buffer>, - values: &mut V::Buffer, - ) -> Result<(usize, usize)> { - let (_, values, levels) = self.read_records(batch_size, def_levels, rep_levels, values)?; - - Ok((values, levels)) - } - /// Read up to `max_records` whole records, returning the number of complete /// records, non-null values and levels decoded. All levels for a given record /// will be read, i.e. the next repetition level, if any, will be 0 diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 9bd79840f760..8dc1d0db4476 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -347,7 +347,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { data_pages: VecDeque, // column index and offset index column_index_builder: ColumnIndexBuilder, - offset_index_builder: OffsetIndexBuilder, + offset_index_builder: Option, // Below fields used to incrementally check boundary order across data pages. // We assume they are ascending/descending until proven wrong. @@ -394,6 +394,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { column_index_builder.to_invalid() } + // Disable offset_index_builder if requested by user. + let offset_index_builder = match props.offset_index_disabled() { + false => Some(OffsetIndexBuilder::new()), + _ => None, + }; + Self { descr, props, @@ -408,7 +414,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { page_metrics, column_metrics, column_index_builder, - offset_index_builder: OffsetIndexBuilder::new(), + offset_index_builder, encodings, data_page_boundary_ascending: true, data_page_boundary_descending: true, @@ -568,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// anticipated encoded size. #[cfg(feature = "arrow")] pub(crate) fn get_estimated_total_bytes(&self) -> u64 { - self.column_metrics.total_bytes_written + self.data_pages + .iter() + .map(|page| page.data().len() as u64) + .sum::() + + self.column_metrics.total_bytes_written + self.encoder.estimated_data_page_size() as u64 + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 } @@ -613,7 +623,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .column_index_builder .valid() .then(|| self.column_index_builder.build_to_thrift()); - let offset_index = Some(self.offset_index_builder.build_to_thrift()); + + let offset_index = self.offset_index_builder.map(|b| b.build_to_thrift()); Ok(ColumnCloseResult { bytes_written: self.column_metrics.total_bytes_written, @@ -841,11 +852,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ); // Update the offset index - self.offset_index_builder - .append_row_count(self.page_metrics.num_buffered_rows as i64); - - self.offset_index_builder - .append_unencoded_byte_array_data_bytes(page_variable_length_bytes); + if let Some(builder) = self.offset_index_builder.as_mut() { + builder.append_row_count(self.page_metrics.num_buffered_rows as i64); + builder.append_unencoded_byte_array_data_bytes(page_variable_length_bytes); + } } /// Determine if we should allow truncating min/max values for this column's statistics @@ -868,24 +878,67 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } + /// Returns `true` if this column's logical type is a UTF-8 string. 
+    fn is_utf8(&self) -> bool {
+        self.get_descriptor().logical_type() == Some(LogicalType::String)
+            || self.get_descriptor().converted_type() == ConvertedType::UTF8
+    }
+
+    /// Truncates a binary statistic to at most `truncation_length` bytes.
+    ///
+    /// If truncation is not possible, returns `data`.
+    ///
+    /// The `bool` in the returned tuple indicates whether truncation occurred or not.
+    ///
+    /// UTF-8 Note:
+    /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will
+    /// also remain valid UTF-8, but may be less than `truncation_length` bytes to avoid splitting
+    /// on non-character boundaries.
     fn truncate_min_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
         truncation_length
             .filter(|l| data.len() > *l)
-            .and_then(|l| match str::from_utf8(data) {
-                Ok(str_data) => truncate_utf8(str_data, l),
-                Err(_) => Some(data[..l].to_vec()),
-            })
+            .and_then(|l|
+                // don't do extra work if this column isn't UTF-8
+                if self.is_utf8() {
+                    match str::from_utf8(data) {
+                        Ok(str_data) => truncate_utf8(str_data, l),
+                        Err(_) => Some(data[..l].to_vec()),
+                    }
+                } else {
+                    Some(data[..l].to_vec())
+                }
+            )
             .map(|truncated| (truncated, true))
             .unwrap_or_else(|| (data.to_vec(), false))
     }
 
+    /// Truncates a binary statistic to at most `truncation_length` bytes, and then increments the
+    /// final byte(s) to yield a valid upper bound. The result may be shorter than
+    /// `truncation_length` bytes if the last byte(s) overflow.
+    ///
+    /// If truncation is not possible, returns `data`.
+    ///
+    /// The `bool` in the returned tuple indicates whether truncation occurred or not.
+    ///
+    /// UTF-8 Note:
+    /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will
+    /// also remain valid UTF-8 (but again may be less than `truncation_length` bytes). If `data`
+    /// does not contain valid UTF-8, then truncation will occur as if the column is non-string
+    /// binary.
    fn truncate_max_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
        truncation_length
            .filter(|l| data.len() > *l)
-            .and_then(|l| match str::from_utf8(data) {
-                Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8),
-                Err(_) => increment(data[..l].to_vec()),
-            })
+            .and_then(|l|
+                // don't do extra work if this column isn't UTF-8
+                if self.is_utf8() {
+                    match str::from_utf8(data) {
+                        Ok(str_data) => truncate_and_increment_utf8(str_data, l),
+                        Err(_) => increment(data[..l].to_vec()),
+                    }
+                } else {
+                    increment(data[..l].to_vec())
+                }
+            )
            .map(|truncated| (truncated, true))
            .unwrap_or_else(|| (data.to_vec(), false))
    }
@@ -1174,8 +1227,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
        let page_spec = self.page_writer.write_page(page)?;
        // update offset index
        // compressed_size = header_size + compressed_data_size
-        self.offset_index_builder
-            .append_offset_and_size(page_spec.offset as i64, page_spec.compressed_size as i32);
+        if let Some(builder) = self.offset_index_builder.as_mut() {
+            builder
+                .append_offset_and_size(page_spec.offset as i64, page_spec.compressed_size as i32)
+        }
        self.update_metrics_for_page(page_spec);
        Ok(())
    }
@@ -1406,13 +1461,50 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool {
    (a[1..]) > (b[1..])
}

-/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string,
-/// while being less than `length` bytes and non-empty
+/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 string,
+/// while being less than `length` bytes and non-empty. Returns `None` if truncation
+/// is not possible within those constraints.
+///
+/// The caller guarantees that data.len() > length.
fn truncate_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
    let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?;
    Some(data.as_bytes()[..split].to_vec())
}

+/// Truncate a UTF-8 slice and increment its final character. The returned value is the
+/// longest such slice that is still a valid UTF-8 string while being less than `length`
+/// bytes and non-empty. Returns `None` if no such transformation is possible.
+///
+/// The caller guarantees that data.len() > length.
+fn truncate_and_increment_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
+    // UTF-8 is max 4 bytes, so start search 3 back from desired length
+    let lower_bound = length.saturating_sub(3);
+    let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?;
+    increment_utf8(data.get(..split)?)
+}
+
+/// Increment the final character in a UTF-8 string in such a way that the returned result
+/// is still a valid UTF-8 string. The returned string may be shorter than the input if the
+/// last character(s) cannot be incremented (due to overflow or producing invalid code points).
+/// Returns `None` if the string cannot be incremented.
+///
+/// Note that this implementation will not promote an N-byte code point to (N+1) bytes.
+fn increment_utf8(data: &str) -> Option<Vec<u8>> {
+    for (idx, original_char) in data.char_indices().rev() {
+        let original_len = original_char.len_utf8();
+        if let Some(next_char) = char::from_u32(original_char as u32 + 1) {
+            // do not allow increasing byte width of incremented char
+            if next_char.len_utf8() == original_len {
+                let mut result = data.as_bytes()[..idx + original_len].to_vec();
+                next_char.encode_utf8(&mut result[idx..]);
+                return Some(result);
+            }
+        }
+    }
+
+    None
+}
+
/// Try and increment the bytes from right to left.
///
/// Returns `None` if all bytes are set to `u8::MAX`.
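// A minimal standalone sketch (not part of the patch) of the upper-bound strategy the
// hunk above introduces: truncate a UTF-8 string on a character boundary, then bump the
// last character that can be incremented without changing its encoded width, so the
// result is still valid UTF-8 and still an upper bound. The function and variable names
// here are illustrative only; the patch's real helpers are `truncate_and_increment_utf8`
// and `increment_utf8` shown above.
fn truncated_upper_bound(data: &str, length: usize) -> Option<Vec<u8>> {
    // Largest char boundary that does not exceed the byte budget.
    let split = (1..=length.min(data.len())).rfind(|i| data.is_char_boundary(*i))?;
    let prefix = &data[..split];
    // Walk backwards, incrementing the first character whose successor has the same width.
    for (idx, ch) in prefix.char_indices().rev() {
        if let Some(next) = char::from_u32(ch as u32 + 1) {
            if next.len_utf8() == ch.len_utf8() {
                let mut out = prefix.as_bytes()[..idx + ch.len_utf8()].to_vec();
                next.encode_utf8(&mut out[idx..]);
                return Some(out);
            }
        }
    }
    None
}

fn main() {
    // "ééééé" is 10 bytes; a 7 byte budget keeps three characters and bumps the last one,
    // matching the `truncate_and_increment_utf8("ééééé", 7)` case in the tests below.
    assert_eq!(truncated_upper_bound("ééééé", 7).unwrap(), "ééê".as_bytes());
    // A lone U+10FFFF cannot be incremented, so no upper bound is produced.
    assert!(truncated_upper_bound("\u{10ffff}", 4).is_none());
}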
@@ -1429,29 +1521,15 @@ fn increment(mut data: Vec) -> Option> { None } -/// Try and increment the the string's bytes from right to left, returning when the result -/// is a valid UTF8 string. Returns `None` when it can't increment any byte. -fn increment_utf8(mut data: Vec) -> Option> { - for idx in (0..data.len()).rev() { - let original = data[idx]; - let (byte, overflow) = original.overflowing_add(1); - if !overflow { - data[idx] = byte; - if str::from_utf8(&data).is_ok() { - return Some(data); - } - data[idx] = original; - } - } - - None -} - #[cfg(test)] mod tests { - use crate::file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + use crate::{ + file::{properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, writer::SerializedFileWriter}, + schema::parser::parse_message_type, + }; + use core::str; use rand::distributions::uniform::SampleUniform; - use std::sync::Arc; + use std::{fs::File, sync::Arc}; use crate::column::{ page::PageReader, @@ -3128,39 +3206,69 @@ mod tests { #[test] fn test_increment_utf8() { + let test_inc = |o: &str, expected: &str| { + if let Ok(v) = String::from_utf8(increment_utf8(o).unwrap()) { + // Got the expected result... + assert_eq!(v, expected); + // and it's greater than the original string + assert!(*v > *o); + // Also show that BinaryArray level comparison works here + let mut greater = ByteArray::new(); + greater.set_data(Bytes::from(v)); + let mut original = ByteArray::new(); + original.set_data(Bytes::from(o.as_bytes().to_vec())); + assert!(greater > original); + } else { + panic!("Expected incremented UTF8 string to also be valid."); + } + }; + // Basic ASCII case - let v = increment_utf8("hello".as_bytes().to_vec()).unwrap(); - assert_eq!(&v, "hellp".as_bytes()); + test_inc("hello", "hellp"); - // Also show that BinaryArray level comparison works here - let mut greater = ByteArray::new(); - greater.set_data(Bytes::from(v)); - let mut original = ByteArray::new(); - original.set_data(Bytes::from("hello".as_bytes().to_vec())); - assert!(greater > original); + // 1-byte ending in max 1-byte + test_inc("a\u{7f}", "b"); + + // 1-byte max should not truncate as it would need 2-byte code points + assert!(increment_utf8("\u{7f}\u{7f}").is_none()); // UTF8 string - let s = "❤️🧡💛💚💙💜"; - let v = increment_utf8(s.as_bytes().to_vec()).unwrap(); + test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝"); - if let Ok(new) = String::from_utf8(v) { - assert_ne!(&new, s); - assert_eq!(new, "❤️🧡💛💚💙💝"); - assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap()); - } else { - panic!("Expected incremented UTF8 string to also be valid.") - } + // 2-byte without overflow + test_inc("éééé", "éééê"); - // Max UTF8 character - should be a No-Op - let s = char::MAX.to_string(); - assert_eq!(s.len(), 4); - let v = increment_utf8(s.as_bytes().to_vec()); - assert!(v.is_none()); + // 2-byte that overflows lowest byte + test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}"); + + // 2-byte ending in max 2-byte + test_inc("a\u{7ff}", "b"); + + // Max 2-byte should not truncate as it would need 3-byte code points + assert!(increment_utf8("\u{7ff}\u{7ff}").is_none()); + + // 3-byte without overflow [U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). 
+ test_inc("ࠀࠀ", "ࠀࠁ"); + + // 3-byte ending in max 3-byte + test_inc("a\u{ffff}", "b"); + + // Max 3-byte should not truncate as it would need 4-byte code points + assert!(increment_utf8("\u{ffff}\u{ffff}").is_none()); + + // 4-byte without overflow + test_inc("𐀀𐀀", "𐀀𐀁"); + + // 4-byte ending in max unicode + test_inc("a\u{10ffff}", "b"); - // Handle multi-byte UTF8 characters - let s = "a\u{10ffff}"; - let v = increment_utf8(s.as_bytes().to_vec()); - assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes()); + // Max 4-byte should not truncate + assert!(increment_utf8("\u{10ffff}\u{10ffff}").is_none()); + + // Skip over surrogate pair range (0xD800..=0xDFFF) + //test_inc("a\u{D7FF}", "a\u{e000}"); + test_inc("a\u{D7FF}", "b"); } #[test] @@ -3170,7 +3278,6 @@ mod tests { let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); assert_eq!(r.len(), data.as_bytes().len()); assert_eq!(&r, data.as_bytes()); - println!("len is {}", data.len()); // We slice it away from the UTF8 boundary let r = truncate_utf8(data, 13).unwrap(); @@ -3180,6 +3287,90 @@ mod tests { // One multi-byte code point, and a length shorter than it, so we can't slice it let r = truncate_utf8("\u{0836}", 1); assert!(r.is_none()); + + // Test truncate and increment for max bounds on UTF-8 statistics + // 7-bit (i.e. ASCII) + let r = truncate_and_increment_utf8("yyyyyyyyy", 8).unwrap(); + assert_eq!(&r, "yyyyyyyz".as_bytes()); + + // 2-byte without overflow + let r = truncate_and_increment_utf8("ééééé", 7).unwrap(); + assert_eq!(&r, "ééê".as_bytes()); + + // 2-byte that overflows lowest byte + let r = truncate_and_increment_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8).unwrap(); + assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes()); + + // max 2-byte should not truncate as it would need 3-byte code points + let r = truncate_and_increment_utf8("߿߿߿߿߿", 8); + assert!(r.is_none()); + + // 3-byte without overflow [U+800, U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). + let r = truncate_and_increment_utf8("ࠀࠀࠀࠀ", 8).unwrap(); + assert_eq!(&r, "ࠀࠁ".as_bytes()); + + // max 3-byte should not truncate as it would need 4-byte code points + let r = truncate_and_increment_utf8("\u{ffff}\u{ffff}\u{ffff}", 8); + assert!(r.is_none()); + + // 4-byte without overflow + let r = truncate_and_increment_utf8("𐀀𐀀𐀀𐀀", 9).unwrap(); + assert_eq!(&r, "𐀀𐀁".as_bytes()); + + // max 4-byte should not truncate + let r = truncate_and_increment_utf8("\u{10ffff}\u{10ffff}", 8); + assert!(r.is_none()); + } + + #[test] + // Check fallback truncation of statistics that should be UTF-8, but aren't + // (see https://github.com/apache/arrow-rs/pull/6870). 
+ fn test_byte_array_truncate_invalid_utf8_statistics() { + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + + // Create Vec containing non-UTF8 bytes + let data = vec![ByteArray::from(vec![128u8; 32]); 7]; + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) + .set_statistics_truncate_length(Some(8)) + .build(), + ); + + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let stats = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap() + .statistics + .as_ref() + .unwrap(); + assert!(!stats.is_max_value_exact.unwrap()); + // Truncation of invalid UTF-8 should fall back to binary truncation, so last byte should + // be incremented by 1. + assert_eq!( + stats.max_value, + Some([128, 128, 128, 128, 128, 128, 128, 129].to_vec()) + ); } #[test] @@ -3215,6 +3406,52 @@ mod tests { assert!(column_close_result.column_index.is_none()); } + #[test] + fn test_no_offset_index_when_disabled() { + // Test that offset indexes can be disabled + let descr = Arc::new(get_test_column_descr::(1, 0)); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::None) + .set_offset_index_disabled(true) + .build(), + ); + let column_writer = get_column_writer(descr, props, get_test_page_writer()); + let mut writer = get_typed_column_writer::(column_writer); + + let data = Vec::new(); + let def_levels = vec![0; 10]; + writer.write_batch(&data, Some(&def_levels), None).unwrap(); + writer.flush_data_pages().unwrap(); + + let column_close_result = writer.close().unwrap(); + assert!(column_close_result.offset_index.is_none()); + assert!(column_close_result.column_index.is_none()); + } + + #[test] + fn test_offset_index_overridden() { + // Test that offset indexes are not disabled when gathering page statistics + let descr = Arc::new(get_test_column_descr::(1, 0)); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_offset_index_disabled(true) + .build(), + ); + let column_writer = get_column_writer(descr, props, get_test_page_writer()); + let mut writer = get_typed_column_writer::(column_writer); + + let data = Vec::new(); + let def_levels = vec![0; 10]; + writer.write_batch(&data, Some(&def_levels), None).unwrap(); + writer.flush_data_pages().unwrap(); + + let column_close_result = writer.close().unwrap(); + assert!(column_close_result.offset_index.is_some()); + assert!(column_close_result.column_index.is_some()); + } + #[test] fn test_boundary_order() -> Result<()> { let descr = Arc::new(get_test_column_descr::(1, 0)); @@ -3368,6 +3605,26 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); } + #[test] + #[cfg(feature = "arrow")] + fn test_column_writer_get_estimated_total_bytes() { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = 
get_test_column_writer::(page_writer, 0, 0, props); + assert_eq!(writer.get_estimated_total_bytes(), 0); + + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_one_page = writer.get_estimated_total_bytes(); + assert_eq!(size_with_one_page, 20); + + writer.write_batch(&[5, 6, 7, 8], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_two_pages = writer.get_estimated_total_bytes(); + // different pages have different compressed lengths + assert_eq!(size_with_two_pages, 20 + 21); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]], diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 0c708c126503..d089ba7836e1 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -369,17 +369,17 @@ impl RleDecoder { } #[inline(never)] - pub fn get_batch(&mut self, buffer: &mut [T]) -> Result { + pub fn get_batch(&mut self, buffer: &mut [T]) -> Result { assert!(size_of::() <= 8); let mut values_read = 0; while values_read < buffer.len() { if self.rle_left > 0 { let num_values = cmp::min(buffer.len() - values_read, self.rle_left as usize); + let repeated_value = + T::try_from_le_slice(&self.current_value.as_mut().unwrap().to_ne_bytes())?; for i in 0..num_values { - let repeated_value = - T::try_from_le_slice(&self.current_value.as_mut().unwrap().to_ne_bytes())?; - buffer[values_read + i] = repeated_value; + buffer[values_read + i] = repeated_value.clone(); } self.rle_left -= num_values as u32; values_read += num_values; diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index f7fb1ead0ccc..d749287bba62 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -28,6 +28,7 @@ use arrow_schema::ArrowError; // Note: we don't implement PartialEq as the semantics for the // external variant are not well defined (#4469) #[derive(Debug)] +#[non_exhaustive] pub enum ParquetError { /// General Parquet error. /// Returned when code violates normal workflow of working with Parquet files. @@ -48,6 +49,9 @@ pub enum ParquetError { IndexOutOfBound(usize, usize), /// An external error variant External(Box), + /// Returned when a function needs more data to complete properly. The `usize` field indicates + /// the total number of bytes required, not the number of additional bytes. + NeedMoreData(usize), } impl std::fmt::Display for ParquetError { @@ -64,6 +68,7 @@ impl std::fmt::Display for ParquetError { write!(fmt, "Index {index} out of bound: {bound}") } ParquetError::External(e) => write!(fmt, "External: {e}"), + ParquetError::NeedMoreData(needed) => write!(fmt, "NeedMoreData: {needed}"), } } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 32b985710023..252cb99f3f36 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -190,7 +190,7 @@ impl ParquetMetaData { /// Creates Parquet metadata from file metadata, a list of row /// group metadata, and the column index structures. - #[deprecated(note = "Use ParquetMetaDataBuilder")] + #[deprecated(since = "53.1.0", note = "Use ParquetMetaDataBuilder")] pub fn new_with_page_index( file_metadata: FileMetaData, row_groups: Vec, @@ -230,12 +230,6 @@ impl ParquetMetaData { &self.row_groups } - /// Returns page indexes in this file. 
- #[deprecated(note = "Use Self::column_index")] - pub fn page_indexes(&self) -> Option<&ParquetColumnIndex> { - self.column_index.as_ref() - } - /// Returns the column index for this file if loaded /// /// Returns `None` if the parquet file does not have a `ColumnIndex` or @@ -246,12 +240,6 @@ impl ParquetMetaData { self.column_index.as_ref() } - /// Returns the offset index for this file if loaded - #[deprecated(note = "Use Self::offset_index")] - pub fn offset_indexes(&self) -> Option<&ParquetOffsetIndex> { - self.offset_index.as_ref() - } - /// Returns offset indexes in this file, if loaded /// /// Returns `None` if the parquet file does not have a `OffsetIndex` or diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 1a9957f00f1e..c6715a33b5ae 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -178,8 +178,10 @@ impl ParquetMetaDataReader { /// /// # Errors /// - /// This function will return [`ParquetError::IndexOutOfBound`] in the event `reader` does not - /// provide enough data to fully parse the metadata (see example below). + /// This function will return [`ParquetError::NeedMoreData`] in the event `reader` does not + /// provide enough data to fully parse the metadata (see example below). The returned error + /// will be populated with a `usize` field indicating the number of bytes required from the + /// tail of the file to completely parse the requested metadata. /// /// Other errors returned include [`ParquetError::General`] and [`ParquetError::EOF`]. /// @@ -192,11 +194,13 @@ impl ParquetMetaDataReader { /// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); } /// let file = open_parquet_file("some_path.parquet"); /// let len = file.len() as usize; - /// let bytes = get_bytes(&file, 1000..len); + /// // Speculatively read 1 kilobyte from the end of the file + /// let bytes = get_bytes(&file, len - 1024..len); /// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true); /// match reader.try_parse_sized(&bytes, len) { /// Ok(_) => (), - /// Err(ParquetError::IndexOutOfBound(needed, _)) => { + /// Err(ParquetError::NeedMoreData(needed)) => { + /// // Read the needed number of bytes from the end of the file /// let bytes = get_bytes(&file, len - needed..len); /// reader.try_parse_sized(&bytes, len).unwrap(); /// } @@ -204,15 +208,44 @@ impl ParquetMetaDataReader { /// } /// let metadata = reader.finish().unwrap(); /// ``` + /// + /// Note that it is possible for the file metadata to be completely read, but there are + /// insufficient bytes available to read the page indexes. [`Self::has_metadata()`] can be used + /// to test for this. In the event the file metadata is present, re-parsing of the file + /// metadata can be skipped by using [`Self::read_page_indexes_sized()`], as shown below. 
+ /// ```no_run + /// # use parquet::file::metadata::ParquetMetaDataReader; + /// # use parquet::errors::ParquetError; + /// # use crate::parquet::file::reader::Length; + /// # fn get_bytes(file: &std::fs::File, range: std::ops::Range) -> bytes::Bytes { unimplemented!(); } + /// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); } + /// let file = open_parquet_file("some_path.parquet"); + /// let len = file.len() as usize; + /// // Speculatively read 1 kilobyte from the end of the file + /// let mut bytes = get_bytes(&file, len - 1024..len); + /// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true); + /// // Loop until `bytes` is large enough + /// loop { + /// match reader.try_parse_sized(&bytes, len) { + /// Ok(_) => break, + /// Err(ParquetError::NeedMoreData(needed)) => { + /// // Read the needed number of bytes from the end of the file + /// bytes = get_bytes(&file, len - needed..len); + /// // If file metadata was read only read page indexes, otherwise continue loop + /// if reader.has_metadata() { + /// reader.read_page_indexes_sized(&bytes, len); + /// break; + /// } + /// } + /// _ => panic!("unexpected error") + /// } + /// } + /// let metadata = reader.finish().unwrap(); + /// ``` pub fn try_parse_sized(&mut self, reader: &R, file_size: usize) -> Result<()> { self.metadata = match self.parse_metadata(reader) { Ok(metadata) => Some(metadata), - // FIXME: throughout this module ParquetError::IndexOutOfBound is used to indicate the - // need for more data. This is not it's intended use. The plan is to add a NeedMoreData - // value to the enum, but this would be a breaking change. This will be done as - // 54.0.0 draws nearer. - // https://github.com/apache/arrow-rs/issues/6447 - Err(ParquetError::IndexOutOfBound(needed, _)) => { + Err(ParquetError::NeedMoreData(needed)) => { // If reader is the same length as `file_size` then presumably there is no more to // read, so return an EOF error. if file_size == reader.len() as usize || needed > file_size { @@ -223,7 +256,7 @@ impl ParquetMetaDataReader { )); } else { // Ask for a larger buffer - return Err(ParquetError::IndexOutOfBound(needed, file_size)); + return Err(ParquetError::NeedMoreData(needed)); } } Err(e) => return Err(e), @@ -246,7 +279,8 @@ impl ParquetMetaDataReader { /// Read the page index structures when a [`ParquetMetaData`] has already been obtained. /// This variant is used when `reader` cannot access the entire Parquet file (e.g. it is /// a [`Bytes`] struct containing the tail of the file). - /// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`]. + /// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`]. Like + /// [`Self::try_parse_sized()`] this function may return [`ParquetError::NeedMoreData`]. pub fn read_page_indexes_sized( &mut self, reader: &R, @@ -269,7 +303,6 @@ impl ParquetMetaDataReader { // Get bounds needed for page indexes (if any are present in the file). 
let Some(range) = self.range_for_page_index() else { - self.empty_page_indexes(); return Ok(()); }; @@ -285,10 +318,7 @@ impl ParquetMetaDataReader { )); } else { // Ask for a larger buffer - return Err(ParquetError::IndexOutOfBound( - file_size - range.start, - file_size, - )); + return Err(ParquetError::NeedMoreData(file_size - range.start)); } } @@ -446,20 +476,6 @@ impl ParquetMetaDataReader { Ok(()) } - /// Set the column_index and offset_indexes to empty `Vec` for backwards compatibility - /// - /// See for details - fn empty_page_indexes(&mut self) { - let metadata = self.metadata.as_mut().unwrap(); - let num_row_groups = metadata.num_row_groups(); - if self.column_index { - metadata.set_column_index(Some(vec![vec![]; num_row_groups])); - } - if self.offset_index { - metadata.set_offset_index(Some(vec![vec![]; num_row_groups])); - } - } - fn range_for_page_index(&self) -> Option> { // sanity check self.metadata.as_ref()?; @@ -484,10 +500,7 @@ impl ParquetMetaDataReader { // check file is large enough to hold footer let file_size = chunk_reader.len(); if file_size < (FOOTER_SIZE as u64) { - return Err(ParquetError::IndexOutOfBound( - FOOTER_SIZE, - file_size as usize, - )); + return Err(ParquetError::NeedMoreData(FOOTER_SIZE)); } let mut footer = [0_u8; 8]; @@ -500,10 +513,7 @@ impl ParquetMetaDataReader { self.metadata_size = Some(footer_metadata_len); if footer_metadata_len > file_size as usize { - return Err(ParquetError::IndexOutOfBound( - footer_metadata_len, - file_size as usize, - )); + return Err(ParquetError::NeedMoreData(footer_metadata_len)); } let start = file_size - footer_metadata_len as u64; @@ -681,7 +691,7 @@ mod tests { let err = ParquetMetaDataReader::new() .parse_metadata(&test_file) .unwrap_err(); - assert!(matches!(err, ParquetError::IndexOutOfBound(8, _))); + assert!(matches!(err, ParquetError::NeedMoreData(8))); } #[test] @@ -700,7 +710,7 @@ mod tests { let err = ParquetMetaDataReader::new() .parse_metadata(&test_file) .unwrap_err(); - assert!(matches!(err, ParquetError::IndexOutOfBound(263, _))); + assert!(matches!(err, ParquetError::NeedMoreData(263))); } #[test] @@ -794,7 +804,7 @@ mod tests { // should fail match reader.try_parse_sized(&bytes, len).unwrap_err() { // expected error, try again with provided bounds - ParquetError::IndexOutOfBound(needed, _) => { + ParquetError::NeedMoreData(needed) => { let bytes = bytes_for_range(len - needed..len); reader.try_parse_sized(&bytes, len).unwrap(); let metadata = reader.finish().unwrap(); @@ -804,6 +814,26 @@ mod tests { _ => panic!("unexpected error"), }; + // not enough for file metadata, but keep trying until page indexes are read + let mut reader = ParquetMetaDataReader::new().with_page_indexes(true); + let mut bytes = bytes_for_range(452505..len); + loop { + match reader.try_parse_sized(&bytes, len) { + Ok(_) => break, + Err(ParquetError::NeedMoreData(needed)) => { + bytes = bytes_for_range(len - needed..len); + if reader.has_metadata() { + reader.read_page_indexes_sized(&bytes, len).unwrap(); + break; + } + } + _ => panic!("unexpected error"), + } + } + let metadata = reader.finish().unwrap(); + assert!(metadata.column_index.is_some()); + assert!(metadata.offset_index.is_some()); + // not enough for page index but lie about file size let bytes = bytes_for_range(323584..len); let reader_result = reader.try_parse_sized(&bytes, len - 323584).unwrap_err(); @@ -818,7 +848,7 @@ mod tests { // should fail match reader.try_parse_sized(&bytes, len).unwrap_err() { // expected error, try again with provided bounds - 
ParquetError::IndexOutOfBound(needed, _) => { + ParquetError::NeedMoreData(needed) => { let bytes = bytes_for_range(len - needed..len); reader.try_parse_sized(&bytes, len).unwrap(); reader.finish().unwrap(); diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 395e9afe122c..fd3639ac3069 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -43,8 +43,7 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Opt /// /// Returns a vector of `index[column_number]`. /// -/// Returns an empty vector if this row group does not contain a -/// [`ColumnIndex`]. +/// Returns `None` if this row group does not contain a [`ColumnIndex`]. /// /// See [Page Index Documentation] for more details. /// @@ -52,26 +51,29 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Opt pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], -) -> Result, ParquetError> { +) -> Result>, ParquetError> { let fetch = chunks .iter() .fold(None, |range, c| acc_range(range, c.column_index_range())); let fetch = match fetch { Some(r) => r, - None => return Ok(vec![Index::NONE; chunks.len()]), + None => return Ok(None), }; let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; - chunks - .iter() - .map(|c| match c.column_index_range() { - Some(r) => decode_column_index(get(r), c.column_type()), - None => Ok(Index::NONE), - }) - .collect() + Some( + chunks + .iter() + .map(|c| match c.column_index_range() { + Some(r) => decode_column_index(get(r), c.column_type()), + None => Ok(Index::NONE), + }) + .collect(), + ) + .transpose() } /// Reads [`OffsetIndex`], per-page [`PageLocation`] for all columns of a row @@ -116,8 +118,7 @@ pub fn read_pages_locations( /// /// Returns a vector of `offset_index[column_number]`. /// -/// Returns an empty vector if this row group does not contain an -/// [`OffsetIndex`]. +/// Returns `None` if this row group does not contain an [`OffsetIndex`]. /// /// See [Page Index Documentation] for more details. /// @@ -125,26 +126,29 @@ pub fn read_pages_locations( pub fn read_offset_indexes( reader: &R, chunks: &[ColumnChunkMetaData], -) -> Result, ParquetError> { +) -> Result>, ParquetError> { let fetch = chunks .iter() .fold(None, |range, c| acc_range(range, c.offset_index_range())); let fetch = match fetch { Some(r) => r, - None => return Ok(vec![]), + None => return Ok(None), }; let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; - chunks - .iter() - .map(|c| match c.offset_index_range() { - Some(r) => decode_offset_index(get(r)), - None => Err(general_err!("missing offset index")), - }) - .collect() + Some( + chunks + .iter() + .map(|c| match c.offset_index_range() { + Some(r) => decode_offset_index(get(r)), + None => Err(general_err!("missing offset index")), + }) + .collect(), + ) + .transpose() } pub(crate) fn decode_offset_index(data: &[u8]) -> Result { diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index efcb63258f99..dc918f6b5634 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -16,14 +16,13 @@ // under the License. //! 
Configuration via [`WriterProperties`] and [`ReaderProperties`] -use std::str::FromStr; -use std::{collections::HashMap, sync::Arc}; - use crate::basic::{Compression, Encoding}; use crate::compression::{CodecOptions, CodecOptionsBuilder}; use crate::file::metadata::KeyValue; use crate::format::SortingColumn; use crate::schema::types::ColumnPath; +use std::str::FromStr; +use std::{collections::HashMap, sync::Arc}; /// Default value for [`WriterProperties::data_page_size_limit`] pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; @@ -42,6 +41,7 @@ pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; /// Default value for [`WriterProperties::statistics_enabled`] pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; /// Default value for [`WriterProperties::max_statistics_size`] +#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; /// Default value for [`WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; @@ -57,6 +57,10 @@ pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64; /// Default values for [`WriterProperties::statistics_truncate_length`] pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option = None; +/// Default value for [`WriterProperties::offset_index_disabled`] +pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false; +/// Default values for [`WriterProperties::coerce_types`] +pub const DEFAULT_COERCE_TYPES: bool = false; /// Parquet writer version. /// @@ -157,12 +161,14 @@ pub struct WriterProperties { bloom_filter_position: BloomFilterPosition, writer_version: WriterVersion, created_by: String, + offset_index_disabled: bool, pub(crate) key_value_metadata: Option>, default_column_properties: ColumnProperties, column_properties: HashMap, sorting_columns: Option>, column_index_truncate_length: Option, statistics_truncate_length: Option, + coerce_types: bool, } impl Default for WriterProperties { @@ -185,14 +191,6 @@ impl WriterProperties { WriterPropertiesBuilder::with_defaults() } - /// Returns data page size limit. - /// - /// Note: this is a best effort limit based on the write batch size - #[deprecated(since = "41.0.0", note = "Use data_page_size_limit")] - pub fn data_pagesize_limit(&self) -> usize { - self.data_page_size_limit - } - /// Returns data page size limit. /// /// Note: this is a best effort limit based on the write batch size @@ -202,14 +200,6 @@ impl WriterProperties { self.data_page_size_limit } - /// Returns dictionary page size limit. - /// - /// Note: this is a best effort limit based on the write batch size - #[deprecated(since = "41.0.0", note = "Use dictionary_page_size_limit")] - pub fn dictionary_pagesize_limit(&self) -> usize { - self.dictionary_page_size_limit - } - /// Returns dictionary page size limit. /// /// Note: this is a best effort limit based on the write batch size @@ -257,6 +247,22 @@ impl WriterProperties { &self.created_by } + /// Returns `true` if offset index writing is disabled. + pub fn offset_index_disabled(&self) -> bool { + // If page statistics are to be collected, then do not disable the offset indexes. 
+ let default_page_stats_enabled = + self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page); + let column_page_stats_enabled = self + .column_properties + .iter() + .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page)); + if default_page_stats_enabled || column_page_stats_enabled { + return false; + } + + self.offset_index_disabled + } + /// Returns `key_value_metadata` KeyValue pairs. pub fn key_value_metadata(&self) -> Option<&Vec> { self.key_value_metadata.as_ref() @@ -281,6 +287,11 @@ impl WriterProperties { self.statistics_truncate_length } + /// Returns `true` if type coercion is enabled. + pub fn coerce_types(&self) -> bool { + self.coerce_types + } + /// Returns encoding for a data page, when dictionary encoding is enabled. /// This is not configurable. #[inline] @@ -340,7 +351,9 @@ impl WriterProperties { /// Returns max size for statistics. /// Only applicable if statistics are enabled. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn max_statistics_size(&self, col: &ColumnPath) -> usize { + #[allow(deprecated)] self.column_properties .get(col) .and_then(|c| c.max_statistics_size()) @@ -371,12 +384,14 @@ pub struct WriterPropertiesBuilder { bloom_filter_position: BloomFilterPosition, writer_version: WriterVersion, created_by: String, + offset_index_disabled: bool, key_value_metadata: Option>, default_column_properties: ColumnProperties, column_properties: HashMap, sorting_columns: Option>, column_index_truncate_length: Option, statistics_truncate_length: Option, + coerce_types: bool, } impl WriterPropertiesBuilder { @@ -391,12 +406,14 @@ impl WriterPropertiesBuilder { bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION, writer_version: DEFAULT_WRITER_VERSION, created_by: DEFAULT_CREATED_BY.to_string(), + offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED, key_value_metadata: None, default_column_properties: Default::default(), column_properties: HashMap::new(), sorting_columns: None, column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH, + coerce_types: DEFAULT_COERCE_TYPES, } } @@ -411,12 +428,14 @@ impl WriterPropertiesBuilder { bloom_filter_position: self.bloom_filter_position, writer_version: self.writer_version, created_by: self.created_by, + offset_index_disabled: self.offset_index_disabled, key_value_metadata: self.key_value_metadata, default_column_properties: self.default_column_properties, column_properties: self.column_properties, sorting_columns: self.sorting_columns, column_index_truncate_length: self.column_index_truncate_length, statistics_truncate_length: self.statistics_truncate_length, + coerce_types: self.coerce_types, } } @@ -433,16 +452,6 @@ impl WriterPropertiesBuilder { self } - /// Sets best effort maximum size of a data page in bytes. - /// - /// Note: this is a best effort limit based on value of - /// [`set_write_batch_size`](Self::set_write_batch_size). - #[deprecated(since = "41.0.0", note = "Use set_data_page_size_limit")] - pub fn set_data_pagesize_limit(mut self, value: usize) -> Self { - self.data_page_size_limit = value; - self - } - /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`). /// /// The parquet writer will attempt to limit the sizes of each @@ -471,16 +480,6 @@ impl WriterPropertiesBuilder { self } - /// Sets best effort maximum dictionary page size, in bytes. 
- /// - /// Note: this is a best effort limit based on value of - /// [`set_write_batch_size`](Self::set_write_batch_size). - #[deprecated(since = "41.0.0", note = "Use set_dictionary_page_size_limit")] - pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self { - self.dictionary_page_size_limit = value; - self - } - /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`). /// /// The parquet writer will attempt to limit the size of each @@ -532,6 +531,21 @@ impl WriterPropertiesBuilder { self } + /// Sets whether the writing of offset indexes is disabled (defaults to `false`). + /// + /// If statistics level is set to [`Page`] this setting will be overridden with `false`. + /// + /// Note: As the offset indexes are useful for accessing data by row number, + /// they are always written by default, regardless of whether other statistics + /// are enabled. Disabling this metadata may result in a degradation in read + /// performance, so use this option with care. + /// + /// [`Page`]: EnabledStatistics::Page + pub fn set_offset_index_disabled(mut self, value: bool) -> Self { + self.offset_index_disabled = value; + self + } + /// Sets "key_value_metadata" property (defaults to `None`). pub fn set_key_value_metadata(mut self, value: Option>) -> Self { self.key_value_metadata = value; @@ -590,7 +604,9 @@ impl WriterPropertiesBuilder { /// Sets default max statistics size for all columns (defaults to `4096`). /// /// Applicable only if statistics are enabled. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn set_max_statistics_size(mut self, value: usize) -> Self { + #[allow(deprecated)] self.default_column_properties .set_max_statistics_size(value); self @@ -695,7 +711,9 @@ impl WriterPropertiesBuilder { /// Sets max size for statistics for a specific column. /// /// Takes precedence over [`Self::set_max_statistics_size`]. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { + #[allow(deprecated)] self.get_mut_props(col).set_max_statistics_size(value); self } @@ -767,6 +785,29 @@ impl WriterPropertiesBuilder { self.statistics_truncate_length = max_length; self } + + /// Should the writer coerce types to parquet native types (defaults to `false`). + /// + /// Leaving this option the default `false` will ensure the exact same data + /// written to parquet using this library will be read. + /// + /// Setting this option to `true` will result in parquet files that can be + /// read by more readers, but potentially lose information in the process. + /// + /// * Types such as [`DataType::Date64`], which have no direct corresponding + /// Parquet type, may be stored with lower precision. + /// + /// * The internal field names of `List` and `Map` types will be renamed if + /// necessary to match what is required by the newest Parquet specification. 
+    ///
+    /// See [`ArrowSchemaConverter::with_coerce_types`] for more details
+    ///
+    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
+    /// [`ArrowSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
+    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
+        self.coerce_types = coerce_types;
+        self
+    }
 }

 /// Controls the level of statistics to be computed by the writer and stored in
 /// the parquet file.
@@ -862,6 +903,7 @@ struct ColumnProperties {
     codec: Option<Compression>,
     dictionary_enabled: Option<bool>,
     statistics_enabled: Option<EnabledStatistics>,
+    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
     max_statistics_size: Option<usize>,
     /// bloom filter related properties
     bloom_filter_properties: Option<BloomFilterProperties>,
@@ -894,12 +936,14 @@ impl ColumnProperties {
         self.dictionary_enabled = Some(enabled);
     }

-    /// Sets whether or not statistics are enabled for this column.
+    /// Sets the statistics level for this column.
     fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
         self.statistics_enabled = Some(enabled);
     }

     /// Sets max size for statistics for this column.
+    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
+    #[allow(deprecated)]
     fn set_max_statistics_size(&mut self, value: usize) {
         self.max_statistics_size = Some(value);
     }
@@ -957,14 +1001,16 @@ impl ColumnProperties {
         self.dictionary_enabled
     }

-    /// Returns `Some(true)` if statistics are enabled for this column, if disabled then
-    /// returns `Some(false)`. If result is `None`, then no setting has been provided.
+    /// Returns optional statistics level requested for this column. If result is `None`,
+    /// then no setting has been provided.
     fn statistics_enabled(&self) -> Option<EnabledStatistics> {
         self.statistics_enabled
     }

     /// Returns optional max size in bytes for statistics.
+ #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] fn max_statistics_size(&self) -> Option { + #[allow(deprecated)] self.max_statistics_size } @@ -1108,10 +1154,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("col")), DEFAULT_STATISTICS_ENABLED ); - assert_eq!( - props.max_statistics_size(&ColumnPath::from("col")), - DEFAULT_MAX_STATISTICS_SIZE - ); assert!(props .bloom_filter_properties(&ColumnPath::from("col")) .is_none()); @@ -1188,13 +1230,11 @@ mod tests { .set_compression(Compression::GZIP(Default::default())) .set_dictionary_enabled(false) .set_statistics_enabled(EnabledStatistics::None) - .set_max_statistics_size(50) // specific column settings .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY) .set_column_dictionary_enabled(ColumnPath::from("col"), true) .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk) - .set_column_max_statistics_size(ColumnPath::from("col"), 123) .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64) .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1) @@ -1226,7 +1266,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("a")), EnabledStatistics::None ); - assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50); assert_eq!( props.encoding(&ColumnPath::from("col")), @@ -1241,7 +1280,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("col")), EnabledStatistics::Chunk ); - assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); assert_eq!( props.bloom_filter_properties(&ColumnPath::from("col")), Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 }) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f3ac13797a03..a942481f7e4d 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1262,8 +1262,8 @@ mod tests { let reader = SerializedFileReader::new_with_options(test_file, read_options)?; let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 0); - assert_eq!(metadata.column_index().unwrap().len(), 0); - assert_eq!(metadata.offset_index().unwrap().len(), 0); + assert!(metadata.column_index().is_none()); + assert!(metadata.offset_index().is_none()); // false, true predicate let test_file = get_test_file("alltypes_tiny_pages.parquet"); @@ -1275,8 +1275,8 @@ mod tests { let reader = SerializedFileReader::new_with_options(test_file, read_options)?; let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 0); - assert_eq!(metadata.column_index().unwrap().len(), 0); - assert_eq!(metadata.offset_index().unwrap().len(), 0); + assert!(metadata.column_index().is_none()); + assert!(metadata.offset_index().is_none()); // false, false predicate let test_file = get_test_file("alltypes_tiny_pages.parquet"); @@ -1288,8 +1288,8 @@ mod tests { let reader = SerializedFileReader::new_with_options(test_file, read_options)?; let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 0); - assert_eq!(metadata.column_index().unwrap().len(), 0); - assert_eq!(metadata.offset_index().unwrap().len(), 0); + assert!(metadata.column_index().is_none()); + assert!(metadata.offset_index().is_none()); Ok(()) } @@ -1379,13 +1379,15 @@ mod tests { let columns = metadata.row_group(0).columns(); let reversed: Vec<_> = columns.iter().cloned().rev().collect(); - let a = read_columns_indexes(&test_file, 
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index f3ac13797a03..a942481f7e4d 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1262,8 +1262,8 @@ mod tests {
         let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
         let metadata = reader.metadata();
         assert_eq!(metadata.num_row_groups(), 0);
-        assert_eq!(metadata.column_index().unwrap().len(), 0);
-        assert_eq!(metadata.offset_index().unwrap().len(), 0);
+        assert!(metadata.column_index().is_none());
+        assert!(metadata.offset_index().is_none());
 
         // false, true predicate
         let test_file = get_test_file("alltypes_tiny_pages.parquet");
@@ -1275,8 +1275,8 @@ mod tests {
         let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
         let metadata = reader.metadata();
         assert_eq!(metadata.num_row_groups(), 0);
-        assert_eq!(metadata.column_index().unwrap().len(), 0);
-        assert_eq!(metadata.offset_index().unwrap().len(), 0);
+        assert!(metadata.column_index().is_none());
+        assert!(metadata.offset_index().is_none());
 
         // false, false predicate
         let test_file = get_test_file("alltypes_tiny_pages.parquet");
@@ -1288,8 +1288,8 @@ mod tests {
         let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
         let metadata = reader.metadata();
         assert_eq!(metadata.num_row_groups(), 0);
-        assert_eq!(metadata.column_index().unwrap().len(), 0);
-        assert_eq!(metadata.offset_index().unwrap().len(), 0);
+        assert!(metadata.column_index().is_none());
+        assert!(metadata.offset_index().is_none());
 
         Ok(())
     }
@@ -1379,13 +1379,15 @@ mod tests {
         let columns = metadata.row_group(0).columns();
         let reversed: Vec<_> = columns.iter().cloned().rev().collect();
 
-        let a = read_columns_indexes(&test_file, columns).unwrap();
-        let mut b = read_columns_indexes(&test_file, &reversed).unwrap();
+        let a = read_columns_indexes(&test_file, columns).unwrap().unwrap();
+        let mut b = read_columns_indexes(&test_file, &reversed)
+            .unwrap()
+            .unwrap();
         b.reverse();
         assert_eq!(a, b);
 
-        let a = read_offset_indexes(&test_file, columns).unwrap();
-        let mut b = read_offset_indexes(&test_file, &reversed).unwrap();
+        let a = read_offset_indexes(&test_file, columns).unwrap().unwrap();
+        let mut b = read_offset_indexes(&test_file, &reversed).unwrap().unwrap();
         b.reverse();
         assert_eq!(a, b);
     }
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index b84c57a60e19..6b7707f03cd9 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -1742,6 +1742,7 @@ mod tests {
         let props = WriterProperties::builder()
             .set_statistics_enabled(EnabledStatistics::None)
             .set_column_statistics_enabled("a".into(), EnabledStatistics::Page)
+            .set_offset_index_disabled(true) // this should be ignored because of the line above
             .build();
         let mut file = Vec::with_capacity(1024);
         let mut file_writer =
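Since a missing page index is now reported as `None` rather than as an empty `Vec`, readers that opt into the page index should match on the `Option` returned by `column_index()` / `offset_index()`. A sketch of that pattern; the path "data.parquet" is a placeholder for any local file:

```rust
use std::fs::File;

use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::{ReadOptionsBuilder, SerializedFileReader};

fn main() {
    // Placeholder input; any Parquet file will do.
    let file = File::open("data.parquet").unwrap();
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader = SerializedFileReader::new_with_options(file, options).unwrap();

    // `None` now means the index was not written or every row group was
    // filtered out, which previously surfaced as an empty Vec.
    match reader.metadata().column_index() {
        Some(index) => println!("column index covers {} row groups", index.len()),
        None => println!("no column index present"),
    }
}
```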
("a".to_string(), Field::Null), ("b".to_string(), Field::Bool(false)), ("c".to_string(), Field::Byte(3)), @@ -1619,10 +1618,10 @@ mod tests { #[test] fn test_row_complex_accessors() { - let row = make_row(vec![ + let row = Row::new(vec![ ( "a".to_string(), - Field::Group(make_row(vec![ + Field::Group(Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ])), @@ -1653,10 +1652,10 @@ mod tests { #[test] fn test_row_complex_invalid_accessors() { - let row = make_row(vec![ + let row = Row::new(vec![ ( "a".to_string(), - Field::Group(make_row(vec![ + Field::Group(Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ])), @@ -1802,7 +1801,7 @@ mod tests { #[test] fn test_list_complex_accessors() { - let list = make_list(vec![Field::Group(make_row(vec![ + let list = make_list(vec![Field::Group(Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ]))]); @@ -1826,7 +1825,7 @@ mod tests { #[test] fn test_list_complex_invalid_accessors() { - let list = make_list(vec![Field::Group(make_row(vec![ + let list = make_list(vec![Field::Group(Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ]))]); @@ -1961,7 +1960,7 @@ mod tests { ("Y".to_string(), Field::Double(2.2)), ("Z".to_string(), Field::Str("abc".to_string())), ]; - let row = Field::Group(make_row(fields)); + let row = Field::Group(Row::new(fields)); assert_eq!( row.to_json_value(), serde_json::json!({"X": 1, "Y": 2.2, "Z": "abc"}) @@ -1990,14 +1989,14 @@ mod tests { #[cfg(test)] #[allow(clippy::many_single_char_names)] mod api_tests { - use super::{make_list, make_map, make_row}; + use super::{make_list, make_map, Row}; use crate::record::Field; #[test] fn test_field_visibility() { - let row = make_row(vec![( + let row = Row::new(vec![( "a".to_string(), - Field::Group(make_row(vec![ + Field::Group(Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ])), @@ -2009,7 +2008,7 @@ mod api_tests { match column.1 { Field::Group(r) => { assert_eq!( - &make_row(vec![ + &Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ]), @@ -2027,7 +2026,7 @@ mod api_tests { fn test_list_element_access() { let expected = vec![ Field::Int(1), - Field::Group(make_row(vec![ + Field::Group(Row::new(vec![ ("x".to_string(), Field::Null), ("Y".to_string(), Field::Int(2)), ])), diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index fd6ca7cdd57a..9e70f7a980db 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -24,7 +24,7 @@ use crate::basic::{ConvertedType, Repetition}; use crate::errors::{ParquetError, Result}; use crate::file::reader::{FileReader, RowGroupReader}; use crate::record::{ - api::{make_list, make_map, make_row, Field, Row}, + api::{make_list, make_map, Field, Row}, triplet::TripletIter, }; use crate::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; @@ -217,11 +217,15 @@ impl TreeBuilder { Repetition::REPEATED, "Invalid map type: {field:?}" ); - assert_eq!( - key_value_type.get_fields().len(), - 2, - "Invalid map type: {field:?}" - ); + // Parquet spec allows no value. In that case treat as a list. 
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index fd6ca7cdd57a..9e70f7a980db 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -24,7 +24,7 @@ use crate::basic::{ConvertedType, Repetition};
 use crate::errors::{ParquetError, Result};
 use crate::file::reader::{FileReader, RowGroupReader};
 use crate::record::{
-    api::{make_list, make_map, make_row, Field, Row},
+    api::{make_list, make_map, Field, Row},
     triplet::TripletIter,
 };
 use crate::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr};
@@ -217,11 +217,15 @@ impl TreeBuilder {
                     Repetition::REPEATED,
                     "Invalid map type: {field:?}"
                 );
-                assert_eq!(
-                    key_value_type.get_fields().len(),
-                    2,
-                    "Invalid map type: {field:?}"
-                );
+                // Parquet spec allows no value. In that case treat as a list. #1642
+                if key_value_type.get_fields().len() != 1 {
+                    // If not a list, then there can only be 2 fields in the struct
+                    assert_eq!(
+                        key_value_type.get_fields().len(),
+                        2,
+                        "Invalid map type: {field:?}"
+                    );
+                }
 
                 path.push(String::from(key_value_type.name()));
 
@@ -239,25 +243,35 @@ impl TreeBuilder {
                     row_group_reader,
                 )?;
 
-                let value_type = &key_value_type.get_fields()[1];
-                let value_reader = self.reader_tree(
-                    value_type.clone(),
-                    path,
-                    curr_def_level + 1,
-                    curr_rep_level + 1,
-                    paths,
-                    row_group_reader,
-                )?;
+                if key_value_type.get_fields().len() == 1 {
+                    path.pop();
+                    Reader::RepeatedReader(
+                        field,
+                        curr_def_level,
+                        curr_rep_level,
+                        Box::new(key_reader),
+                    )
+                } else {
+                    let value_type = &key_value_type.get_fields()[1];
+                    let value_reader = self.reader_tree(
+                        value_type.clone(),
+                        path,
+                        curr_def_level + 1,
+                        curr_rep_level + 1,
+                        paths,
+                        row_group_reader,
+                    )?;
 
-                path.pop();
+                    path.pop();
 
-                Reader::KeyValueReader(
-                    field,
-                    curr_def_level,
-                    curr_rep_level,
-                    Box::new(key_reader),
-                    Box::new(value_reader),
-                )
+                    Reader::KeyValueReader(
+                        field,
+                        curr_def_level,
+                        curr_rep_level,
+                        Box::new(key_reader),
+                        Box::new(value_reader),
+                    )
+                }
             }
             // A repeated field that is neither contained by a `LIST`- or
             // `MAP`-annotated group nor annotated by `LIST` or `MAP`
@@ -345,6 +359,19 @@ impl Reader {
     ///
     /// #backward-compatibility-rules
     fn is_element_type(repeated_type: &Type) -> bool {
+        // For legacy 2-level list types whose element type is a 2-level list
+        //
+        // // ARRAY<ARRAY<INT>> (nullable list, non-null elements)
+        // optional group my_list (LIST) {
+        //    repeated group array (LIST) {
+        //        repeated int32 array;
+        //    };
+        // }
+        //
+        if repeated_type.is_list() || repeated_type.has_single_repeated_child() {
+            return false;
+        }
+
         // For legacy 2-level list types with primitive element type, e.g.:
         //
         //   // ARRAY<INT> (nullable list, non-null elements)
@@ -399,7 +426,7 @@ impl Reader {
                 for reader in readers {
                     fields.push((String::from(reader.field_name()), reader.read_field()?));
                 }
-                Ok(make_row(fields))
+                Ok(Row::new(fields))
             }
             _ => panic!("Cannot call read() on {self}"),
         }
@@ -434,7 +461,7 @@ impl Reader {
                         fields.push((String::from(reader.field_name()), Field::Null));
                     }
                 }
-                let row = make_row(fields);
+                let row = Row::new(fields);
                 Field::Group(row)
             }
             Reader::RepeatedReader(_, def_level, rep_level, ref mut reader) => {
@@ -826,7 +853,7 @@ mod tests {
     macro_rules! row {
         ($($e:tt)*) => {
             {
-                make_row(vec![$($e)*])
+                Row::new(vec![$($e)*])
             }
         }
     }
@@ -1459,8 +1486,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Invalid map type")]
-    fn test_file_reader_rows_invalid_map_type() {
+    fn test_file_reader_rows_nested_map_type() {
         let schema = "
             message spark_schema {
                 OPTIONAL group a (MAP) {
@@ -1823,6 +1849,36 @@ mod tests {
         assert_eq!(rows, expected_rows);
     }
 
+    #[test]
+    fn test_map_no_value() {
+        // File schema:
+        //   message schema {
+        //     required group my_map (MAP) {
+        //       repeated group key_value {
+        //         required int32 key;
+        //         optional int32 value;
+        //       }
+        //     }
+        //     required group my_map_no_v (MAP) {
+        //       repeated group key_value {
+        //         required int32 key;
+        //       }
+        //     }
+        //     required group my_list (LIST) {
+        //       repeated group list {
+        //         required int32 element;
+        //       }
+        //     }
+        //   }
+        let rows = test_file_reader_rows("map_no_value.parquet", None).unwrap();
+
+        // the my_map_no_v and my_list columns should be equivalent lists by this point
+        for row in rows {
+            let cols = row.into_columns();
+            assert_eq!(cols[1].1, cols[2].1);
+        }
+    }
+
     fn test_file_reader_rows(file_name: &str, schema: Option<Type>) -> Result<Vec<Row>> {
         let file = get_test_file(file_name);
         let file_reader: Box<dyn FileReader> = Box::new(SerializedFileReader::new(file)?);
@@ -1839,4 +1895,21 @@ mod tests {
         let iter = row_group_reader.get_row_iter(schema)?;
         Ok(iter.map(|row| row.unwrap()).collect())
     }
+
+    #[test]
+    fn test_read_old_nested_list() {
+        let rows = test_file_reader_rows("old_list_structure.parquet", None).unwrap();
+        let expected_rows = vec![row![(
+            "a".to_string(),
+            Field::ListInternal(make_list(
+                [
+                    make_list([1, 2].map(Field::Int).to_vec()),
+                    make_list([3, 4].map(Field::Int).to_vec())
+                ]
+                .map(Field::ListInternal)
+                .to_vec()
+            ))
+        ),]];
+        assert_eq!(rows, expected_rows);
+    }
 }
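With the reader changes above, a `MAP` group whose `key_value` struct omits the value field is read as a list of keys instead of panicking with "Invalid map type", and legacy two-level lists of lists are recognized. A sketch of consuming such a file through the public row API, assuming the `map_no_value.parquet` fixture named in the test is available locally:

```rust
use std::fs::File;

use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;

fn main() {
    // Fixture referenced by test_map_no_value; substitute any file whose
    // MAP groups have no value field.
    let file = File::open("map_no_value.parquet").unwrap();
    let reader = SerializedFileReader::new(file).unwrap();

    // Key-only maps now come back as lists of keys rather than panicking.
    for row in reader.get_row_iter(None).unwrap() {
        let row = row.unwrap();
        for (name, field) in row.into_columns() {
            println!("{name}: {field}");
        }
    }
}
```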
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 8aff612dba5d..0347f7da46d6 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -202,6 +202,29 @@ impl Type {
         self.get_basic_info().has_repetition()
             && self.get_basic_info().repetition() != Repetition::REQUIRED
     }
+
+    /// Returns `true` if this type is annotated as a list.
+    pub(crate) fn is_list(&self) -> bool {
+        if self.is_group() {
+            let basic_info = self.get_basic_info();
+            if let Some(logical_type) = basic_info.logical_type() {
+                return logical_type == LogicalType::List;
+            }
+            return basic_info.converted_type() == ConvertedType::LIST;
+        }
+        false
+    }
+
+    /// Returns `true` if this type is a group with a single child field that is `repeated`.
+    pub(crate) fn has_single_repeated_child(&self) -> bool {
+        if self.is_group() {
+            let children = self.get_fields();
+            return children.len() == 1
+                && children[0].get_basic_info().has_repetition()
+                && children[0].get_basic_info().repetition() == Repetition::REPEATED;
+        }
+        false
+    }
 }
 
 /// A builder for primitive types. All attributes are optional
@@ -927,6 +950,32 @@ impl ColumnDescriptor {
 
 ///
 /// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
 /// each primitive (leaf) column.
+///
+/// # Example
+/// ```
+/// # use std::sync::Arc;
+/// use parquet::schema::types::{SchemaDescriptor, Type};
+/// use parquet::basic; // note there are two `Type`s that are different
+/// // Schema for a table with two columns: "a" (int64) and "b" (int32, stored as a date)
+/// let descriptor = SchemaDescriptor::new(
+///   Arc::new(
+///     Type::group_type_builder("my_schema")
+///       .with_fields(vec![
+///         Arc::new(
+///           Type::primitive_type_builder("a", basic::Type::INT64)
+///             .build().unwrap()
+///         ),
+///         Arc::new(
+///           Type::primitive_type_builder("b", basic::Type::INT32)
+///             .with_converted_type(basic::ConvertedType::DATE)
+///             .with_logical_type(Some(basic::LogicalType::Date))
+///             .build().unwrap()
+///         ),
+///       ])
+///       .build().unwrap()
+///   )
+/// );
+/// ```
 #[derive(PartialEq)]
 pub struct SchemaDescriptor {
     /// The top-level logical schema (the "message" type).
diff --git a/parquet_derive/LICENSE.txt b/parquet_derive/LICENSE.txt
new file mode 120000
index 000000000000..4ab43736a839
--- /dev/null
+++ b/parquet_derive/LICENSE.txt
@@ -0,0 +1 @@
+../LICENSE.txt
\ No newline at end of file
diff --git a/parquet_derive/NOTICE.txt b/parquet_derive/NOTICE.txt
new file mode 120000
index 000000000000..eb9f24e040b5
--- /dev/null
+++ b/parquet_derive/NOTICE.txt
@@ -0,0 +1 @@
+../NOTICE.txt
\ No newline at end of file
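The new `SchemaDescriptor` doc example builds a two-column schema; the descriptor can then be used to enumerate the leaf columns it describes. A sketch extending that example with the existing `num_columns` / `column` accessors (the printed format is arbitrary):

```rust
use std::sync::Arc;

use parquet::basic;
use parquet::schema::types::{SchemaDescriptor, Type};

fn main() {
    // Same shape as the doc example: "a" (int64) and "b" (int32 date).
    let schema = Type::group_type_builder("my_schema")
        .with_fields(vec![
            Arc::new(
                Type::primitive_type_builder("a", basic::Type::INT64)
                    .build()
                    .unwrap(),
            ),
            Arc::new(
                Type::primitive_type_builder("b", basic::Type::INT32)
                    .with_converted_type(basic::ConvertedType::DATE)
                    .with_logical_type(Some(basic::LogicalType::Date))
                    .build()
                    .unwrap(),
            ),
        ])
        .build()
        .unwrap();
    let descriptor = SchemaDescriptor::new(Arc::new(schema));

    // Walk the primitive (leaf) columns the descriptor exposes.
    assert_eq!(descriptor.num_columns(), 2);
    for i in 0..descriptor.num_columns() {
        let column = descriptor.column(i);
        println!("{} ({})", column.path(), column.physical_type());
    }
}
```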